1 #include "fht.h"
static inline void helper_float_1(float *buf);
/* In-place 2-point Hadamard transform: (a, b) -> (a + b, a - b).
 * The degenerate loops mirror the shape of the other generated helpers. */
static inline void helper_float_1(float *buf) {
    for (int base = 0; base < 2; base += 2) {
        for (int off = 0; off < 1; ++off) {
            float a = buf[base + off];
            float b = buf[base + off + 1];
            buf[base + off] = a + b;
            buf[base + off + 1] = a - b;
        }
    }
}
static inline void helper_float_2(float *buf);
// In-place 4-point Fast Hadamard Transform of buf[0..3] (SSE/SSE3 inline asm).
// Stage 1 does 2-point butterflies on adjacent pairs; stage 2 combines the
// two halves of the register, completing the order-4 transform.
static inline void helper_float_2(float *buf) {
  for (int j = 0; j < 4; j += 4) {  // single iteration; generated-code uniformity
    __asm__ volatile (
      "movups (%0), %%xmm0\n"            // xmm0 = [a, b, c, d] (unaligned load)
      // Stage 1: [a, b, c, d] -> [a+b, a-b, c+d, c-d]
      "movaps %%xmm0, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"    // xmm8 = [a, a, c, c]
      "shufps $245, %%xmm0, %%xmm0\n"    // xmm0 = [b, b, d, d]
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm0, %%xmm9\n"           // xmm9 = [-b, -b, -d, -d]
      "addsubps %%xmm9, %%xmm8\n"        // addsub -> [a+b, a-b, c+d, c-d]
      "movaps %%xmm8, %%xmm0\n"
      // Stage 2: [x0, x1, x2, x3] -> [x0+x2, x1+x3, x0-x2, x1-x3]
      "movaps %%xmm0, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"     // xmm8  = [x0, x1, x0, x1]
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm0, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"    // xmm10 = [x2, x3, 0, 0]
      "movaps %%xmm0, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"   // xmm9  = [0, 0, x2, x3]
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm0\n"
      "movups %%xmm0, (%0)\n"            // store transformed vector back
      :: "r"(buf + j) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}
static inline void helper_float_3(float *buf);
// In-place 8-point Fast Hadamard Transform of buf[0..7] (SSE/SSE3 inline asm).
// Each 4-float register is first given a full 4-point transform (stages 1-2,
// same shuffle/addsub pattern as helper_float_2), then the two registers are
// combined with a final add/sub butterfly.
static inline void helper_float_3(float *buf) {
  for (int j = 0; j < 8; j += 8) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"          // xmm0 = buf[0..3]
        "movups (%1), %%xmm1\n"          // xmm1 = buf[4..7]
        // Stage 1 on xmm0: pairwise butterflies [a+b, a-b, c+d, c-d]
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        // Stage 1 on xmm1
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        // Stage 2 on xmm0: half butterflies [x0+x2, x1+x3, x0-x2, x1-x3]
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        // Stage 2 on xmm1
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        // Stage 3: combine the two registers (sum / difference)
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"         // xmm8 = xmm0 + xmm1
        "subps %%xmm1, %%xmm9\n"         // xmm9 = xmm0 - xmm1
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_float_4(float *buf);
// In-place 16-point Fast Hadamard Transform of buf[0..15] (SSE/SSE3 asm).
// Four registers each get the intra-register 4-point transform (stages 1-2),
// then two levels of cross-register butterflies complete the order-16
// transform.
static inline void helper_float_4(float *buf) {
  for (int j = 0; j < 16; j += 16) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"          // four 4-float lanes of the block
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        // Stage 1: pairwise butterflies inside each register
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        // Stage 2: half butterflies inside each register
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        // Stage 3: butterflies between register pairs (0,1) and (2,3)
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        // Stage 4: butterflies between the pair results
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movups %%xmm0, (%0)\n"
        "movups %%xmm1, (%1)\n"
        "movups %%xmm2, (%2)\n"
        "movups %%xmm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_float_5(float *buf);
// In-place 32-point Fast Hadamard Transform of buf[0..31] (SSE/SSE3 asm).
// Eight registers each get the intra-register 4-point transform (stages 1-2),
// then three levels of cross-register butterflies (pairs, quads, all eight)
// complete the order-32 transform. This is the core kernel reused by the
// larger helpers.
static inline void helper_float_5(float *buf) {
  for (int j = 0; j < 32; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"          // eight 4-float lanes of the block
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        // Stage 1: pairwise butterflies inside each register
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        // Stage 2: half butterflies inside each register
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        // Stage 3: butterflies between register pairs (0,1) (2,3) (4,5) (6,7)
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        // Stage 4: butterflies between quads
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        // Stage 5: final butterflies between the two halves of all 8 registers
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
static inline void helper_float_6(float *buf);
// In-place 64-point Fast Hadamard Transform of buf[0..63] (SSE/SSE3 asm).
// Pass 1 applies the 32-point kernel (same asm as helper_float_5) to each
// 32-float half; pass 2 merges the halves with stride-32 add/sub butterflies.
static inline void helper_float_6(float *buf) {
  // Pass 1: order-32 transform on buf[0..31] and buf[32..63].
  for (int j = 0; j < 64; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"          // eight 4-float lanes of this 32-block
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        // Stage 1: pairwise butterflies inside each register
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        // Stage 2: half butterflies inside each register
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        // Stage 3: butterflies between register pairs
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        // Stage 4: butterflies between quads
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        // Stage 5: final butterflies across all eight registers
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  // Pass 2: combine the two 32-blocks, four floats at a time:
  // buf[k], buf[k+32] -> buf[k]+buf[k+32], buf[k]-buf[k+32].
  for (int j = 0; j < 64; j += 64) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
void helper_float_7_recursive(float *buf, int depth);
// In-place 128-point Fast Hadamard Transform of buf[0..127] (SSE/SSE3 asm).
// Despite the name, only the depth == 7 case is implemented: pass 1 applies
// the 32-point kernel to each of the four 32-float sub-blocks, pass 2 merges
// them with a radix-4 combine at strides 32/64/96.
// NOTE(review): for any depth other than 7 this function silently does
// nothing — presumably the code generator only emits the case it calls
// (see helper_float_7); confirm before calling with other depths.
void helper_float_7_recursive(float *buf, int depth) {
  if (depth == 7) {
    // Pass 1: order-32 transform on each 32-float sub-block.
    for (int j = 0; j < 128; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"        // eight 4-float lanes of this 32-block
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          // Stage 1: pairwise butterflies inside each register
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          // Stage 2: half butterflies inside each register
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          // Stage 3: butterflies between register pairs
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          // Stage 4: butterflies between quads
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          // Stage 5: final butterflies across all eight registers
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    // Pass 2: radix-4 combine of the four 32-blocks (two butterfly levels
    // fused into one asm block), four floats at a time at strides 32/64/96.
    for (int j = 0; j < 128; j += 128) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          // level 1: (0,1) and (2,3) butterflies
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          // level 2: combine the two butterfly results
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_7(float *buf);
/* Public entry point for the 128-element transform; forwards to the
 * depth-parameterized implementation with its only supported depth. */
void helper_float_7(float *buf) {
    helper_float_7_recursive(buf, 7);
}
877 static inline void helper_float_8(float *buf);
helper_float_8(float * buf)878 static inline void helper_float_8(float *buf) {
879 for (int j = 0; j < 256; j += 32) {
880 for (int k = 0; k < 4; k += 4) {
881 __asm__ volatile (
882 "movups (%0), %%xmm0\n"
883 "movups (%1), %%xmm1\n"
884 "movups (%2), %%xmm2\n"
885 "movups (%3), %%xmm3\n"
886 "movups (%4), %%xmm4\n"
887 "movups (%5), %%xmm5\n"
888 "movups (%6), %%xmm6\n"
889 "movups (%7), %%xmm7\n"
890 "movaps %%xmm0, %%xmm8\n"
891 "shufps $160, %%xmm8, %%xmm8\n"
892 "shufps $245, %%xmm0, %%xmm0\n"
893 "xorps %%xmm9, %%xmm9\n"
894 "subps %%xmm0, %%xmm9\n"
895 "addsubps %%xmm9, %%xmm8\n"
896 "movaps %%xmm8, %%xmm0\n"
897 "movaps %%xmm1, %%xmm8\n"
898 "shufps $160, %%xmm8, %%xmm8\n"
899 "shufps $245, %%xmm1, %%xmm1\n"
900 "xorps %%xmm9, %%xmm9\n"
901 "subps %%xmm1, %%xmm9\n"
902 "addsubps %%xmm9, %%xmm8\n"
903 "movaps %%xmm8, %%xmm1\n"
904 "movaps %%xmm2, %%xmm8\n"
905 "shufps $160, %%xmm8, %%xmm8\n"
906 "shufps $245, %%xmm2, %%xmm2\n"
907 "xorps %%xmm9, %%xmm9\n"
908 "subps %%xmm2, %%xmm9\n"
909 "addsubps %%xmm9, %%xmm8\n"
910 "movaps %%xmm8, %%xmm2\n"
911 "movaps %%xmm3, %%xmm8\n"
912 "shufps $160, %%xmm8, %%xmm8\n"
913 "shufps $245, %%xmm3, %%xmm3\n"
914 "xorps %%xmm9, %%xmm9\n"
915 "subps %%xmm3, %%xmm9\n"
916 "addsubps %%xmm9, %%xmm8\n"
917 "movaps %%xmm8, %%xmm3\n"
918 "movaps %%xmm4, %%xmm8\n"
919 "shufps $160, %%xmm8, %%xmm8\n"
920 "shufps $245, %%xmm4, %%xmm4\n"
921 "xorps %%xmm9, %%xmm9\n"
922 "subps %%xmm4, %%xmm9\n"
923 "addsubps %%xmm9, %%xmm8\n"
924 "movaps %%xmm8, %%xmm4\n"
925 "movaps %%xmm5, %%xmm8\n"
926 "shufps $160, %%xmm8, %%xmm8\n"
927 "shufps $245, %%xmm5, %%xmm5\n"
928 "xorps %%xmm9, %%xmm9\n"
929 "subps %%xmm5, %%xmm9\n"
930 "addsubps %%xmm9, %%xmm8\n"
931 "movaps %%xmm8, %%xmm5\n"
932 "movaps %%xmm6, %%xmm8\n"
933 "shufps $160, %%xmm8, %%xmm8\n"
934 "shufps $245, %%xmm6, %%xmm6\n"
935 "xorps %%xmm9, %%xmm9\n"
936 "subps %%xmm6, %%xmm9\n"
937 "addsubps %%xmm9, %%xmm8\n"
938 "movaps %%xmm8, %%xmm6\n"
939 "movaps %%xmm7, %%xmm8\n"
940 "shufps $160, %%xmm8, %%xmm8\n"
941 "shufps $245, %%xmm7, %%xmm7\n"
942 "xorps %%xmm9, %%xmm9\n"
943 "subps %%xmm7, %%xmm9\n"
944 "addsubps %%xmm9, %%xmm8\n"
945 "movaps %%xmm8, %%xmm7\n"
946 "movaps %%xmm0, %%xmm8\n"
947 "shufps $68, %%xmm8, %%xmm8\n"
948 "xorps %%xmm9, %%xmm9\n"
949 "movaps %%xmm0, %%xmm10\n"
950 "shufps $14, %%xmm9, %%xmm10\n"
951 "movaps %%xmm0, %%xmm11\n"
952 "shufps $224, %%xmm11, %%xmm9\n"
953 "addps %%xmm8, %%xmm10\n"
954 "subps %%xmm9, %%xmm10\n"
955 "movaps %%xmm10, %%xmm0\n"
956 "movaps %%xmm1, %%xmm8\n"
957 "shufps $68, %%xmm8, %%xmm8\n"
958 "xorps %%xmm9, %%xmm9\n"
959 "movaps %%xmm1, %%xmm10\n"
960 "shufps $14, %%xmm9, %%xmm10\n"
961 "movaps %%xmm1, %%xmm11\n"
962 "shufps $224, %%xmm11, %%xmm9\n"
963 "addps %%xmm8, %%xmm10\n"
964 "subps %%xmm9, %%xmm10\n"
965 "movaps %%xmm10, %%xmm1\n"
966 "movaps %%xmm2, %%xmm8\n"
967 "shufps $68, %%xmm8, %%xmm8\n"
968 "xorps %%xmm9, %%xmm9\n"
969 "movaps %%xmm2, %%xmm10\n"
970 "shufps $14, %%xmm9, %%xmm10\n"
971 "movaps %%xmm2, %%xmm11\n"
972 "shufps $224, %%xmm11, %%xmm9\n"
973 "addps %%xmm8, %%xmm10\n"
974 "subps %%xmm9, %%xmm10\n"
975 "movaps %%xmm10, %%xmm2\n"
976 "movaps %%xmm3, %%xmm8\n"
977 "shufps $68, %%xmm8, %%xmm8\n"
978 "xorps %%xmm9, %%xmm9\n"
979 "movaps %%xmm3, %%xmm10\n"
980 "shufps $14, %%xmm9, %%xmm10\n"
981 "movaps %%xmm3, %%xmm11\n"
982 "shufps $224, %%xmm11, %%xmm9\n"
983 "addps %%xmm8, %%xmm10\n"
984 "subps %%xmm9, %%xmm10\n"
985 "movaps %%xmm10, %%xmm3\n"
986 "movaps %%xmm4, %%xmm8\n"
987 "shufps $68, %%xmm8, %%xmm8\n"
988 "xorps %%xmm9, %%xmm9\n"
989 "movaps %%xmm4, %%xmm10\n"
990 "shufps $14, %%xmm9, %%xmm10\n"
991 "movaps %%xmm4, %%xmm11\n"
992 "shufps $224, %%xmm11, %%xmm9\n"
993 "addps %%xmm8, %%xmm10\n"
994 "subps %%xmm9, %%xmm10\n"
995 "movaps %%xmm10, %%xmm4\n"
996 "movaps %%xmm5, %%xmm8\n"
997 "shufps $68, %%xmm8, %%xmm8\n"
998 "xorps %%xmm9, %%xmm9\n"
999 "movaps %%xmm5, %%xmm10\n"
1000 "shufps $14, %%xmm9, %%xmm10\n"
1001 "movaps %%xmm5, %%xmm11\n"
1002 "shufps $224, %%xmm11, %%xmm9\n"
1003 "addps %%xmm8, %%xmm10\n"
1004 "subps %%xmm9, %%xmm10\n"
1005 "movaps %%xmm10, %%xmm5\n"
1006 "movaps %%xmm6, %%xmm8\n"
1007 "shufps $68, %%xmm8, %%xmm8\n"
1008 "xorps %%xmm9, %%xmm9\n"
1009 "movaps %%xmm6, %%xmm10\n"
1010 "shufps $14, %%xmm9, %%xmm10\n"
1011 "movaps %%xmm6, %%xmm11\n"
1012 "shufps $224, %%xmm11, %%xmm9\n"
1013 "addps %%xmm8, %%xmm10\n"
1014 "subps %%xmm9, %%xmm10\n"
1015 "movaps %%xmm10, %%xmm6\n"
1016 "movaps %%xmm7, %%xmm8\n"
1017 "shufps $68, %%xmm8, %%xmm8\n"
1018 "xorps %%xmm9, %%xmm9\n"
1019 "movaps %%xmm7, %%xmm10\n"
1020 "shufps $14, %%xmm9, %%xmm10\n"
1021 "movaps %%xmm7, %%xmm11\n"
1022 "shufps $224, %%xmm11, %%xmm9\n"
1023 "addps %%xmm8, %%xmm10\n"
1024 "subps %%xmm9, %%xmm10\n"
1025 "movaps %%xmm10, %%xmm7\n"
1026 "movaps %%xmm0, %%xmm8\n"
1027 "movaps %%xmm0, %%xmm9\n"
1028 "addps %%xmm1, %%xmm8\n"
1029 "subps %%xmm1, %%xmm9\n"
1030 "movaps %%xmm2, %%xmm10\n"
1031 "movaps %%xmm2, %%xmm11\n"
1032 "addps %%xmm3, %%xmm10\n"
1033 "subps %%xmm3, %%xmm11\n"
1034 "movaps %%xmm4, %%xmm12\n"
1035 "movaps %%xmm4, %%xmm13\n"
1036 "addps %%xmm5, %%xmm12\n"
1037 "subps %%xmm5, %%xmm13\n"
1038 "movaps %%xmm6, %%xmm14\n"
1039 "movaps %%xmm6, %%xmm15\n"
1040 "addps %%xmm7, %%xmm14\n"
1041 "subps %%xmm7, %%xmm15\n"
1042 "movaps %%xmm8, %%xmm0\n"
1043 "movaps %%xmm8, %%xmm2\n"
1044 "addps %%xmm10, %%xmm0\n"
1045 "subps %%xmm10, %%xmm2\n"
1046 "movaps %%xmm9, %%xmm1\n"
1047 "movaps %%xmm9, %%xmm3\n"
1048 "addps %%xmm11, %%xmm1\n"
1049 "subps %%xmm11, %%xmm3\n"
1050 "movaps %%xmm12, %%xmm4\n"
1051 "movaps %%xmm12, %%xmm6\n"
1052 "addps %%xmm14, %%xmm4\n"
1053 "subps %%xmm14, %%xmm6\n"
1054 "movaps %%xmm13, %%xmm5\n"
1055 "movaps %%xmm13, %%xmm7\n"
1056 "addps %%xmm15, %%xmm5\n"
1057 "subps %%xmm15, %%xmm7\n"
1058 "movaps %%xmm0, %%xmm8\n"
1059 "movaps %%xmm0, %%xmm12\n"
1060 "addps %%xmm4, %%xmm8\n"
1061 "subps %%xmm4, %%xmm12\n"
1062 "movaps %%xmm1, %%xmm9\n"
1063 "movaps %%xmm1, %%xmm13\n"
1064 "addps %%xmm5, %%xmm9\n"
1065 "subps %%xmm5, %%xmm13\n"
1066 "movaps %%xmm2, %%xmm10\n"
1067 "movaps %%xmm2, %%xmm14\n"
1068 "addps %%xmm6, %%xmm10\n"
1069 "subps %%xmm6, %%xmm14\n"
1070 "movaps %%xmm3, %%xmm11\n"
1071 "movaps %%xmm3, %%xmm15\n"
1072 "addps %%xmm7, %%xmm11\n"
1073 "subps %%xmm7, %%xmm15\n"
1074 "movups %%xmm8, (%0)\n"
1075 "movups %%xmm9, (%1)\n"
1076 "movups %%xmm10, (%2)\n"
1077 "movups %%xmm11, (%3)\n"
1078 "movups %%xmm12, (%4)\n"
1079 "movups %%xmm13, (%5)\n"
1080 "movups %%xmm14, (%6)\n"
1081 "movups %%xmm15, (%7)\n"
1082 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
1083 );
1084 }
1085 }
1086 for (int j = 0; j < 256; j += 256) {
1087 for (int k = 0; k < 32; k += 4) {
1088 __asm__ volatile (
1089 "movups (%0), %%xmm0\n"
1090 "movups (%1), %%xmm1\n"
1091 "movups (%2), %%xmm2\n"
1092 "movups (%3), %%xmm3\n"
1093 "movups (%4), %%xmm4\n"
1094 "movups (%5), %%xmm5\n"
1095 "movups (%6), %%xmm6\n"
1096 "movups (%7), %%xmm7\n"
1097 "movaps %%xmm0, %%xmm8\n"
1098 "movaps %%xmm0, %%xmm9\n"
1099 "addps %%xmm1, %%xmm8\n"
1100 "subps %%xmm1, %%xmm9\n"
1101 "movaps %%xmm2, %%xmm10\n"
1102 "movaps %%xmm2, %%xmm11\n"
1103 "addps %%xmm3, %%xmm10\n"
1104 "subps %%xmm3, %%xmm11\n"
1105 "movaps %%xmm4, %%xmm12\n"
1106 "movaps %%xmm4, %%xmm13\n"
1107 "addps %%xmm5, %%xmm12\n"
1108 "subps %%xmm5, %%xmm13\n"
1109 "movaps %%xmm6, %%xmm14\n"
1110 "movaps %%xmm6, %%xmm15\n"
1111 "addps %%xmm7, %%xmm14\n"
1112 "subps %%xmm7, %%xmm15\n"
1113 "movaps %%xmm8, %%xmm0\n"
1114 "movaps %%xmm8, %%xmm2\n"
1115 "addps %%xmm10, %%xmm0\n"
1116 "subps %%xmm10, %%xmm2\n"
1117 "movaps %%xmm9, %%xmm1\n"
1118 "movaps %%xmm9, %%xmm3\n"
1119 "addps %%xmm11, %%xmm1\n"
1120 "subps %%xmm11, %%xmm3\n"
1121 "movaps %%xmm12, %%xmm4\n"
1122 "movaps %%xmm12, %%xmm6\n"
1123 "addps %%xmm14, %%xmm4\n"
1124 "subps %%xmm14, %%xmm6\n"
1125 "movaps %%xmm13, %%xmm5\n"
1126 "movaps %%xmm13, %%xmm7\n"
1127 "addps %%xmm15, %%xmm5\n"
1128 "subps %%xmm15, %%xmm7\n"
1129 "movaps %%xmm0, %%xmm8\n"
1130 "movaps %%xmm0, %%xmm12\n"
1131 "addps %%xmm4, %%xmm8\n"
1132 "subps %%xmm4, %%xmm12\n"
1133 "movaps %%xmm1, %%xmm9\n"
1134 "movaps %%xmm1, %%xmm13\n"
1135 "addps %%xmm5, %%xmm9\n"
1136 "subps %%xmm5, %%xmm13\n"
1137 "movaps %%xmm2, %%xmm10\n"
1138 "movaps %%xmm2, %%xmm14\n"
1139 "addps %%xmm6, %%xmm10\n"
1140 "subps %%xmm6, %%xmm14\n"
1141 "movaps %%xmm3, %%xmm11\n"
1142 "movaps %%xmm3, %%xmm15\n"
1143 "addps %%xmm7, %%xmm11\n"
1144 "subps %%xmm7, %%xmm15\n"
1145 "movups %%xmm8, (%0)\n"
1146 "movups %%xmm9, (%1)\n"
1147 "movups %%xmm10, (%2)\n"
1148 "movups %%xmm11, (%3)\n"
1149 "movups %%xmm12, (%4)\n"
1150 "movups %%xmm13, (%5)\n"
1151 "movups %%xmm14, (%6)\n"
1152 "movups %%xmm15, (%7)\n"
1153 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
1154 );
1155 }
1156 }
1157 }
static inline void helper_float_9(float *buf);
/*
 * helper_float_9 — one full in-place pass of a 512-point unnormalized
 * Hadamard-style transform over buf[0..511] (radix-2 add/sub butterflies,
 * SSE + SSE3), matching the smaller helper_float_N kernels above.
 *
 * Requires: buf points to at least 512 floats. Alignment is NOT required
 * (all loads/stores use movups). Requires SSE3 (addsubps).
 *
 * Structure (generated code, three stage groups):
 *   1) For each 32-float chunk: butterflies at strides 1,2 inside each
 *      xmm register, then at strides 4,8,16 across the 8 registers.
 *   2) For each 256-float chunk: butterflies at strides 32,64,128.
 *   3) Final butterfly at stride 256 combining the two 256-float halves.
 *
 * Note on the asm constraints: the pointers are passed as input-only "r"
 * operands even though the code stores through them; correctness relies on
 * the "memory" clobber plus `volatile` to keep the compiler from caching
 * or reordering buf accesses around each asm block.
 */
static inline void helper_float_9(float *buf) {
    /* Stage group 1: transform each 32-float chunk independently. */
    for (int j = 0; j < 512; j += 32) {
        for (int k = 0; k < 4; k += 4) {
            __asm__ volatile (
                /* Load the 32-float chunk into xmm0..xmm7 (4 floats each). */
                "movups (%0), %%xmm0\n"
                "movups (%1), %%xmm1\n"
                "movups (%2), %%xmm2\n"
                "movups (%3), %%xmm3\n"
                "movups (%4), %%xmm4\n"
                "movups (%5), %%xmm5\n"
                "movups (%6), %%xmm6\n"
                "movups (%7), %%xmm7\n"
                /*
                 * Stride-1 butterfly within each register.  For x=[a,b,c,d]:
                 *   shufps $160 copy  -> [a,a,c,c]   (selector 0,0,2,2)
                 *   shufps $245 x     -> [b,b,d,d]   (selector 1,1,3,3)
                 *   negate, addsubps  -> [a+b, a-b, c+d, c-d]
                 */
                "movaps %%xmm0, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm0, %%xmm0\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm0, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm0\n"
                /* Same stride-1 butterfly, repeated for xmm1..xmm7. */
                "movaps %%xmm1, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm1, %%xmm1\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm1, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm1\n"
                "movaps %%xmm2, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm2, %%xmm2\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm2, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm2\n"
                "movaps %%xmm3, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm3, %%xmm3\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm3, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm3\n"
                "movaps %%xmm4, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm4, %%xmm4\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm4, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm4\n"
                "movaps %%xmm5, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm5, %%xmm5\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm5, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm5\n"
                "movaps %%xmm6, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm6, %%xmm6\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm6, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm6\n"
                "movaps %%xmm7, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm7, %%xmm7\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm7, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm7\n"
                /*
                 * Stride-2 butterfly within each register.  For x=[a,b,c,d]:
                 *   shufps $68 copy        -> [a,b,a,b]
                 *   shufps $14 with zero   -> [c,d,0,0]
                 *   shufps $224 into zero  -> [0,0,c,d]
                 *   add/sub combination    -> [a+c, b+d, a-c, b-d]
                 */
                "movaps %%xmm0, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm0, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm0, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm0\n"
                /* Same stride-2 butterfly, repeated for xmm1..xmm7. */
                "movaps %%xmm1, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm1, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm1, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm1\n"
                "movaps %%xmm2, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm2, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm2, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm2\n"
                "movaps %%xmm3, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm3, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm3, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm3\n"
                "movaps %%xmm4, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm4, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm4, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm4\n"
                "movaps %%xmm5, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm5, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm5, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm5\n"
                "movaps %%xmm6, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm6, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm6, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm6\n"
                "movaps %%xmm7, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm7, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm7, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm7\n"
                /* Stride-4 butterfly across register pairs (0,1)(2,3)(4,5)(6,7). */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm9\n"
                "addps %%xmm1, %%xmm8\n"
                "subps %%xmm1, %%xmm9\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm11\n"
                "addps %%xmm3, %%xmm10\n"
                "subps %%xmm3, %%xmm11\n"
                "movaps %%xmm4, %%xmm12\n"
                "movaps %%xmm4, %%xmm13\n"
                "addps %%xmm5, %%xmm12\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm6, %%xmm14\n"
                "movaps %%xmm6, %%xmm15\n"
                "addps %%xmm7, %%xmm14\n"
                "subps %%xmm7, %%xmm15\n"
                /* Stride-8 butterfly (results of the previous stage). */
                "movaps %%xmm8, %%xmm0\n"
                "movaps %%xmm8, %%xmm2\n"
                "addps %%xmm10, %%xmm0\n"
                "subps %%xmm10, %%xmm2\n"
                "movaps %%xmm9, %%xmm1\n"
                "movaps %%xmm9, %%xmm3\n"
                "addps %%xmm11, %%xmm1\n"
                "subps %%xmm11, %%xmm3\n"
                "movaps %%xmm12, %%xmm4\n"
                "movaps %%xmm12, %%xmm6\n"
                "addps %%xmm14, %%xmm4\n"
                "subps %%xmm14, %%xmm6\n"
                "movaps %%xmm13, %%xmm5\n"
                "movaps %%xmm13, %%xmm7\n"
                "addps %%xmm15, %%xmm5\n"
                "subps %%xmm15, %%xmm7\n"
                /* Stride-16 butterfly, completing the 32-point transform. */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm12\n"
                "addps %%xmm4, %%xmm8\n"
                "subps %%xmm4, %%xmm12\n"
                "movaps %%xmm1, %%xmm9\n"
                "movaps %%xmm1, %%xmm13\n"
                "addps %%xmm5, %%xmm9\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm14\n"
                "addps %%xmm6, %%xmm10\n"
                "subps %%xmm6, %%xmm14\n"
                "movaps %%xmm3, %%xmm11\n"
                "movaps %%xmm3, %%xmm15\n"
                "addps %%xmm7, %%xmm11\n"
                "subps %%xmm7, %%xmm15\n"
                /* Store the transformed chunk back in place. */
                "movups %%xmm8, (%0)\n"
                "movups %%xmm9, (%1)\n"
                "movups %%xmm10, (%2)\n"
                "movups %%xmm11, (%3)\n"
                "movups %%xmm12, (%4)\n"
                "movups %%xmm13, (%5)\n"
                "movups %%xmm14, (%6)\n"
                "movups %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
    /* Stage group 2: butterflies at strides 32/64/128 within each 256-float chunk. */
    for (int j = 0; j < 512; j += 256) {
        for (int k = 0; k < 32; k += 4) {
            __asm__ volatile (
                /* Load one 4-float lane from each of the 8 stride-32 sub-blocks. */
                "movups (%0), %%xmm0\n"
                "movups (%1), %%xmm1\n"
                "movups (%2), %%xmm2\n"
                "movups (%3), %%xmm3\n"
                "movups (%4), %%xmm4\n"
                "movups (%5), %%xmm5\n"
                "movups (%6), %%xmm6\n"
                "movups (%7), %%xmm7\n"
                /* Stride-32 butterfly. */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm9\n"
                "addps %%xmm1, %%xmm8\n"
                "subps %%xmm1, %%xmm9\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm11\n"
                "addps %%xmm3, %%xmm10\n"
                "subps %%xmm3, %%xmm11\n"
                "movaps %%xmm4, %%xmm12\n"
                "movaps %%xmm4, %%xmm13\n"
                "addps %%xmm5, %%xmm12\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm6, %%xmm14\n"
                "movaps %%xmm6, %%xmm15\n"
                "addps %%xmm7, %%xmm14\n"
                "subps %%xmm7, %%xmm15\n"
                /* Stride-64 butterfly. */
                "movaps %%xmm8, %%xmm0\n"
                "movaps %%xmm8, %%xmm2\n"
                "addps %%xmm10, %%xmm0\n"
                "subps %%xmm10, %%xmm2\n"
                "movaps %%xmm9, %%xmm1\n"
                "movaps %%xmm9, %%xmm3\n"
                "addps %%xmm11, %%xmm1\n"
                "subps %%xmm11, %%xmm3\n"
                "movaps %%xmm12, %%xmm4\n"
                "movaps %%xmm12, %%xmm6\n"
                "addps %%xmm14, %%xmm4\n"
                "subps %%xmm14, %%xmm6\n"
                "movaps %%xmm13, %%xmm5\n"
                "movaps %%xmm13, %%xmm7\n"
                "addps %%xmm15, %%xmm5\n"
                "subps %%xmm15, %%xmm7\n"
                /* Stride-128 butterfly. */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm12\n"
                "addps %%xmm4, %%xmm8\n"
                "subps %%xmm4, %%xmm12\n"
                "movaps %%xmm1, %%xmm9\n"
                "movaps %%xmm1, %%xmm13\n"
                "addps %%xmm5, %%xmm9\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm14\n"
                "addps %%xmm6, %%xmm10\n"
                "subps %%xmm6, %%xmm14\n"
                "movaps %%xmm3, %%xmm11\n"
                "movaps %%xmm3, %%xmm15\n"
                "addps %%xmm7, %%xmm11\n"
                "subps %%xmm7, %%xmm15\n"
                "movups %%xmm8, (%0)\n"
                "movups %%xmm9, (%1)\n"
                "movups %%xmm10, (%2)\n"
                "movups %%xmm11, (%3)\n"
                "movups %%xmm12, (%4)\n"
                "movups %%xmm13, (%5)\n"
                "movups %%xmm14, (%6)\n"
                "movups %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
    /* Stage group 3: final stride-256 butterfly combining the two halves. */
    for (int j = 0; j < 512; j += 512) {
        for (int k = 0; k < 256; k += 4) {
            __asm__ volatile (
                "movups (%0), %%xmm0\n"
                "movups (%1), %%xmm1\n"
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm9\n"
                "addps %%xmm1, %%xmm8\n"
                "subps %%xmm1, %%xmm9\n"
                "movups %%xmm8, (%0)\n"
                "movups %%xmm9, (%1)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
}
static inline void helper_float_10(float *buf);
/*
 * helper_float_10 — one full in-place pass of a 1024-point unnormalized
 * Hadamard-style transform over buf[0..1023] (radix-2 add/sub butterflies,
 * SSE + SSE3).  Same generated structure as helper_float_9, one size up.
 *
 * Requires: buf points to at least 1024 floats; no alignment requirement
 * (movups throughout); SSE3 for addsubps.
 *
 * Stage groups:
 *   1) Per 32-float chunk: in-register butterflies (strides 1,2) plus
 *      cross-register butterflies (strides 4,8,16).
 *   2) Per 256-float chunk: butterflies at strides 32,64,128.
 *   3) Final two stages (strides 256 and 512) fused into one asm block
 *      operating on four 256-float quarters at once.
 *
 * The pointer operands are input-only "r" constraints; the "memory"
 * clobber + `volatile` make the in-place stores visible to the compiler.
 */
static inline void helper_float_10(float *buf) {
    /* Stage group 1: transform each 32-float chunk independently. */
    for (int j = 0; j < 1024; j += 32) {
        for (int k = 0; k < 4; k += 4) {
            __asm__ volatile (
                /* Load the 32-float chunk into xmm0..xmm7. */
                "movups (%0), %%xmm0\n"
                "movups (%1), %%xmm1\n"
                "movups (%2), %%xmm2\n"
                "movups (%3), %%xmm3\n"
                "movups (%4), %%xmm4\n"
                "movups (%5), %%xmm5\n"
                "movups (%6), %%xmm6\n"
                "movups (%7), %%xmm7\n"
                /*
                 * Stride-1 butterfly within each register:
                 * [a,b,c,d] -> [a+b, a-b, c+d, c-d] via shufps ($160/$245),
                 * negation, and addsubps.  Repeated for xmm0..xmm7.
                 */
                "movaps %%xmm0, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm0, %%xmm0\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm0, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm0\n"
                "movaps %%xmm1, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm1, %%xmm1\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm1, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm1\n"
                "movaps %%xmm2, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm2, %%xmm2\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm2, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm2\n"
                "movaps %%xmm3, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm3, %%xmm3\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm3, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm3\n"
                "movaps %%xmm4, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm4, %%xmm4\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm4, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm4\n"
                "movaps %%xmm5, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm5, %%xmm5\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm5, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm5\n"
                "movaps %%xmm6, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm6, %%xmm6\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm6, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm6\n"
                "movaps %%xmm7, %%xmm8\n"
                "shufps $160, %%xmm8, %%xmm8\n"
                "shufps $245, %%xmm7, %%xmm7\n"
                "xorps %%xmm9, %%xmm9\n"
                "subps %%xmm7, %%xmm9\n"
                "addsubps %%xmm9, %%xmm8\n"
                "movaps %%xmm8, %%xmm7\n"
                /*
                 * Stride-2 butterfly within each register:
                 * [a,b,c,d] -> [a+c, b+d, a-c, b-d] via shufps $68/$14/$224
                 * against a zero register.  Repeated for xmm0..xmm7.
                 */
                "movaps %%xmm0, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm0, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm0, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm0\n"
                "movaps %%xmm1, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm1, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm1, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm1\n"
                "movaps %%xmm2, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm2, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm2, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm2\n"
                "movaps %%xmm3, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm3, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm3, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm3\n"
                "movaps %%xmm4, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm4, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm4, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm4\n"
                "movaps %%xmm5, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm5, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm5, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm5\n"
                "movaps %%xmm6, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm6, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm6, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm6\n"
                "movaps %%xmm7, %%xmm8\n"
                "shufps $68, %%xmm8, %%xmm8\n"
                "xorps %%xmm9, %%xmm9\n"
                "movaps %%xmm7, %%xmm10\n"
                "shufps $14, %%xmm9, %%xmm10\n"
                "movaps %%xmm7, %%xmm11\n"
                "shufps $224, %%xmm11, %%xmm9\n"
                "addps %%xmm8, %%xmm10\n"
                "subps %%xmm9, %%xmm10\n"
                "movaps %%xmm10, %%xmm7\n"
                /* Stride-4 butterfly across register pairs. */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm9\n"
                "addps %%xmm1, %%xmm8\n"
                "subps %%xmm1, %%xmm9\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm11\n"
                "addps %%xmm3, %%xmm10\n"
                "subps %%xmm3, %%xmm11\n"
                "movaps %%xmm4, %%xmm12\n"
                "movaps %%xmm4, %%xmm13\n"
                "addps %%xmm5, %%xmm12\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm6, %%xmm14\n"
                "movaps %%xmm6, %%xmm15\n"
                "addps %%xmm7, %%xmm14\n"
                "subps %%xmm7, %%xmm15\n"
                /* Stride-8 butterfly. */
                "movaps %%xmm8, %%xmm0\n"
                "movaps %%xmm8, %%xmm2\n"
                "addps %%xmm10, %%xmm0\n"
                "subps %%xmm10, %%xmm2\n"
                "movaps %%xmm9, %%xmm1\n"
                "movaps %%xmm9, %%xmm3\n"
                "addps %%xmm11, %%xmm1\n"
                "subps %%xmm11, %%xmm3\n"
                "movaps %%xmm12, %%xmm4\n"
                "movaps %%xmm12, %%xmm6\n"
                "addps %%xmm14, %%xmm4\n"
                "subps %%xmm14, %%xmm6\n"
                "movaps %%xmm13, %%xmm5\n"
                "movaps %%xmm13, %%xmm7\n"
                "addps %%xmm15, %%xmm5\n"
                "subps %%xmm15, %%xmm7\n"
                /* Stride-16 butterfly, completing the 32-point transform. */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm12\n"
                "addps %%xmm4, %%xmm8\n"
                "subps %%xmm4, %%xmm12\n"
                "movaps %%xmm1, %%xmm9\n"
                "movaps %%xmm1, %%xmm13\n"
                "addps %%xmm5, %%xmm9\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm14\n"
                "addps %%xmm6, %%xmm10\n"
                "subps %%xmm6, %%xmm14\n"
                "movaps %%xmm3, %%xmm11\n"
                "movaps %%xmm3, %%xmm15\n"
                "addps %%xmm7, %%xmm11\n"
                "subps %%xmm7, %%xmm15\n"
                /* Store the transformed chunk back in place. */
                "movups %%xmm8, (%0)\n"
                "movups %%xmm9, (%1)\n"
                "movups %%xmm10, (%2)\n"
                "movups %%xmm11, (%3)\n"
                "movups %%xmm12, (%4)\n"
                "movups %%xmm13, (%5)\n"
                "movups %%xmm14, (%6)\n"
                "movups %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
    /* Stage group 2: butterflies at strides 32/64/128 within each 256-float chunk. */
    for (int j = 0; j < 1024; j += 256) {
        for (int k = 0; k < 32; k += 4) {
            __asm__ volatile (
                /* Load one 4-float lane from each of the 8 stride-32 sub-blocks. */
                "movups (%0), %%xmm0\n"
                "movups (%1), %%xmm1\n"
                "movups (%2), %%xmm2\n"
                "movups (%3), %%xmm3\n"
                "movups (%4), %%xmm4\n"
                "movups (%5), %%xmm5\n"
                "movups (%6), %%xmm6\n"
                "movups (%7), %%xmm7\n"
                /* Stride-32 butterfly. */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm9\n"
                "addps %%xmm1, %%xmm8\n"
                "subps %%xmm1, %%xmm9\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm11\n"
                "addps %%xmm3, %%xmm10\n"
                "subps %%xmm3, %%xmm11\n"
                "movaps %%xmm4, %%xmm12\n"
                "movaps %%xmm4, %%xmm13\n"
                "addps %%xmm5, %%xmm12\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm6, %%xmm14\n"
                "movaps %%xmm6, %%xmm15\n"
                "addps %%xmm7, %%xmm14\n"
                "subps %%xmm7, %%xmm15\n"
                /* Stride-64 butterfly. */
                "movaps %%xmm8, %%xmm0\n"
                "movaps %%xmm8, %%xmm2\n"
                "addps %%xmm10, %%xmm0\n"
                "subps %%xmm10, %%xmm2\n"
                "movaps %%xmm9, %%xmm1\n"
                "movaps %%xmm9, %%xmm3\n"
                "addps %%xmm11, %%xmm1\n"
                "subps %%xmm11, %%xmm3\n"
                "movaps %%xmm12, %%xmm4\n"
                "movaps %%xmm12, %%xmm6\n"
                "addps %%xmm14, %%xmm4\n"
                "subps %%xmm14, %%xmm6\n"
                "movaps %%xmm13, %%xmm5\n"
                "movaps %%xmm13, %%xmm7\n"
                "addps %%xmm15, %%xmm5\n"
                "subps %%xmm15, %%xmm7\n"
                /* Stride-128 butterfly. */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm12\n"
                "addps %%xmm4, %%xmm8\n"
                "subps %%xmm4, %%xmm12\n"
                "movaps %%xmm1, %%xmm9\n"
                "movaps %%xmm1, %%xmm13\n"
                "addps %%xmm5, %%xmm9\n"
                "subps %%xmm5, %%xmm13\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm14\n"
                "addps %%xmm6, %%xmm10\n"
                "subps %%xmm6, %%xmm14\n"
                "movaps %%xmm3, %%xmm11\n"
                "movaps %%xmm3, %%xmm15\n"
                "addps %%xmm7, %%xmm11\n"
                "subps %%xmm7, %%xmm15\n"
                "movups %%xmm8, (%0)\n"
                "movups %%xmm9, (%1)\n"
                "movups %%xmm10, (%2)\n"
                "movups %%xmm11, (%3)\n"
                "movups %%xmm12, (%4)\n"
                "movups %%xmm13, (%5)\n"
                "movups %%xmm14, (%6)\n"
                "movups %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
    /* Stage group 3: fused stride-256 and stride-512 butterflies over the four quarters. */
    for (int j = 0; j < 1024; j += 1024) {
        for (int k = 0; k < 256; k += 4) {
            __asm__ volatile (
                "movups (%0), %%xmm0\n"
                "movups (%1), %%xmm1\n"
                "movups (%2), %%xmm2\n"
                "movups (%3), %%xmm3\n"
                /* Stride-256 butterfly: (q0,q1) and (q2,q3). */
                "movaps %%xmm0, %%xmm8\n"
                "movaps %%xmm0, %%xmm9\n"
                "addps %%xmm1, %%xmm8\n"
                "subps %%xmm1, %%xmm9\n"
                "movaps %%xmm2, %%xmm10\n"
                "movaps %%xmm2, %%xmm11\n"
                "addps %%xmm3, %%xmm10\n"
                "subps %%xmm3, %%xmm11\n"
                /* Stride-512 butterfly on the intermediate sums/differences. */
                "movaps %%xmm8, %%xmm0\n"
                "movaps %%xmm8, %%xmm2\n"
                "addps %%xmm10, %%xmm0\n"
                "subps %%xmm10, %%xmm2\n"
                "movaps %%xmm9, %%xmm1\n"
                "movaps %%xmm9, %%xmm3\n"
                "addps %%xmm11, %%xmm1\n"
                "subps %%xmm11, %%xmm3\n"
                "movups %%xmm0, (%0)\n"
                "movups %%xmm1, (%1)\n"
                "movups %%xmm2, (%2)\n"
                "movups %%xmm3, (%3)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
}
1766 static inline void helper_float_11(float *buf);
helper_float_11(float * buf)1767 static inline void helper_float_11(float *buf) {
1768 for (int j = 0; j < 2048; j += 32) {
1769 for (int k = 0; k < 4; k += 4) {
1770 __asm__ volatile (
1771 "movups (%0), %%xmm0\n"
1772 "movups (%1), %%xmm1\n"
1773 "movups (%2), %%xmm2\n"
1774 "movups (%3), %%xmm3\n"
1775 "movups (%4), %%xmm4\n"
1776 "movups (%5), %%xmm5\n"
1777 "movups (%6), %%xmm6\n"
1778 "movups (%7), %%xmm7\n"
1779 "movaps %%xmm0, %%xmm8\n"
1780 "shufps $160, %%xmm8, %%xmm8\n"
1781 "shufps $245, %%xmm0, %%xmm0\n"
1782 "xorps %%xmm9, %%xmm9\n"
1783 "subps %%xmm0, %%xmm9\n"
1784 "addsubps %%xmm9, %%xmm8\n"
1785 "movaps %%xmm8, %%xmm0\n"
1786 "movaps %%xmm1, %%xmm8\n"
1787 "shufps $160, %%xmm8, %%xmm8\n"
1788 "shufps $245, %%xmm1, %%xmm1\n"
1789 "xorps %%xmm9, %%xmm9\n"
1790 "subps %%xmm1, %%xmm9\n"
1791 "addsubps %%xmm9, %%xmm8\n"
1792 "movaps %%xmm8, %%xmm1\n"
1793 "movaps %%xmm2, %%xmm8\n"
1794 "shufps $160, %%xmm8, %%xmm8\n"
1795 "shufps $245, %%xmm2, %%xmm2\n"
1796 "xorps %%xmm9, %%xmm9\n"
1797 "subps %%xmm2, %%xmm9\n"
1798 "addsubps %%xmm9, %%xmm8\n"
1799 "movaps %%xmm8, %%xmm2\n"
1800 "movaps %%xmm3, %%xmm8\n"
1801 "shufps $160, %%xmm8, %%xmm8\n"
1802 "shufps $245, %%xmm3, %%xmm3\n"
1803 "xorps %%xmm9, %%xmm9\n"
1804 "subps %%xmm3, %%xmm9\n"
1805 "addsubps %%xmm9, %%xmm8\n"
1806 "movaps %%xmm8, %%xmm3\n"
1807 "movaps %%xmm4, %%xmm8\n"
1808 "shufps $160, %%xmm8, %%xmm8\n"
1809 "shufps $245, %%xmm4, %%xmm4\n"
1810 "xorps %%xmm9, %%xmm9\n"
1811 "subps %%xmm4, %%xmm9\n"
1812 "addsubps %%xmm9, %%xmm8\n"
1813 "movaps %%xmm8, %%xmm4\n"
1814 "movaps %%xmm5, %%xmm8\n"
1815 "shufps $160, %%xmm8, %%xmm8\n"
1816 "shufps $245, %%xmm5, %%xmm5\n"
1817 "xorps %%xmm9, %%xmm9\n"
1818 "subps %%xmm5, %%xmm9\n"
1819 "addsubps %%xmm9, %%xmm8\n"
1820 "movaps %%xmm8, %%xmm5\n"
1821 "movaps %%xmm6, %%xmm8\n"
1822 "shufps $160, %%xmm8, %%xmm8\n"
1823 "shufps $245, %%xmm6, %%xmm6\n"
1824 "xorps %%xmm9, %%xmm9\n"
1825 "subps %%xmm6, %%xmm9\n"
1826 "addsubps %%xmm9, %%xmm8\n"
1827 "movaps %%xmm8, %%xmm6\n"
1828 "movaps %%xmm7, %%xmm8\n"
1829 "shufps $160, %%xmm8, %%xmm8\n"
1830 "shufps $245, %%xmm7, %%xmm7\n"
1831 "xorps %%xmm9, %%xmm9\n"
1832 "subps %%xmm7, %%xmm9\n"
1833 "addsubps %%xmm9, %%xmm8\n"
1834 "movaps %%xmm8, %%xmm7\n"
1835 "movaps %%xmm0, %%xmm8\n"
1836 "shufps $68, %%xmm8, %%xmm8\n"
1837 "xorps %%xmm9, %%xmm9\n"
1838 "movaps %%xmm0, %%xmm10\n"
1839 "shufps $14, %%xmm9, %%xmm10\n"
1840 "movaps %%xmm0, %%xmm11\n"
1841 "shufps $224, %%xmm11, %%xmm9\n"
1842 "addps %%xmm8, %%xmm10\n"
1843 "subps %%xmm9, %%xmm10\n"
1844 "movaps %%xmm10, %%xmm0\n"
1845 "movaps %%xmm1, %%xmm8\n"
1846 "shufps $68, %%xmm8, %%xmm8\n"
1847 "xorps %%xmm9, %%xmm9\n"
1848 "movaps %%xmm1, %%xmm10\n"
1849 "shufps $14, %%xmm9, %%xmm10\n"
1850 "movaps %%xmm1, %%xmm11\n"
1851 "shufps $224, %%xmm11, %%xmm9\n"
1852 "addps %%xmm8, %%xmm10\n"
1853 "subps %%xmm9, %%xmm10\n"
1854 "movaps %%xmm10, %%xmm1\n"
1855 "movaps %%xmm2, %%xmm8\n"
1856 "shufps $68, %%xmm8, %%xmm8\n"
1857 "xorps %%xmm9, %%xmm9\n"
1858 "movaps %%xmm2, %%xmm10\n"
1859 "shufps $14, %%xmm9, %%xmm10\n"
1860 "movaps %%xmm2, %%xmm11\n"
1861 "shufps $224, %%xmm11, %%xmm9\n"
1862 "addps %%xmm8, %%xmm10\n"
1863 "subps %%xmm9, %%xmm10\n"
1864 "movaps %%xmm10, %%xmm2\n"
1865 "movaps %%xmm3, %%xmm8\n"
1866 "shufps $68, %%xmm8, %%xmm8\n"
1867 "xorps %%xmm9, %%xmm9\n"
1868 "movaps %%xmm3, %%xmm10\n"
1869 "shufps $14, %%xmm9, %%xmm10\n"
1870 "movaps %%xmm3, %%xmm11\n"
1871 "shufps $224, %%xmm11, %%xmm9\n"
1872 "addps %%xmm8, %%xmm10\n"
1873 "subps %%xmm9, %%xmm10\n"
1874 "movaps %%xmm10, %%xmm3\n"
1875 "movaps %%xmm4, %%xmm8\n"
1876 "shufps $68, %%xmm8, %%xmm8\n"
1877 "xorps %%xmm9, %%xmm9\n"
1878 "movaps %%xmm4, %%xmm10\n"
1879 "shufps $14, %%xmm9, %%xmm10\n"
1880 "movaps %%xmm4, %%xmm11\n"
1881 "shufps $224, %%xmm11, %%xmm9\n"
1882 "addps %%xmm8, %%xmm10\n"
1883 "subps %%xmm9, %%xmm10\n"
1884 "movaps %%xmm10, %%xmm4\n"
1885 "movaps %%xmm5, %%xmm8\n"
1886 "shufps $68, %%xmm8, %%xmm8\n"
1887 "xorps %%xmm9, %%xmm9\n"
1888 "movaps %%xmm5, %%xmm10\n"
1889 "shufps $14, %%xmm9, %%xmm10\n"
1890 "movaps %%xmm5, %%xmm11\n"
1891 "shufps $224, %%xmm11, %%xmm9\n"
1892 "addps %%xmm8, %%xmm10\n"
1893 "subps %%xmm9, %%xmm10\n"
1894 "movaps %%xmm10, %%xmm5\n"
1895 "movaps %%xmm6, %%xmm8\n"
1896 "shufps $68, %%xmm8, %%xmm8\n"
1897 "xorps %%xmm9, %%xmm9\n"
1898 "movaps %%xmm6, %%xmm10\n"
1899 "shufps $14, %%xmm9, %%xmm10\n"
1900 "movaps %%xmm6, %%xmm11\n"
1901 "shufps $224, %%xmm11, %%xmm9\n"
1902 "addps %%xmm8, %%xmm10\n"
1903 "subps %%xmm9, %%xmm10\n"
1904 "movaps %%xmm10, %%xmm6\n"
1905 "movaps %%xmm7, %%xmm8\n"
1906 "shufps $68, %%xmm8, %%xmm8\n"
1907 "xorps %%xmm9, %%xmm9\n"
1908 "movaps %%xmm7, %%xmm10\n"
1909 "shufps $14, %%xmm9, %%xmm10\n"
1910 "movaps %%xmm7, %%xmm11\n"
1911 "shufps $224, %%xmm11, %%xmm9\n"
1912 "addps %%xmm8, %%xmm10\n"
1913 "subps %%xmm9, %%xmm10\n"
1914 "movaps %%xmm10, %%xmm7\n"
1915 "movaps %%xmm0, %%xmm8\n"
1916 "movaps %%xmm0, %%xmm9\n"
1917 "addps %%xmm1, %%xmm8\n"
1918 "subps %%xmm1, %%xmm9\n"
1919 "movaps %%xmm2, %%xmm10\n"
1920 "movaps %%xmm2, %%xmm11\n"
1921 "addps %%xmm3, %%xmm10\n"
1922 "subps %%xmm3, %%xmm11\n"
1923 "movaps %%xmm4, %%xmm12\n"
1924 "movaps %%xmm4, %%xmm13\n"
1925 "addps %%xmm5, %%xmm12\n"
1926 "subps %%xmm5, %%xmm13\n"
1927 "movaps %%xmm6, %%xmm14\n"
1928 "movaps %%xmm6, %%xmm15\n"
1929 "addps %%xmm7, %%xmm14\n"
1930 "subps %%xmm7, %%xmm15\n"
1931 "movaps %%xmm8, %%xmm0\n"
1932 "movaps %%xmm8, %%xmm2\n"
1933 "addps %%xmm10, %%xmm0\n"
1934 "subps %%xmm10, %%xmm2\n"
1935 "movaps %%xmm9, %%xmm1\n"
1936 "movaps %%xmm9, %%xmm3\n"
1937 "addps %%xmm11, %%xmm1\n"
1938 "subps %%xmm11, %%xmm3\n"
1939 "movaps %%xmm12, %%xmm4\n"
1940 "movaps %%xmm12, %%xmm6\n"
1941 "addps %%xmm14, %%xmm4\n"
1942 "subps %%xmm14, %%xmm6\n"
1943 "movaps %%xmm13, %%xmm5\n"
1944 "movaps %%xmm13, %%xmm7\n"
1945 "addps %%xmm15, %%xmm5\n"
1946 "subps %%xmm15, %%xmm7\n"
1947 "movaps %%xmm0, %%xmm8\n"
1948 "movaps %%xmm0, %%xmm12\n"
1949 "addps %%xmm4, %%xmm8\n"
1950 "subps %%xmm4, %%xmm12\n"
1951 "movaps %%xmm1, %%xmm9\n"
1952 "movaps %%xmm1, %%xmm13\n"
1953 "addps %%xmm5, %%xmm9\n"
1954 "subps %%xmm5, %%xmm13\n"
1955 "movaps %%xmm2, %%xmm10\n"
1956 "movaps %%xmm2, %%xmm14\n"
1957 "addps %%xmm6, %%xmm10\n"
1958 "subps %%xmm6, %%xmm14\n"
1959 "movaps %%xmm3, %%xmm11\n"
1960 "movaps %%xmm3, %%xmm15\n"
1961 "addps %%xmm7, %%xmm11\n"
1962 "subps %%xmm7, %%xmm15\n"
1963 "movups %%xmm8, (%0)\n"
1964 "movups %%xmm9, (%1)\n"
1965 "movups %%xmm10, (%2)\n"
1966 "movups %%xmm11, (%3)\n"
1967 "movups %%xmm12, (%4)\n"
1968 "movups %%xmm13, (%5)\n"
1969 "movups %%xmm14, (%6)\n"
1970 "movups %%xmm15, (%7)\n"
1971 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
1972 );
1973 }
1974 }
1975 for (int j = 0; j < 2048; j += 256) {
1976 for (int k = 0; k < 32; k += 4) {
1977 __asm__ volatile (
1978 "movups (%0), %%xmm0\n"
1979 "movups (%1), %%xmm1\n"
1980 "movups (%2), %%xmm2\n"
1981 "movups (%3), %%xmm3\n"
1982 "movups (%4), %%xmm4\n"
1983 "movups (%5), %%xmm5\n"
1984 "movups (%6), %%xmm6\n"
1985 "movups (%7), %%xmm7\n"
1986 "movaps %%xmm0, %%xmm8\n"
1987 "movaps %%xmm0, %%xmm9\n"
1988 "addps %%xmm1, %%xmm8\n"
1989 "subps %%xmm1, %%xmm9\n"
1990 "movaps %%xmm2, %%xmm10\n"
1991 "movaps %%xmm2, %%xmm11\n"
1992 "addps %%xmm3, %%xmm10\n"
1993 "subps %%xmm3, %%xmm11\n"
1994 "movaps %%xmm4, %%xmm12\n"
1995 "movaps %%xmm4, %%xmm13\n"
1996 "addps %%xmm5, %%xmm12\n"
1997 "subps %%xmm5, %%xmm13\n"
1998 "movaps %%xmm6, %%xmm14\n"
1999 "movaps %%xmm6, %%xmm15\n"
2000 "addps %%xmm7, %%xmm14\n"
2001 "subps %%xmm7, %%xmm15\n"
2002 "movaps %%xmm8, %%xmm0\n"
2003 "movaps %%xmm8, %%xmm2\n"
2004 "addps %%xmm10, %%xmm0\n"
2005 "subps %%xmm10, %%xmm2\n"
2006 "movaps %%xmm9, %%xmm1\n"
2007 "movaps %%xmm9, %%xmm3\n"
2008 "addps %%xmm11, %%xmm1\n"
2009 "subps %%xmm11, %%xmm3\n"
2010 "movaps %%xmm12, %%xmm4\n"
2011 "movaps %%xmm12, %%xmm6\n"
2012 "addps %%xmm14, %%xmm4\n"
2013 "subps %%xmm14, %%xmm6\n"
2014 "movaps %%xmm13, %%xmm5\n"
2015 "movaps %%xmm13, %%xmm7\n"
2016 "addps %%xmm15, %%xmm5\n"
2017 "subps %%xmm15, %%xmm7\n"
2018 "movaps %%xmm0, %%xmm8\n"
2019 "movaps %%xmm0, %%xmm12\n"
2020 "addps %%xmm4, %%xmm8\n"
2021 "subps %%xmm4, %%xmm12\n"
2022 "movaps %%xmm1, %%xmm9\n"
2023 "movaps %%xmm1, %%xmm13\n"
2024 "addps %%xmm5, %%xmm9\n"
2025 "subps %%xmm5, %%xmm13\n"
2026 "movaps %%xmm2, %%xmm10\n"
2027 "movaps %%xmm2, %%xmm14\n"
2028 "addps %%xmm6, %%xmm10\n"
2029 "subps %%xmm6, %%xmm14\n"
2030 "movaps %%xmm3, %%xmm11\n"
2031 "movaps %%xmm3, %%xmm15\n"
2032 "addps %%xmm7, %%xmm11\n"
2033 "subps %%xmm7, %%xmm15\n"
2034 "movups %%xmm8, (%0)\n"
2035 "movups %%xmm9, (%1)\n"
2036 "movups %%xmm10, (%2)\n"
2037 "movups %%xmm11, (%3)\n"
2038 "movups %%xmm12, (%4)\n"
2039 "movups %%xmm13, (%5)\n"
2040 "movups %%xmm14, (%6)\n"
2041 "movups %%xmm15, (%7)\n"
2042 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2043 );
2044 }
2045 }
2046 for (int j = 0; j < 2048; j += 2048) {
2047 for (int k = 0; k < 256; k += 4) {
2048 __asm__ volatile (
2049 "movups (%0), %%xmm0\n"
2050 "movups (%1), %%xmm1\n"
2051 "movups (%2), %%xmm2\n"
2052 "movups (%3), %%xmm3\n"
2053 "movups (%4), %%xmm4\n"
2054 "movups (%5), %%xmm5\n"
2055 "movups (%6), %%xmm6\n"
2056 "movups (%7), %%xmm7\n"
2057 "movaps %%xmm0, %%xmm8\n"
2058 "movaps %%xmm0, %%xmm9\n"
2059 "addps %%xmm1, %%xmm8\n"
2060 "subps %%xmm1, %%xmm9\n"
2061 "movaps %%xmm2, %%xmm10\n"
2062 "movaps %%xmm2, %%xmm11\n"
2063 "addps %%xmm3, %%xmm10\n"
2064 "subps %%xmm3, %%xmm11\n"
2065 "movaps %%xmm4, %%xmm12\n"
2066 "movaps %%xmm4, %%xmm13\n"
2067 "addps %%xmm5, %%xmm12\n"
2068 "subps %%xmm5, %%xmm13\n"
2069 "movaps %%xmm6, %%xmm14\n"
2070 "movaps %%xmm6, %%xmm15\n"
2071 "addps %%xmm7, %%xmm14\n"
2072 "subps %%xmm7, %%xmm15\n"
2073 "movaps %%xmm8, %%xmm0\n"
2074 "movaps %%xmm8, %%xmm2\n"
2075 "addps %%xmm10, %%xmm0\n"
2076 "subps %%xmm10, %%xmm2\n"
2077 "movaps %%xmm9, %%xmm1\n"
2078 "movaps %%xmm9, %%xmm3\n"
2079 "addps %%xmm11, %%xmm1\n"
2080 "subps %%xmm11, %%xmm3\n"
2081 "movaps %%xmm12, %%xmm4\n"
2082 "movaps %%xmm12, %%xmm6\n"
2083 "addps %%xmm14, %%xmm4\n"
2084 "subps %%xmm14, %%xmm6\n"
2085 "movaps %%xmm13, %%xmm5\n"
2086 "movaps %%xmm13, %%xmm7\n"
2087 "addps %%xmm15, %%xmm5\n"
2088 "subps %%xmm15, %%xmm7\n"
2089 "movaps %%xmm0, %%xmm8\n"
2090 "movaps %%xmm0, %%xmm12\n"
2091 "addps %%xmm4, %%xmm8\n"
2092 "subps %%xmm4, %%xmm12\n"
2093 "movaps %%xmm1, %%xmm9\n"
2094 "movaps %%xmm1, %%xmm13\n"
2095 "addps %%xmm5, %%xmm9\n"
2096 "subps %%xmm5, %%xmm13\n"
2097 "movaps %%xmm2, %%xmm10\n"
2098 "movaps %%xmm2, %%xmm14\n"
2099 "addps %%xmm6, %%xmm10\n"
2100 "subps %%xmm6, %%xmm14\n"
2101 "movaps %%xmm3, %%xmm11\n"
2102 "movaps %%xmm3, %%xmm15\n"
2103 "addps %%xmm7, %%xmm11\n"
2104 "subps %%xmm7, %%xmm15\n"
2105 "movups %%xmm8, (%0)\n"
2106 "movups %%xmm9, (%1)\n"
2107 "movups %%xmm10, (%2)\n"
2108 "movups %%xmm11, (%3)\n"
2109 "movups %%xmm12, (%4)\n"
2110 "movups %%xmm13, (%5)\n"
2111 "movups %%xmm14, (%6)\n"
2112 "movups %%xmm15, (%7)\n"
2113 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
2114 );
2115 }
2116 }
2117 }
void helper_float_12_recursive(float *buf, int depth);
/*
 * In-place, unnormalized Walsh-Hadamard transform of 2^depth floats.
 *
 * Only the depths reachable from helper_float_12 are handled:
 *   depth == 7  : base case, transforms 128 floats with SSE kernels;
 *   depth == 10 : eight depth-7 sub-transforms + one radix-8 combine pass;
 *   depth == 12 : four depth-10 sub-transforms + one radix-4 combine pass.
 * Any other depth is silently a no-op (generated code; callers never pass one).
 *
 * buf may be unaligned: all memory accesses use movups; movaps appears only
 * register-to-register.
 * NOTE(review): addsubps is SSE3, so this requires an SSE3-capable x86-64
 * CPU at run time.
 */
void helper_float_12_recursive(float *buf, int depth) {
    if (depth == 7) {
        /* Stages 1..5: each iteration transforms one 32-float chunk held in
           xmm0..xmm7.  Per register, the first shufps/addsubps group performs
           the size-2 butterflies inside the 4-lane vector, the second shufps
           group the size-4 butterflies; the add/sub network across the eight
           registers then does the size-8/16/32 stages. */
        for (int j = 0; j < 128; j += 32) {
            for (int k = 0; k < 4; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    /* size-2 butterflies inside each vector */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm0, %%xmm0\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm0, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm1, %%xmm1\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm1, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm2, %%xmm2\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm2, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm3, %%xmm3\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm3, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm4, %%xmm4\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm4, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm5, %%xmm5\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm5, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm6, %%xmm6\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm6, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm7, %%xmm7\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm7, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm7\n"
                    /* size-4 butterflies inside each vector */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm0, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm0, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm1, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm1, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm3, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm4, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm4, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm5, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm5, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm6, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm6, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm7, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm7, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm7\n"
                    /* size-8/16/32 butterflies across the eight vectors */
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Stages 6..7: radix-4 combine of the four 32-float chunks. */
        for (int j = 0; j < 128; j += 128) {
            for (int k = 0; k < 32; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movups %%xmm0, (%0)\n"
                    "movups %%xmm1, (%1)\n"
                    "movups %%xmm2, (%2)\n"
                    "movups %%xmm3, (%3)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 10) {
        /* Transform eight 128-float sub-blocks, then do the size-256/512/1024
           stages with one radix-8 combine pass. */
        helper_float_12_recursive(buf + 0, 7);
        helper_float_12_recursive(buf + 128, 7);
        helper_float_12_recursive(buf + 256, 7);
        helper_float_12_recursive(buf + 384, 7);
        helper_float_12_recursive(buf + 512, 7);
        helper_float_12_recursive(buf + 640, 7);
        helper_float_12_recursive(buf + 768, 7);
        helper_float_12_recursive(buf + 896, 7);
        for (int j = 0; j < 1024; j += 1024) {
            for (int k = 0; k < 128; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 12) {
        /* Transform four 1024-float sub-blocks, then do the size-2048/4096
           stages with one radix-4 combine pass. */
        helper_float_12_recursive(buf + 0, 10);
        helper_float_12_recursive(buf + 1024, 10);
        helper_float_12_recursive(buf + 2048, 10);
        helper_float_12_recursive(buf + 3072, 10);
        for (int j = 0; j < 4096; j += 4096) {
            for (int k = 0; k < 1024; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movups %%xmm0, (%0)\n"
                    "movups %%xmm1, (%1)\n"
                    "movups %%xmm2, (%2)\n"
                    "movups %%xmm3, (%3)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
}
void helper_float_12(float *buf);
/* In-place unnormalized Walsh-Hadamard transform of 4096 (= 2^12) floats. */
void helper_float_12(float *buf) {
    helper_float_12_recursive(buf, 12);
}
void helper_float_13_recursive(float *buf, int depth);
/*
 * In-place, unnormalized Walsh-Hadamard transform of 2^depth floats.
 *
 * Only the depths reachable from helper_float_13 are handled:
 *   depth == 11 : base case, transforms 2048 floats with three SSE passes;
 *   depth == 13 : four depth-11 sub-transforms + one radix-4 combine pass.
 * Any other depth is silently a no-op (generated code; callers never pass one).
 *
 * buf may be unaligned: all memory accesses use movups; movaps appears only
 * register-to-register.
 * NOTE(review): addsubps is SSE3, so this requires an SSE3-capable x86-64
 * CPU at run time.
 */
void helper_float_13_recursive(float *buf, int depth) {
    if (depth == 11) {
        /* Stages 1..5: each iteration transforms one 32-float chunk held in
           xmm0..xmm7.  Per register, the first shufps/addsubps group performs
           the size-2 butterflies inside the 4-lane vector, the second shufps
           group the size-4 butterflies; the add/sub network across the eight
           registers then does the size-8/16/32 stages. */
        for (int j = 0; j < 2048; j += 32) {
            for (int k = 0; k < 4; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    /* size-2 butterflies inside each vector */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm0, %%xmm0\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm0, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm1, %%xmm1\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm1, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm2, %%xmm2\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm2, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm3, %%xmm3\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm3, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm4, %%xmm4\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm4, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm5, %%xmm5\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm5, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm6, %%xmm6\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm6, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm7, %%xmm7\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm7, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm7\n"
                    /* size-4 butterflies inside each vector */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm0, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm0, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm1, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm1, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm3, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm4, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm4, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm5, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm5, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm6, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm6, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm7, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm7, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm7\n"
                    /* size-8/16/32 butterflies across the eight vectors */
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Stages 6..8: radix-8 combine of the 32-float chunks within each
           256-float block. */
        for (int j = 0; j < 2048; j += 256) {
            for (int k = 0; k < 32; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Stages 9..11: radix-8 combine of the 256-float blocks. */
        for (int j = 0; j < 2048; j += 2048) {
            for (int k = 0; k < 256; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 13) {
        /* Transform four 2048-float sub-blocks, then do the size-4096/8192
           stages with one radix-4 combine pass. */
        helper_float_13_recursive(buf + 0, 11);
        helper_float_13_recursive(buf + 2048, 11);
        helper_float_13_recursive(buf + 4096, 11);
        helper_float_13_recursive(buf + 6144, 11);
        for (int j = 0; j < 8192; j += 8192) {
            for (int k = 0; k < 2048; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movups %%xmm0, (%0)\n"
                    "movups %%xmm1, (%1)\n"
                    "movups %%xmm2, (%2)\n"
                    "movups %%xmm3, (%3)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
}
void helper_float_13(float *buf);
/* Full in-place transform of a 2^13 = 8192-float buffer (presumably an
 * unnormalized fast Hadamard transform, per fht.h -- TODO confirm).
 * Thin entry point over the depth-specialized recursive worker. */
void helper_float_13(float *buf) {
  helper_float_13_recursive(buf, 13);
}
void helper_float_14_recursive(float *buf, int depth);
/*
 * One depth-specialized pass group of an in-place float transform
 * (presumably an unnormalized fast Hadamard transform, per fht.h --
 * TODO confirm).  `depth` selects which specialized code path runs:
 *
 *   depth == 11 : fully processes each 2048-float span with three SSE
 *                 passes -- a 32-wide pass doing five butterfly levels
 *                 at once (sizes 2 and 4 inside each register, then
 *                 8/16/32 across eight registers), followed by two
 *                 radix-8 add/sub passes at strides 32 and 256.
 *   depth == 14 : recurses into eight 2048-float sub-blocks at depth 11,
 *                 then combines them with one radix-8 pass at stride 2048.
 *
 * Any other depth value is a silent no-op (falls off the end).
 * All loads/stores use movups, so no alignment of `buf` is required.
 * Uses SSE3 (addsubps); every xmm register is clobbered.
 */
void helper_float_14_recursive(float *buf, int depth) {
  if (depth == 11) {
    /* Pass 1: five butterfly levels per 32-float chunk. */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* size-2 butterflies within each register, one register at a
             time (shufps pairs lanes; addsubps forms sums/differences). */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /* size-4 butterflies within each register (combine lane pairs
             (0,1) with (2,3) via shuffles against a zero register). */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /* three add/sub butterfly levels across the eight registers
             (sizes 8, 16, 32). */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: radix-8 add/sub combine at stride 32 (three more levels). */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: radix-8 combine at stride 256 (final three levels of the
       2048-float span). */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Transform the eight 2048-float sub-blocks, then merge them with a
       single radix-8 pass at stride 2048 (levels 12-14). */
    helper_float_14_recursive(buf + 0, 11);
    helper_float_14_recursive(buf + 2048, 11);
    helper_float_14_recursive(buf + 4096, 11);
    helper_float_14_recursive(buf + 6144, 11);
    helper_float_14_recursive(buf + 8192, 11);
    helper_float_14_recursive(buf + 10240, 11);
    helper_float_14_recursive(buf + 12288, 11);
    helper_float_14_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_14(float *buf);
/* Full in-place transform of a 2^14 = 16384-float buffer (presumably an
 * unnormalized fast Hadamard transform, per fht.h -- TODO confirm).
 * Thin entry point over the depth-specialized recursive worker. */
void helper_float_14(float *buf) {
  helper_float_14_recursive(buf, 14);
}
void helper_float_15_recursive(float *buf, int depth);
/*
 * One depth-specialized pass group of an in-place float transform
 * (presumably an unnormalized fast Hadamard transform, per fht.h --
 * TODO confirm).  `depth` selects which specialized code path runs:
 *
 *   depth == 13 : fully processes each 8192-float span with four SSE
 *                 passes -- a 32-wide pass doing five butterfly levels
 *                 at once (sizes 2 and 4 inside each register, then
 *                 8/16/32 across eight registers), two radix-8 add/sub
 *                 passes at strides 32 and 256, and a final radix-4
 *                 pass at stride 2048.
 *   depth == 15 : recurses into four 8192-float sub-blocks at depth 13,
 *                 then combines them with one radix-4 pass at stride 8192.
 *
 * Any other depth value is a silent no-op (falls off the end).
 * All loads/stores use movups, so no alignment of `buf` is required.
 * Uses SSE3 (addsubps); every xmm register is clobbered.
 */
void helper_float_15_recursive(float *buf, int depth) {
  if (depth == 13) {
    /* Pass 1: five butterfly levels per 32-float chunk. */
    for (int j = 0; j < 8192; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          /* size-2 butterflies within each register. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm0, %%xmm0\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm0, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm1, %%xmm1\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm1, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm2, %%xmm2\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm2, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm3, %%xmm3\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm3, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm4, %%xmm4\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm4, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm5, %%xmm5\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm5, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm6, %%xmm6\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm6, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $160, %%xmm8, %%xmm8\n"
          "shufps $245, %%xmm7, %%xmm7\n"
          "xorps %%xmm9, %%xmm9\n"
          "subps %%xmm7, %%xmm9\n"
          "addsubps %%xmm9, %%xmm8\n"
          "movaps %%xmm8, %%xmm7\n"
          /* size-4 butterflies within each register. */
          "movaps %%xmm0, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm0, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm0, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm0\n"
          "movaps %%xmm1, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm1, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm1, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm1\n"
          "movaps %%xmm2, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm2\n"
          "movaps %%xmm3, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm3, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm3, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm3\n"
          "movaps %%xmm4, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm4, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm4, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm4\n"
          "movaps %%xmm5, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm5, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm5, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm5\n"
          "movaps %%xmm6, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm6, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm6, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm6\n"
          "movaps %%xmm7, %%xmm8\n"
          "shufps $68, %%xmm8, %%xmm8\n"
          "xorps %%xmm9, %%xmm9\n"
          "movaps %%xmm7, %%xmm10\n"
          "shufps $14, %%xmm9, %%xmm10\n"
          "movaps %%xmm7, %%xmm11\n"
          "shufps $224, %%xmm11, %%xmm9\n"
          "addps %%xmm8, %%xmm10\n"
          "subps %%xmm9, %%xmm10\n"
          "movaps %%xmm10, %%xmm7\n"
          /* three add/sub levels across registers (sizes 8, 16, 32). */
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: radix-8 add/sub combine at stride 32 (three more levels). */
    for (int j = 0; j < 8192; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: radix-8 combine at stride 256 (three more levels). */
    for (int j = 0; j < 8192; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movups (%4), %%xmm4\n"
          "movups (%5), %%xmm5\n"
          "movups (%6), %%xmm6\n"
          "movups (%7), %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm4, %%xmm12\n"
          "movaps %%xmm4, %%xmm13\n"
          "addps %%xmm5, %%xmm12\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm6, %%xmm14\n"
          "movaps %%xmm6, %%xmm15\n"
          "addps %%xmm7, %%xmm14\n"
          "subps %%xmm7, %%xmm15\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movaps %%xmm12, %%xmm4\n"
          "movaps %%xmm12, %%xmm6\n"
          "addps %%xmm14, %%xmm4\n"
          "subps %%xmm14, %%xmm6\n"
          "movaps %%xmm13, %%xmm5\n"
          "movaps %%xmm13, %%xmm7\n"
          "addps %%xmm15, %%xmm5\n"
          "subps %%xmm15, %%xmm7\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm12\n"
          "addps %%xmm4, %%xmm8\n"
          "subps %%xmm4, %%xmm12\n"
          "movaps %%xmm1, %%xmm9\n"
          "movaps %%xmm1, %%xmm13\n"
          "addps %%xmm5, %%xmm9\n"
          "subps %%xmm5, %%xmm13\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm14\n"
          "addps %%xmm6, %%xmm10\n"
          "subps %%xmm6, %%xmm14\n"
          "movaps %%xmm3, %%xmm11\n"
          "movaps %%xmm3, %%xmm15\n"
          "addps %%xmm7, %%xmm11\n"
          "subps %%xmm7, %%xmm15\n"
          "movups %%xmm8, (%0)\n"
          "movups %%xmm9, (%1)\n"
          "movups %%xmm10, (%2)\n"
          "movups %%xmm11, (%3)\n"
          "movups %%xmm12, (%4)\n"
          "movups %%xmm13, (%5)\n"
          "movups %%xmm14, (%6)\n"
          "movups %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4: radix-4 combine at stride 2048 (final two levels of the
       8192-float span). */
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* Transform the four 8192-float sub-blocks, then merge them with a
       single radix-4 pass at stride 8192 (levels 14-15). */
    helper_float_15_recursive(buf + 0, 13);
    helper_float_15_recursive(buf + 8192, 13);
    helper_float_15_recursive(buf + 16384, 13);
    helper_float_15_recursive(buf + 24576, 13);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "movups (%0), %%xmm0\n"
          "movups (%1), %%xmm1\n"
          "movups (%2), %%xmm2\n"
          "movups (%3), %%xmm3\n"
          "movaps %%xmm0, %%xmm8\n"
          "movaps %%xmm0, %%xmm9\n"
          "addps %%xmm1, %%xmm8\n"
          "subps %%xmm1, %%xmm9\n"
          "movaps %%xmm2, %%xmm10\n"
          "movaps %%xmm2, %%xmm11\n"
          "addps %%xmm3, %%xmm10\n"
          "subps %%xmm3, %%xmm11\n"
          "movaps %%xmm8, %%xmm0\n"
          "movaps %%xmm8, %%xmm2\n"
          "addps %%xmm10, %%xmm0\n"
          "subps %%xmm10, %%xmm2\n"
          "movaps %%xmm9, %%xmm1\n"
          "movaps %%xmm9, %%xmm3\n"
          "addps %%xmm11, %%xmm1\n"
          "subps %%xmm11, %%xmm3\n"
          "movups %%xmm0, (%0)\n"
          "movups %%xmm1, (%1)\n"
          "movups %%xmm2, (%2)\n"
          "movups %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_15(float *buf);
/* Full in-place transform of a 2^15 = 32768-float buffer (presumably an
 * unnormalized fast Hadamard transform, per fht.h -- TODO confirm).
 * Thin entry point over the depth-specialized recursive worker. */
void helper_float_15(float *buf) {
  helper_float_15_recursive(buf, 15);
}
void helper_float_16_recursive(float *buf, int depth);
/*
 * Recursive worker for helper_float_16: transforms 2^depth consecutive
 * floats at buf in place (per the fht.h naming this is presumably a fast
 * Walsh-Hadamard transform kernel -- TODO confirm against fht.h).
 *
 * Only the depths reachable from helper_float_16 are handled (11, 14, 16);
 * any other depth falls through all the branches and is a silent no-op.
 *
 * The inline asm uses SSE3:
 *  - the shufps/addsubps sections combine elements *within* one 4-float
 *    vector (the stride-1 and stride-2 butterfly layers),
 *  - the addps/subps sections combine whole 4-float vectors loaded from
 *    strided addresses: three add/sub layers over eight pointers form a
 *    radix-8 combine, two layers over four pointers a radix-4 combine.
 * Every xmm register is listed as clobbered, and "memory" is clobbered
 * because the asm both reads and writes buf through the "r" operands.
 */
void helper_float_16_recursive(float *buf, int depth) {
    if (depth == 11) {
        /* Base case: 2048 floats.  Pass 1 transforms each 32-float chunk:
         * in-vector butterflies (strides 1,2) followed by a radix-8
         * combine of the eight 4-float vectors at offsets 0..28. */
        for (int j = 0; j < 2048; j += 32) {
            for (int k = 0; k < 4; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    /* stride-1 butterfly inside each vector (xmm0..xmm7) */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm0, %%xmm0\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm0, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm1, %%xmm1\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm1, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm2, %%xmm2\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm2, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm3, %%xmm3\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm3, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm4, %%xmm4\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm4, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm5, %%xmm5\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm5, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm6, %%xmm6\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm6, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm7, %%xmm7\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm7, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm7\n"
                    /* stride-2 butterfly inside each vector (xmm0..xmm7) */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm0, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm0, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm1, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm1, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm3, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm4, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm4, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm5, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm5, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm6, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm6, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm7, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm7, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm7\n"
                    /* radix-8 combine across the eight vectors:
                     * three add/sub layers (0±1, 2±3, ... then pairs of
                     * pairs, then quads), results land in xmm8..xmm15. */
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 2: radix-8 combine at stride 32 -> 256-float blocks. */
        for (int j = 0; j < 2048; j += 256) {
            for (int k = 0; k < 32; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 3: radix-8 combine at stride 256 -> the full 2048 floats. */
        for (int j = 0; j < 2048; j += 2048) {
            for (int k = 0; k < 256; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 14) {
        /* 2^14 = 8 sub-transforms of 2^11 plus one radix-8 combine
         * at stride 2048. */
        helper_float_16_recursive(buf + 0, 11);
        helper_float_16_recursive(buf + 2048, 11);
        helper_float_16_recursive(buf + 4096, 11);
        helper_float_16_recursive(buf + 6144, 11);
        helper_float_16_recursive(buf + 8192, 11);
        helper_float_16_recursive(buf + 10240, 11);
        helper_float_16_recursive(buf + 12288, 11);
        helper_float_16_recursive(buf + 14336, 11);
        for (int j = 0; j < 16384; j += 16384) {
            for (int k = 0; k < 2048; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 16) {
        /* 2^16 = 4 sub-transforms of 2^14 plus one radix-4 combine
         * (two add/sub layers) at stride 16384. */
        helper_float_16_recursive(buf + 0, 14);
        helper_float_16_recursive(buf + 16384, 14);
        helper_float_16_recursive(buf + 32768, 14);
        helper_float_16_recursive(buf + 49152, 14);
        for (int j = 0; j < 65536; j += 65536) {
            for (int k = 0; k < 16384; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movups %%xmm0, (%0)\n"
                    "movups %%xmm1, (%1)\n"
                    "movups %%xmm2, (%2)\n"
                    "movups %%xmm3, (%3)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
}
void helper_float_16(float *buf);
/*
 * Public entry point for the size-2^16 (65536-float) transform.
 * Delegates to the recursive worker at full depth; buf is transformed
 * in place.
 */
void helper_float_16(float *buf) {
    enum { LOG2_N = 16 };  /* transform length is 1 << LOG2_N floats */
    helper_float_16_recursive(buf, LOG2_N);
}
void helper_float_17_recursive(float *buf, int depth);
/*
 * Recursive worker for helper_float_17: transforms 2^depth consecutive
 * floats at buf in place (per the fht.h naming this is presumably a fast
 * Walsh-Hadamard transform kernel -- TODO confirm against fht.h).
 *
 * Only the depths reachable from helper_float_17 are handled (11, 14, 17);
 * any other depth falls through all the branches and is a silent no-op.
 * The depth-11 and depth-14 branches are byte-for-byte the same kernels as
 * in helper_float_16_recursive (generated code, duplicated per size).
 *
 * The inline asm uses SSE3:
 *  - the shufps/addsubps sections combine elements *within* one 4-float
 *    vector (the stride-1 and stride-2 butterfly layers),
 *  - the addps/subps sections combine whole 4-float vectors loaded from
 *    eight strided addresses: three add/sub layers = a radix-8 combine.
 * Every xmm register is listed as clobbered, and "memory" is clobbered
 * because the asm both reads and writes buf through the "r" operands.
 */
void helper_float_17_recursive(float *buf, int depth) {
    if (depth == 11) {
        /* Base case: 2048 floats.  Pass 1 transforms each 32-float chunk:
         * in-vector butterflies (strides 1,2) followed by a radix-8
         * combine of the eight 4-float vectors at offsets 0..28. */
        for (int j = 0; j < 2048; j += 32) {
            for (int k = 0; k < 4; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    /* stride-1 butterfly inside each vector (xmm0..xmm7) */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm0, %%xmm0\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm0, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm1, %%xmm1\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm1, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm2, %%xmm2\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm2, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm3, %%xmm3\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm3, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm4, %%xmm4\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm4, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm5, %%xmm5\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm5, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm6, %%xmm6\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm6, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $160, %%xmm8, %%xmm8\n"
                    "shufps $245, %%xmm7, %%xmm7\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "subps %%xmm7, %%xmm9\n"
                    "addsubps %%xmm9, %%xmm8\n"
                    "movaps %%xmm8, %%xmm7\n"
                    /* stride-2 butterfly inside each vector (xmm0..xmm7) */
                    "movaps %%xmm0, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm0, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm0, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm0\n"
                    "movaps %%xmm1, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm1, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm1, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm1\n"
                    "movaps %%xmm2, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm2\n"
                    "movaps %%xmm3, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm3, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm3\n"
                    "movaps %%xmm4, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm4, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm4, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm4\n"
                    "movaps %%xmm5, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm5, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm5, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm5\n"
                    "movaps %%xmm6, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm6, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm6, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm6\n"
                    "movaps %%xmm7, %%xmm8\n"
                    "shufps $68, %%xmm8, %%xmm8\n"
                    "xorps %%xmm9, %%xmm9\n"
                    "movaps %%xmm7, %%xmm10\n"
                    "shufps $14, %%xmm9, %%xmm10\n"
                    "movaps %%xmm7, %%xmm11\n"
                    "shufps $224, %%xmm11, %%xmm9\n"
                    "addps %%xmm8, %%xmm10\n"
                    "subps %%xmm9, %%xmm10\n"
                    "movaps %%xmm10, %%xmm7\n"
                    /* radix-8 combine across the eight vectors (three
                     * add/sub layers), results land in xmm8..xmm15. */
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 2: radix-8 combine at stride 32 -> 256-float blocks. */
        for (int j = 0; j < 2048; j += 256) {
            for (int k = 0; k < 32; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 3: radix-8 combine at stride 256 -> the full 2048 floats. */
        for (int j = 0; j < 2048; j += 2048) {
            for (int k = 0; k < 256; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 14) {
        /* 2^14 = 8 sub-transforms of 2^11 plus one radix-8 combine
         * at stride 2048. */
        helper_float_17_recursive(buf + 0, 11);
        helper_float_17_recursive(buf + 2048, 11);
        helper_float_17_recursive(buf + 4096, 11);
        helper_float_17_recursive(buf + 6144, 11);
        helper_float_17_recursive(buf + 8192, 11);
        helper_float_17_recursive(buf + 10240, 11);
        helper_float_17_recursive(buf + 12288, 11);
        helper_float_17_recursive(buf + 14336, 11);
        for (int j = 0; j < 16384; j += 16384) {
            for (int k = 0; k < 2048; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 17) {
        /* 2^17 = 8 sub-transforms of 2^14 plus one radix-8 combine
         * at stride 16384. */
        helper_float_17_recursive(buf + 0, 14);
        helper_float_17_recursive(buf + 16384, 14);
        helper_float_17_recursive(buf + 32768, 14);
        helper_float_17_recursive(buf + 49152, 14);
        helper_float_17_recursive(buf + 65536, 14);
        helper_float_17_recursive(buf + 81920, 14);
        helper_float_17_recursive(buf + 98304, 14);
        helper_float_17_recursive(buf + 114688, 14);
        for (int j = 0; j < 131072; j += 131072) {
            for (int k = 0; k < 16384; k += 4) {
                __asm__ volatile (
                    "movups (%0), %%xmm0\n"
                    "movups (%1), %%xmm1\n"
                    "movups (%2), %%xmm2\n"
                    "movups (%3), %%xmm3\n"
                    "movups (%4), %%xmm4\n"
                    "movups (%5), %%xmm5\n"
                    "movups (%6), %%xmm6\n"
                    "movups (%7), %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm9\n"
                    "addps %%xmm1, %%xmm8\n"
                    "subps %%xmm1, %%xmm9\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm11\n"
                    "addps %%xmm3, %%xmm10\n"
                    "subps %%xmm3, %%xmm11\n"
                    "movaps %%xmm4, %%xmm12\n"
                    "movaps %%xmm4, %%xmm13\n"
                    "addps %%xmm5, %%xmm12\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm6, %%xmm14\n"
                    "movaps %%xmm6, %%xmm15\n"
                    "addps %%xmm7, %%xmm14\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movaps %%xmm8, %%xmm0\n"
                    "movaps %%xmm8, %%xmm2\n"
                    "addps %%xmm10, %%xmm0\n"
                    "subps %%xmm10, %%xmm2\n"
                    "movaps %%xmm9, %%xmm1\n"
                    "movaps %%xmm9, %%xmm3\n"
                    "addps %%xmm11, %%xmm1\n"
                    "subps %%xmm11, %%xmm3\n"
                    "movaps %%xmm12, %%xmm4\n"
                    "movaps %%xmm12, %%xmm6\n"
                    "addps %%xmm14, %%xmm4\n"
                    "subps %%xmm14, %%xmm6\n"
                    "movaps %%xmm13, %%xmm5\n"
                    "movaps %%xmm13, %%xmm7\n"
                    "addps %%xmm15, %%xmm5\n"
                    "subps %%xmm15, %%xmm7\n"
                    "movaps %%xmm0, %%xmm8\n"
                    "movaps %%xmm0, %%xmm12\n"
                    "addps %%xmm4, %%xmm8\n"
                    "subps %%xmm4, %%xmm12\n"
                    "movaps %%xmm1, %%xmm9\n"
                    "movaps %%xmm1, %%xmm13\n"
                    "addps %%xmm5, %%xmm9\n"
                    "subps %%xmm5, %%xmm13\n"
                    "movaps %%xmm2, %%xmm10\n"
                    "movaps %%xmm2, %%xmm14\n"
                    "addps %%xmm6, %%xmm10\n"
                    "subps %%xmm6, %%xmm14\n"
                    "movaps %%xmm3, %%xmm11\n"
                    "movaps %%xmm3, %%xmm15\n"
                    "addps %%xmm7, %%xmm11\n"
                    "subps %%xmm7, %%xmm15\n"
                    "movups %%xmm8, (%0)\n"
                    "movups %%xmm9, (%1)\n"
                    "movups %%xmm10, (%2)\n"
                    "movups %%xmm11, (%3)\n"
                    "movups %%xmm12, (%4)\n"
                    "movups %%xmm13, (%5)\n"
                    "movups %%xmm14, (%6)\n"
                    "movups %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
}
void helper_float_17(float *buf);
/* Public entry point for the 2^17-point transform: delegates to the
 * recursive worker at full depth. buf must hold 131072 floats
 * (implied by the offsets used in helper_float_17_recursive). */
void helper_float_17(float *buf) {
helper_float_17_recursive(buf, 17);
}
void helper_float_18_recursive(float *buf, int depth);
/*
 * Recursive stage of a 2^18-point in-place butterfly transform over floats
 * (machine-generated SSE/SSE3 code; the add/sub butterfly pattern matches a
 * fast Walsh–Hadamard-style transform — NOTE(review): confirm exact transform
 * semantics against fht.h).
 *
 * depth selects which span of the decomposition this call handles:
 *   depth == 13 : fully unrolled base case over 8192 consecutive floats.
 *   depth == 16 : eight recursive depth-13 sub-blocks, then 3 combining levels.
 *   depth == 18 : four recursive depth-16 sub-blocks, then 2 combining levels.
 * Any other depth value falls through and does nothing.
 *
 * All asm blocks read/write buf in place through input-only pointer operands,
 * and declare every %xmm register plus "memory" as clobbered.
 */
void helper_float_18_recursive(float *buf, int depth) {
if (depth == 13) {
/* Base case: 13 butterfly levels over each run of 8192 floats, done as four
 * passes. Pass 1 below fuses the intra-vector levels (pair and quad
 * butterflies via shufps/addsubps) with an 8-vector combine at element
 * offsets 4..28, covering strides up to 32. */
for (int j = 0; j < 8192; j += 32) {
for (int k = 0; k < 4; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
/* Intra-vector pair butterflies (stride 1) on xmm0..xmm7:
 * duplicate evens, negate odds, then addsubps forms (a+b, a-b). */
"movaps %%xmm0, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm0, %%xmm0\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm0, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm1, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm1, %%xmm1\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm1, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm1\n"
"movaps %%xmm2, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm2, %%xmm2\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm2, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm2\n"
"movaps %%xmm3, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm3, %%xmm3\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm3, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm3\n"
"movaps %%xmm4, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm4, %%xmm4\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm4, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm4\n"
"movaps %%xmm5, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm5, %%xmm5\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm5, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm5\n"
"movaps %%xmm6, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm6, %%xmm6\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm6, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm6\n"
"movaps %%xmm7, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm7, %%xmm7\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm7, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm7\n"
/* Intra-vector quad butterflies (stride 2) on xmm0..xmm7:
 * combine low and high halves of each 4-float vector. */
"movaps %%xmm0, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm0, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm0, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm0\n"
"movaps %%xmm1, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm1, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm1, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm1\n"
"movaps %%xmm2, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm2\n"
"movaps %%xmm3, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm3, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm3, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm3\n"
"movaps %%xmm4, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm4, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm4, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm4\n"
"movaps %%xmm5, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm5, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm5, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm5\n"
"movaps %%xmm6, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm6, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm6, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm6\n"
"movaps %%xmm7, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm7, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm7, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm7\n"
/* Cross-vector 8-way combine: three butterfly levels across the eight
 * loaded vectors (pairs, then pairs-of-pairs, then all eight). */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
/* Pass 2: 8-way combine at element stride 32 (three more levels,
 * spans up to 256 elements). */
for (int j = 0; j < 8192; j += 256) {
for (int k = 0; k < 32; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
/* Pass 3: 8-way combine at element stride 256 (spans up to 2048). */
for (int j = 0; j < 8192; j += 2048) {
for (int k = 0; k < 256; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
/* Pass 4: final 4-way combine at element stride 2048 (two levels,
 * completing the 8192-element base case). */
for (int j = 0; j < 8192; j += 8192) {
for (int k = 0; k < 2048; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movups %%xmm0, (%0)\n"
"movups %%xmm1, (%1)\n"
"movups %%xmm2, (%2)\n"
"movups %%xmm3, (%3)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
if (depth == 16) {
/* Transform each of the eight 8192-float sub-blocks, then run three more
 * butterfly levels across them (8-way combine at stride 8192). */
helper_float_18_recursive(buf + 0, 13);
helper_float_18_recursive(buf + 8192, 13);
helper_float_18_recursive(buf + 16384, 13);
helper_float_18_recursive(buf + 24576, 13);
helper_float_18_recursive(buf + 32768, 13);
helper_float_18_recursive(buf + 40960, 13);
helper_float_18_recursive(buf + 49152, 13);
helper_float_18_recursive(buf + 57344, 13);
for (int j = 0; j < 65536; j += 65536) {
for (int k = 0; k < 8192; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
if (depth == 18) {
/* Top level: transform each of the four 65536-float quarters, then two
 * final butterfly levels across them (4-way combine at stride 65536). */
helper_float_18_recursive(buf + 0, 16);
helper_float_18_recursive(buf + 65536, 16);
helper_float_18_recursive(buf + 131072, 16);
helper_float_18_recursive(buf + 196608, 16);
for (int j = 0; j < 262144; j += 262144) {
for (int k = 0; k < 65536; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movups %%xmm0, (%0)\n"
"movups %%xmm1, (%1)\n"
"movups %%xmm2, (%2)\n"
"movups %%xmm3, (%3)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
}
void helper_float_18(float *buf);
/* Public entry point for the 2^18-point transform: delegates to the
 * recursive worker at full depth. buf must hold 262144 floats
 * (implied by the offsets used in helper_float_18_recursive). */
void helper_float_18(float *buf) {
helper_float_18_recursive(buf, 18);
}
5264 void helper_float_19_recursive(float *buf, int depth);
helper_float_19_recursive(float * buf,int depth)5265 void helper_float_19_recursive(float *buf, int depth) {
5266 if (depth == 13) {
5267 for (int j = 0; j < 8192; j += 32) {
5268 for (int k = 0; k < 4; k += 4) {
5269 __asm__ volatile (
5270 "movups (%0), %%xmm0\n"
5271 "movups (%1), %%xmm1\n"
5272 "movups (%2), %%xmm2\n"
5273 "movups (%3), %%xmm3\n"
5274 "movups (%4), %%xmm4\n"
5275 "movups (%5), %%xmm5\n"
5276 "movups (%6), %%xmm6\n"
5277 "movups (%7), %%xmm7\n"
5278 "movaps %%xmm0, %%xmm8\n"
5279 "shufps $160, %%xmm8, %%xmm8\n"
5280 "shufps $245, %%xmm0, %%xmm0\n"
5281 "xorps %%xmm9, %%xmm9\n"
5282 "subps %%xmm0, %%xmm9\n"
5283 "addsubps %%xmm9, %%xmm8\n"
5284 "movaps %%xmm8, %%xmm0\n"
5285 "movaps %%xmm1, %%xmm8\n"
5286 "shufps $160, %%xmm8, %%xmm8\n"
5287 "shufps $245, %%xmm1, %%xmm1\n"
5288 "xorps %%xmm9, %%xmm9\n"
5289 "subps %%xmm1, %%xmm9\n"
5290 "addsubps %%xmm9, %%xmm8\n"
5291 "movaps %%xmm8, %%xmm1\n"
5292 "movaps %%xmm2, %%xmm8\n"
5293 "shufps $160, %%xmm8, %%xmm8\n"
5294 "shufps $245, %%xmm2, %%xmm2\n"
5295 "xorps %%xmm9, %%xmm9\n"
5296 "subps %%xmm2, %%xmm9\n"
5297 "addsubps %%xmm9, %%xmm8\n"
5298 "movaps %%xmm8, %%xmm2\n"
5299 "movaps %%xmm3, %%xmm8\n"
5300 "shufps $160, %%xmm8, %%xmm8\n"
5301 "shufps $245, %%xmm3, %%xmm3\n"
5302 "xorps %%xmm9, %%xmm9\n"
5303 "subps %%xmm3, %%xmm9\n"
5304 "addsubps %%xmm9, %%xmm8\n"
5305 "movaps %%xmm8, %%xmm3\n"
5306 "movaps %%xmm4, %%xmm8\n"
5307 "shufps $160, %%xmm8, %%xmm8\n"
5308 "shufps $245, %%xmm4, %%xmm4\n"
5309 "xorps %%xmm9, %%xmm9\n"
5310 "subps %%xmm4, %%xmm9\n"
5311 "addsubps %%xmm9, %%xmm8\n"
5312 "movaps %%xmm8, %%xmm4\n"
5313 "movaps %%xmm5, %%xmm8\n"
5314 "shufps $160, %%xmm8, %%xmm8\n"
5315 "shufps $245, %%xmm5, %%xmm5\n"
5316 "xorps %%xmm9, %%xmm9\n"
5317 "subps %%xmm5, %%xmm9\n"
5318 "addsubps %%xmm9, %%xmm8\n"
5319 "movaps %%xmm8, %%xmm5\n"
5320 "movaps %%xmm6, %%xmm8\n"
5321 "shufps $160, %%xmm8, %%xmm8\n"
5322 "shufps $245, %%xmm6, %%xmm6\n"
5323 "xorps %%xmm9, %%xmm9\n"
5324 "subps %%xmm6, %%xmm9\n"
5325 "addsubps %%xmm9, %%xmm8\n"
5326 "movaps %%xmm8, %%xmm6\n"
5327 "movaps %%xmm7, %%xmm8\n"
5328 "shufps $160, %%xmm8, %%xmm8\n"
5329 "shufps $245, %%xmm7, %%xmm7\n"
5330 "xorps %%xmm9, %%xmm9\n"
5331 "subps %%xmm7, %%xmm9\n"
5332 "addsubps %%xmm9, %%xmm8\n"
5333 "movaps %%xmm8, %%xmm7\n"
5334 "movaps %%xmm0, %%xmm8\n"
5335 "shufps $68, %%xmm8, %%xmm8\n"
5336 "xorps %%xmm9, %%xmm9\n"
5337 "movaps %%xmm0, %%xmm10\n"
5338 "shufps $14, %%xmm9, %%xmm10\n"
5339 "movaps %%xmm0, %%xmm11\n"
5340 "shufps $224, %%xmm11, %%xmm9\n"
5341 "addps %%xmm8, %%xmm10\n"
5342 "subps %%xmm9, %%xmm10\n"
5343 "movaps %%xmm10, %%xmm0\n"
5344 "movaps %%xmm1, %%xmm8\n"
5345 "shufps $68, %%xmm8, %%xmm8\n"
5346 "xorps %%xmm9, %%xmm9\n"
5347 "movaps %%xmm1, %%xmm10\n"
5348 "shufps $14, %%xmm9, %%xmm10\n"
5349 "movaps %%xmm1, %%xmm11\n"
5350 "shufps $224, %%xmm11, %%xmm9\n"
5351 "addps %%xmm8, %%xmm10\n"
5352 "subps %%xmm9, %%xmm10\n"
5353 "movaps %%xmm10, %%xmm1\n"
5354 "movaps %%xmm2, %%xmm8\n"
5355 "shufps $68, %%xmm8, %%xmm8\n"
5356 "xorps %%xmm9, %%xmm9\n"
5357 "movaps %%xmm2, %%xmm10\n"
5358 "shufps $14, %%xmm9, %%xmm10\n"
5359 "movaps %%xmm2, %%xmm11\n"
5360 "shufps $224, %%xmm11, %%xmm9\n"
5361 "addps %%xmm8, %%xmm10\n"
5362 "subps %%xmm9, %%xmm10\n"
5363 "movaps %%xmm10, %%xmm2\n"
5364 "movaps %%xmm3, %%xmm8\n"
5365 "shufps $68, %%xmm8, %%xmm8\n"
5366 "xorps %%xmm9, %%xmm9\n"
5367 "movaps %%xmm3, %%xmm10\n"
5368 "shufps $14, %%xmm9, %%xmm10\n"
5369 "movaps %%xmm3, %%xmm11\n"
5370 "shufps $224, %%xmm11, %%xmm9\n"
5371 "addps %%xmm8, %%xmm10\n"
5372 "subps %%xmm9, %%xmm10\n"
5373 "movaps %%xmm10, %%xmm3\n"
5374 "movaps %%xmm4, %%xmm8\n"
5375 "shufps $68, %%xmm8, %%xmm8\n"
5376 "xorps %%xmm9, %%xmm9\n"
5377 "movaps %%xmm4, %%xmm10\n"
5378 "shufps $14, %%xmm9, %%xmm10\n"
5379 "movaps %%xmm4, %%xmm11\n"
5380 "shufps $224, %%xmm11, %%xmm9\n"
5381 "addps %%xmm8, %%xmm10\n"
5382 "subps %%xmm9, %%xmm10\n"
5383 "movaps %%xmm10, %%xmm4\n"
5384 "movaps %%xmm5, %%xmm8\n"
5385 "shufps $68, %%xmm8, %%xmm8\n"
5386 "xorps %%xmm9, %%xmm9\n"
5387 "movaps %%xmm5, %%xmm10\n"
5388 "shufps $14, %%xmm9, %%xmm10\n"
5389 "movaps %%xmm5, %%xmm11\n"
5390 "shufps $224, %%xmm11, %%xmm9\n"
5391 "addps %%xmm8, %%xmm10\n"
5392 "subps %%xmm9, %%xmm10\n"
5393 "movaps %%xmm10, %%xmm5\n"
5394 "movaps %%xmm6, %%xmm8\n"
5395 "shufps $68, %%xmm8, %%xmm8\n"
5396 "xorps %%xmm9, %%xmm9\n"
5397 "movaps %%xmm6, %%xmm10\n"
5398 "shufps $14, %%xmm9, %%xmm10\n"
5399 "movaps %%xmm6, %%xmm11\n"
5400 "shufps $224, %%xmm11, %%xmm9\n"
5401 "addps %%xmm8, %%xmm10\n"
5402 "subps %%xmm9, %%xmm10\n"
5403 "movaps %%xmm10, %%xmm6\n"
5404 "movaps %%xmm7, %%xmm8\n"
5405 "shufps $68, %%xmm8, %%xmm8\n"
5406 "xorps %%xmm9, %%xmm9\n"
5407 "movaps %%xmm7, %%xmm10\n"
5408 "shufps $14, %%xmm9, %%xmm10\n"
5409 "movaps %%xmm7, %%xmm11\n"
5410 "shufps $224, %%xmm11, %%xmm9\n"
5411 "addps %%xmm8, %%xmm10\n"
5412 "subps %%xmm9, %%xmm10\n"
5413 "movaps %%xmm10, %%xmm7\n"
5414 "movaps %%xmm0, %%xmm8\n"
5415 "movaps %%xmm0, %%xmm9\n"
5416 "addps %%xmm1, %%xmm8\n"
5417 "subps %%xmm1, %%xmm9\n"
5418 "movaps %%xmm2, %%xmm10\n"
5419 "movaps %%xmm2, %%xmm11\n"
5420 "addps %%xmm3, %%xmm10\n"
5421 "subps %%xmm3, %%xmm11\n"
5422 "movaps %%xmm4, %%xmm12\n"
5423 "movaps %%xmm4, %%xmm13\n"
5424 "addps %%xmm5, %%xmm12\n"
5425 "subps %%xmm5, %%xmm13\n"
5426 "movaps %%xmm6, %%xmm14\n"
5427 "movaps %%xmm6, %%xmm15\n"
5428 "addps %%xmm7, %%xmm14\n"
5429 "subps %%xmm7, %%xmm15\n"
5430 "movaps %%xmm8, %%xmm0\n"
5431 "movaps %%xmm8, %%xmm2\n"
5432 "addps %%xmm10, %%xmm0\n"
5433 "subps %%xmm10, %%xmm2\n"
5434 "movaps %%xmm9, %%xmm1\n"
5435 "movaps %%xmm9, %%xmm3\n"
5436 "addps %%xmm11, %%xmm1\n"
5437 "subps %%xmm11, %%xmm3\n"
5438 "movaps %%xmm12, %%xmm4\n"
5439 "movaps %%xmm12, %%xmm6\n"
5440 "addps %%xmm14, %%xmm4\n"
5441 "subps %%xmm14, %%xmm6\n"
5442 "movaps %%xmm13, %%xmm5\n"
5443 "movaps %%xmm13, %%xmm7\n"
5444 "addps %%xmm15, %%xmm5\n"
5445 "subps %%xmm15, %%xmm7\n"
5446 "movaps %%xmm0, %%xmm8\n"
5447 "movaps %%xmm0, %%xmm12\n"
5448 "addps %%xmm4, %%xmm8\n"
5449 "subps %%xmm4, %%xmm12\n"
5450 "movaps %%xmm1, %%xmm9\n"
5451 "movaps %%xmm1, %%xmm13\n"
5452 "addps %%xmm5, %%xmm9\n"
5453 "subps %%xmm5, %%xmm13\n"
5454 "movaps %%xmm2, %%xmm10\n"
5455 "movaps %%xmm2, %%xmm14\n"
5456 "addps %%xmm6, %%xmm10\n"
5457 "subps %%xmm6, %%xmm14\n"
5458 "movaps %%xmm3, %%xmm11\n"
5459 "movaps %%xmm3, %%xmm15\n"
5460 "addps %%xmm7, %%xmm11\n"
5461 "subps %%xmm7, %%xmm15\n"
5462 "movups %%xmm8, (%0)\n"
5463 "movups %%xmm9, (%1)\n"
5464 "movups %%xmm10, (%2)\n"
5465 "movups %%xmm11, (%3)\n"
5466 "movups %%xmm12, (%4)\n"
5467 "movups %%xmm13, (%5)\n"
5468 "movups %%xmm14, (%6)\n"
5469 "movups %%xmm15, (%7)\n"
5470 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
5471 );
5472 }
5473 }
5474 for (int j = 0; j < 8192; j += 256) {
5475 for (int k = 0; k < 32; k += 4) {
5476 __asm__ volatile (
5477 "movups (%0), %%xmm0\n"
5478 "movups (%1), %%xmm1\n"
5479 "movups (%2), %%xmm2\n"
5480 "movups (%3), %%xmm3\n"
5481 "movups (%4), %%xmm4\n"
5482 "movups (%5), %%xmm5\n"
5483 "movups (%6), %%xmm6\n"
5484 "movups (%7), %%xmm7\n"
5485 "movaps %%xmm0, %%xmm8\n"
5486 "movaps %%xmm0, %%xmm9\n"
5487 "addps %%xmm1, %%xmm8\n"
5488 "subps %%xmm1, %%xmm9\n"
5489 "movaps %%xmm2, %%xmm10\n"
5490 "movaps %%xmm2, %%xmm11\n"
5491 "addps %%xmm3, %%xmm10\n"
5492 "subps %%xmm3, %%xmm11\n"
5493 "movaps %%xmm4, %%xmm12\n"
5494 "movaps %%xmm4, %%xmm13\n"
5495 "addps %%xmm5, %%xmm12\n"
5496 "subps %%xmm5, %%xmm13\n"
5497 "movaps %%xmm6, %%xmm14\n"
5498 "movaps %%xmm6, %%xmm15\n"
5499 "addps %%xmm7, %%xmm14\n"
5500 "subps %%xmm7, %%xmm15\n"
5501 "movaps %%xmm8, %%xmm0\n"
5502 "movaps %%xmm8, %%xmm2\n"
5503 "addps %%xmm10, %%xmm0\n"
5504 "subps %%xmm10, %%xmm2\n"
5505 "movaps %%xmm9, %%xmm1\n"
5506 "movaps %%xmm9, %%xmm3\n"
5507 "addps %%xmm11, %%xmm1\n"
5508 "subps %%xmm11, %%xmm3\n"
5509 "movaps %%xmm12, %%xmm4\n"
5510 "movaps %%xmm12, %%xmm6\n"
5511 "addps %%xmm14, %%xmm4\n"
5512 "subps %%xmm14, %%xmm6\n"
5513 "movaps %%xmm13, %%xmm5\n"
5514 "movaps %%xmm13, %%xmm7\n"
5515 "addps %%xmm15, %%xmm5\n"
5516 "subps %%xmm15, %%xmm7\n"
5517 "movaps %%xmm0, %%xmm8\n"
5518 "movaps %%xmm0, %%xmm12\n"
5519 "addps %%xmm4, %%xmm8\n"
5520 "subps %%xmm4, %%xmm12\n"
5521 "movaps %%xmm1, %%xmm9\n"
5522 "movaps %%xmm1, %%xmm13\n"
5523 "addps %%xmm5, %%xmm9\n"
5524 "subps %%xmm5, %%xmm13\n"
5525 "movaps %%xmm2, %%xmm10\n"
5526 "movaps %%xmm2, %%xmm14\n"
5527 "addps %%xmm6, %%xmm10\n"
5528 "subps %%xmm6, %%xmm14\n"
5529 "movaps %%xmm3, %%xmm11\n"
5530 "movaps %%xmm3, %%xmm15\n"
5531 "addps %%xmm7, %%xmm11\n"
5532 "subps %%xmm7, %%xmm15\n"
5533 "movups %%xmm8, (%0)\n"
5534 "movups %%xmm9, (%1)\n"
5535 "movups %%xmm10, (%2)\n"
5536 "movups %%xmm11, (%3)\n"
5537 "movups %%xmm12, (%4)\n"
5538 "movups %%xmm13, (%5)\n"
5539 "movups %%xmm14, (%6)\n"
5540 "movups %%xmm15, (%7)\n"
5541 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
5542 );
5543 }
5544 }
5545 for (int j = 0; j < 8192; j += 2048) {
5546 for (int k = 0; k < 256; k += 4) {
5547 __asm__ volatile (
5548 "movups (%0), %%xmm0\n"
5549 "movups (%1), %%xmm1\n"
5550 "movups (%2), %%xmm2\n"
5551 "movups (%3), %%xmm3\n"
5552 "movups (%4), %%xmm4\n"
5553 "movups (%5), %%xmm5\n"
5554 "movups (%6), %%xmm6\n"
5555 "movups (%7), %%xmm7\n"
5556 "movaps %%xmm0, %%xmm8\n"
5557 "movaps %%xmm0, %%xmm9\n"
5558 "addps %%xmm1, %%xmm8\n"
5559 "subps %%xmm1, %%xmm9\n"
5560 "movaps %%xmm2, %%xmm10\n"
5561 "movaps %%xmm2, %%xmm11\n"
5562 "addps %%xmm3, %%xmm10\n"
5563 "subps %%xmm3, %%xmm11\n"
5564 "movaps %%xmm4, %%xmm12\n"
5565 "movaps %%xmm4, %%xmm13\n"
5566 "addps %%xmm5, %%xmm12\n"
5567 "subps %%xmm5, %%xmm13\n"
5568 "movaps %%xmm6, %%xmm14\n"
5569 "movaps %%xmm6, %%xmm15\n"
5570 "addps %%xmm7, %%xmm14\n"
5571 "subps %%xmm7, %%xmm15\n"
5572 "movaps %%xmm8, %%xmm0\n"
5573 "movaps %%xmm8, %%xmm2\n"
5574 "addps %%xmm10, %%xmm0\n"
5575 "subps %%xmm10, %%xmm2\n"
5576 "movaps %%xmm9, %%xmm1\n"
5577 "movaps %%xmm9, %%xmm3\n"
5578 "addps %%xmm11, %%xmm1\n"
5579 "subps %%xmm11, %%xmm3\n"
5580 "movaps %%xmm12, %%xmm4\n"
5581 "movaps %%xmm12, %%xmm6\n"
5582 "addps %%xmm14, %%xmm4\n"
5583 "subps %%xmm14, %%xmm6\n"
5584 "movaps %%xmm13, %%xmm5\n"
5585 "movaps %%xmm13, %%xmm7\n"
5586 "addps %%xmm15, %%xmm5\n"
5587 "subps %%xmm15, %%xmm7\n"
5588 "movaps %%xmm0, %%xmm8\n"
5589 "movaps %%xmm0, %%xmm12\n"
5590 "addps %%xmm4, %%xmm8\n"
5591 "subps %%xmm4, %%xmm12\n"
5592 "movaps %%xmm1, %%xmm9\n"
5593 "movaps %%xmm1, %%xmm13\n"
5594 "addps %%xmm5, %%xmm9\n"
5595 "subps %%xmm5, %%xmm13\n"
5596 "movaps %%xmm2, %%xmm10\n"
5597 "movaps %%xmm2, %%xmm14\n"
5598 "addps %%xmm6, %%xmm10\n"
5599 "subps %%xmm6, %%xmm14\n"
5600 "movaps %%xmm3, %%xmm11\n"
5601 "movaps %%xmm3, %%xmm15\n"
5602 "addps %%xmm7, %%xmm11\n"
5603 "subps %%xmm7, %%xmm15\n"
5604 "movups %%xmm8, (%0)\n"
5605 "movups %%xmm9, (%1)\n"
5606 "movups %%xmm10, (%2)\n"
5607 "movups %%xmm11, (%3)\n"
5608 "movups %%xmm12, (%4)\n"
5609 "movups %%xmm13, (%5)\n"
5610 "movups %%xmm14, (%6)\n"
5611 "movups %%xmm15, (%7)\n"
5612 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
5613 );
5614 }
5615 }
5616 for (int j = 0; j < 8192; j += 8192) {
5617 for (int k = 0; k < 2048; k += 4) {
5618 __asm__ volatile (
5619 "movups (%0), %%xmm0\n"
5620 "movups (%1), %%xmm1\n"
5621 "movups (%2), %%xmm2\n"
5622 "movups (%3), %%xmm3\n"
5623 "movaps %%xmm0, %%xmm8\n"
5624 "movaps %%xmm0, %%xmm9\n"
5625 "addps %%xmm1, %%xmm8\n"
5626 "subps %%xmm1, %%xmm9\n"
5627 "movaps %%xmm2, %%xmm10\n"
5628 "movaps %%xmm2, %%xmm11\n"
5629 "addps %%xmm3, %%xmm10\n"
5630 "subps %%xmm3, %%xmm11\n"
5631 "movaps %%xmm8, %%xmm0\n"
5632 "movaps %%xmm8, %%xmm2\n"
5633 "addps %%xmm10, %%xmm0\n"
5634 "subps %%xmm10, %%xmm2\n"
5635 "movaps %%xmm9, %%xmm1\n"
5636 "movaps %%xmm9, %%xmm3\n"
5637 "addps %%xmm11, %%xmm1\n"
5638 "subps %%xmm11, %%xmm3\n"
5639 "movups %%xmm0, (%0)\n"
5640 "movups %%xmm1, (%1)\n"
5641 "movups %%xmm2, (%2)\n"
5642 "movups %%xmm3, (%3)\n"
5643 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
5644 );
5645 }
5646 }
5647 return;
5648 }
5649 if (depth == 16) {
5650 helper_float_19_recursive(buf + 0, 13);
5651 helper_float_19_recursive(buf + 8192, 13);
5652 helper_float_19_recursive(buf + 16384, 13);
5653 helper_float_19_recursive(buf + 24576, 13);
5654 helper_float_19_recursive(buf + 32768, 13);
5655 helper_float_19_recursive(buf + 40960, 13);
5656 helper_float_19_recursive(buf + 49152, 13);
5657 helper_float_19_recursive(buf + 57344, 13);
5658 for (int j = 0; j < 65536; j += 65536) {
5659 for (int k = 0; k < 8192; k += 4) {
5660 __asm__ volatile (
5661 "movups (%0), %%xmm0\n"
5662 "movups (%1), %%xmm1\n"
5663 "movups (%2), %%xmm2\n"
5664 "movups (%3), %%xmm3\n"
5665 "movups (%4), %%xmm4\n"
5666 "movups (%5), %%xmm5\n"
5667 "movups (%6), %%xmm6\n"
5668 "movups (%7), %%xmm7\n"
5669 "movaps %%xmm0, %%xmm8\n"
5670 "movaps %%xmm0, %%xmm9\n"
5671 "addps %%xmm1, %%xmm8\n"
5672 "subps %%xmm1, %%xmm9\n"
5673 "movaps %%xmm2, %%xmm10\n"
5674 "movaps %%xmm2, %%xmm11\n"
5675 "addps %%xmm3, %%xmm10\n"
5676 "subps %%xmm3, %%xmm11\n"
5677 "movaps %%xmm4, %%xmm12\n"
5678 "movaps %%xmm4, %%xmm13\n"
5679 "addps %%xmm5, %%xmm12\n"
5680 "subps %%xmm5, %%xmm13\n"
5681 "movaps %%xmm6, %%xmm14\n"
5682 "movaps %%xmm6, %%xmm15\n"
5683 "addps %%xmm7, %%xmm14\n"
5684 "subps %%xmm7, %%xmm15\n"
5685 "movaps %%xmm8, %%xmm0\n"
5686 "movaps %%xmm8, %%xmm2\n"
5687 "addps %%xmm10, %%xmm0\n"
5688 "subps %%xmm10, %%xmm2\n"
5689 "movaps %%xmm9, %%xmm1\n"
5690 "movaps %%xmm9, %%xmm3\n"
5691 "addps %%xmm11, %%xmm1\n"
5692 "subps %%xmm11, %%xmm3\n"
5693 "movaps %%xmm12, %%xmm4\n"
5694 "movaps %%xmm12, %%xmm6\n"
5695 "addps %%xmm14, %%xmm4\n"
5696 "subps %%xmm14, %%xmm6\n"
5697 "movaps %%xmm13, %%xmm5\n"
5698 "movaps %%xmm13, %%xmm7\n"
5699 "addps %%xmm15, %%xmm5\n"
5700 "subps %%xmm15, %%xmm7\n"
5701 "movaps %%xmm0, %%xmm8\n"
5702 "movaps %%xmm0, %%xmm12\n"
5703 "addps %%xmm4, %%xmm8\n"
5704 "subps %%xmm4, %%xmm12\n"
5705 "movaps %%xmm1, %%xmm9\n"
5706 "movaps %%xmm1, %%xmm13\n"
5707 "addps %%xmm5, %%xmm9\n"
5708 "subps %%xmm5, %%xmm13\n"
5709 "movaps %%xmm2, %%xmm10\n"
5710 "movaps %%xmm2, %%xmm14\n"
5711 "addps %%xmm6, %%xmm10\n"
5712 "subps %%xmm6, %%xmm14\n"
5713 "movaps %%xmm3, %%xmm11\n"
5714 "movaps %%xmm3, %%xmm15\n"
5715 "addps %%xmm7, %%xmm11\n"
5716 "subps %%xmm7, %%xmm15\n"
5717 "movups %%xmm8, (%0)\n"
5718 "movups %%xmm9, (%1)\n"
5719 "movups %%xmm10, (%2)\n"
5720 "movups %%xmm11, (%3)\n"
5721 "movups %%xmm12, (%4)\n"
5722 "movups %%xmm13, (%5)\n"
5723 "movups %%xmm14, (%6)\n"
5724 "movups %%xmm15, (%7)\n"
5725 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
5726 );
5727 }
5728 }
5729 return;
5730 }
5731 if (depth == 19) {
5732 helper_float_19_recursive(buf + 0, 16);
5733 helper_float_19_recursive(buf + 65536, 16);
5734 helper_float_19_recursive(buf + 131072, 16);
5735 helper_float_19_recursive(buf + 196608, 16);
5736 helper_float_19_recursive(buf + 262144, 16);
5737 helper_float_19_recursive(buf + 327680, 16);
5738 helper_float_19_recursive(buf + 393216, 16);
5739 helper_float_19_recursive(buf + 458752, 16);
5740 for (int j = 0; j < 524288; j += 524288) {
5741 for (int k = 0; k < 65536; k += 4) {
5742 __asm__ volatile (
5743 "movups (%0), %%xmm0\n"
5744 "movups (%1), %%xmm1\n"
5745 "movups (%2), %%xmm2\n"
5746 "movups (%3), %%xmm3\n"
5747 "movups (%4), %%xmm4\n"
5748 "movups (%5), %%xmm5\n"
5749 "movups (%6), %%xmm6\n"
5750 "movups (%7), %%xmm7\n"
5751 "movaps %%xmm0, %%xmm8\n"
5752 "movaps %%xmm0, %%xmm9\n"
5753 "addps %%xmm1, %%xmm8\n"
5754 "subps %%xmm1, %%xmm9\n"
5755 "movaps %%xmm2, %%xmm10\n"
5756 "movaps %%xmm2, %%xmm11\n"
5757 "addps %%xmm3, %%xmm10\n"
5758 "subps %%xmm3, %%xmm11\n"
5759 "movaps %%xmm4, %%xmm12\n"
5760 "movaps %%xmm4, %%xmm13\n"
5761 "addps %%xmm5, %%xmm12\n"
5762 "subps %%xmm5, %%xmm13\n"
5763 "movaps %%xmm6, %%xmm14\n"
5764 "movaps %%xmm6, %%xmm15\n"
5765 "addps %%xmm7, %%xmm14\n"
5766 "subps %%xmm7, %%xmm15\n"
5767 "movaps %%xmm8, %%xmm0\n"
5768 "movaps %%xmm8, %%xmm2\n"
5769 "addps %%xmm10, %%xmm0\n"
5770 "subps %%xmm10, %%xmm2\n"
5771 "movaps %%xmm9, %%xmm1\n"
5772 "movaps %%xmm9, %%xmm3\n"
5773 "addps %%xmm11, %%xmm1\n"
5774 "subps %%xmm11, %%xmm3\n"
5775 "movaps %%xmm12, %%xmm4\n"
5776 "movaps %%xmm12, %%xmm6\n"
5777 "addps %%xmm14, %%xmm4\n"
5778 "subps %%xmm14, %%xmm6\n"
5779 "movaps %%xmm13, %%xmm5\n"
5780 "movaps %%xmm13, %%xmm7\n"
5781 "addps %%xmm15, %%xmm5\n"
5782 "subps %%xmm15, %%xmm7\n"
5783 "movaps %%xmm0, %%xmm8\n"
5784 "movaps %%xmm0, %%xmm12\n"
5785 "addps %%xmm4, %%xmm8\n"
5786 "subps %%xmm4, %%xmm12\n"
5787 "movaps %%xmm1, %%xmm9\n"
5788 "movaps %%xmm1, %%xmm13\n"
5789 "addps %%xmm5, %%xmm9\n"
5790 "subps %%xmm5, %%xmm13\n"
5791 "movaps %%xmm2, %%xmm10\n"
5792 "movaps %%xmm2, %%xmm14\n"
5793 "addps %%xmm6, %%xmm10\n"
5794 "subps %%xmm6, %%xmm14\n"
5795 "movaps %%xmm3, %%xmm11\n"
5796 "movaps %%xmm3, %%xmm15\n"
5797 "addps %%xmm7, %%xmm11\n"
5798 "subps %%xmm7, %%xmm15\n"
5799 "movups %%xmm8, (%0)\n"
5800 "movups %%xmm9, (%1)\n"
5801 "movups %%xmm10, (%2)\n"
5802 "movups %%xmm11, (%3)\n"
5803 "movups %%xmm12, (%4)\n"
5804 "movups %%xmm13, (%5)\n"
5805 "movups %%xmm14, (%6)\n"
5806 "movups %%xmm15, (%7)\n"
5807 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
5808 );
5809 }
5810 }
5811 return;
5812 }
5813 }
5814 void helper_float_19(float *buf);
/*
 * Entry point for the 2^19-point (524288-element) in-place float transform.
 * Thin wrapper: delegates to the recursive helper with the full depth of 19.
 * `buf` must hold at least 2^19 floats; contents are overwritten with the
 * transformed values.
 */
void helper_float_19(float *buf) {
helper_float_19_recursive(buf, 19);
}
5818 void helper_float_20_recursive(float *buf, int depth);
/*
 * Recursive step of the 2^20-point in-place float transform (generated code).
 *
 * The recursion is unrolled by a factor of 8 per level: depth 20 is computed
 * as eight depth-17 sub-transforms followed by a cross-block butterfly
 * combine; depth 17 decomposes into eight depth-14 sub-transforms, and so on
 * down to the depth == 8 base case, which handles a 256-float block directly
 * with SSE inline assembly.
 *
 * Parameters:
 *   buf   - pointer to the (sub-)block being transformed in place; must be
 *           at least 2^depth floats long.
 *   depth - log2 of the block size; only 8, 11, 14, 17 and 20 are reachable
 *           from helper_float_20(). NOTE(review): any other depth silently
 *           does nothing (no branch matches and the function returns).
 *
 * All asm blocks clobber xmm0-xmm15 and memory; operands are passed as raw
 * pointers, hence the "memory" clobber to order the loads/stores.
 */
void helper_float_20_recursive(float *buf, int depth) {
if (depth == 8) {
/* Base case: transform each of the eight 32-float sub-blocks of the
 * 256-float block.  One asm block performs the first five butterfly
 * stages (strides 1, 2, 4, 8, 16 -> a complete 32-point transform)
 * entirely in registers xmm0-xmm7 (8 regs x 4 floats = 32 floats). */
for (int j = 0; j < 256; j += 32) {
/* Inner loop iterates exactly once (k = 0); kept by the generator so the
 * pointer expressions below share the j + k form used at other depths. */
for (int k = 0; k < 4; k += 4) {
__asm__ volatile (
/* Load 32 consecutive floats into xmm0..xmm7. */
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
/* Stage 1 (stride 1): pairwise a+b / a-b inside each register,
 * using shufps to duplicate lanes and addsubps on a negated copy.
 * The same 7-instruction pattern is applied to xmm0..xmm7. */
"movaps %%xmm0, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm0, %%xmm0\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm0, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm1, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm1, %%xmm1\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm1, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm1\n"
"movaps %%xmm2, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm2, %%xmm2\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm2, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm2\n"
"movaps %%xmm3, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm3, %%xmm3\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm3, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm3\n"
"movaps %%xmm4, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm4, %%xmm4\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm4, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm4\n"
"movaps %%xmm5, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm5, %%xmm5\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm5, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm5\n"
"movaps %%xmm6, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm6, %%xmm6\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm6, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm6\n"
"movaps %%xmm7, %%xmm8\n"
"shufps $160, %%xmm8, %%xmm8\n"
"shufps $245, %%xmm7, %%xmm7\n"
"xorps %%xmm9, %%xmm9\n"
"subps %%xmm7, %%xmm9\n"
"addsubps %%xmm9, %%xmm8\n"
"movaps %%xmm8, %%xmm7\n"
/* Stage 2 (stride 2): butterfly between the low and high 64-bit
 * halves of each register, again repeated for xmm0..xmm7. */
"movaps %%xmm0, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm0, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm0, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm0\n"
"movaps %%xmm1, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm1, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm1, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm1\n"
"movaps %%xmm2, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm2\n"
"movaps %%xmm3, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm3, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm3, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm3\n"
"movaps %%xmm4, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm4, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm4, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm4\n"
"movaps %%xmm5, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm5, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm5, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm5\n"
"movaps %%xmm6, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm6, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm6, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm6\n"
"movaps %%xmm7, %%xmm8\n"
"shufps $68, %%xmm8, %%xmm8\n"
"xorps %%xmm9, %%xmm9\n"
"movaps %%xmm7, %%xmm10\n"
"shufps $14, %%xmm9, %%xmm10\n"
"movaps %%xmm7, %%xmm11\n"
"shufps $224, %%xmm11, %%xmm9\n"
"addps %%xmm8, %%xmm10\n"
"subps %%xmm9, %%xmm10\n"
"movaps %%xmm10, %%xmm7\n"
/* Stage 3 (stride 4): whole-register butterflies between adjacent
 * register pairs (0/1, 2/3, 4/5, 6/7). */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
/* Stage 4 (stride 8): butterflies between register pairs two apart. */
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
/* Stage 5 (stride 16): butterflies between register pairs four apart;
 * results land in xmm8..xmm15 ready for the stores. */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
/* Write the 32 transformed floats back in place. */
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
/* Combine pass for depth 8: three more butterfly stages (strides 32, 64,
 * 128) done in one sweep by loading 4-float vectors from eight positions
 * 32 floats apart and applying an 8-way butterfly network. The outer loop
 * runs once (j = 0). */
for (int j = 0; j < 256; j += 256) {
for (int k = 0; k < 32; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
/* First combine stage: pairs (0,1) (2,3) (4,5) (6,7). */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
/* Second combine stage: pairs two apart. */
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
/* Third combine stage: pairs four apart; results in xmm8..xmm15. */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
if (depth == 11) {
/* depth 11 (2048 floats): eight 256-float sub-transforms, then an 8-way
 * butterfly combine across sub-blocks 256 floats apart (the asm is the
 * same combine network as above, only the pointer offsets differ). */
helper_float_20_recursive(buf + 0, 8);
helper_float_20_recursive(buf + 256, 8);
helper_float_20_recursive(buf + 512, 8);
helper_float_20_recursive(buf + 768, 8);
helper_float_20_recursive(buf + 1024, 8);
helper_float_20_recursive(buf + 1280, 8);
helper_float_20_recursive(buf + 1536, 8);
helper_float_20_recursive(buf + 1792, 8);
for (int j = 0; j < 2048; j += 2048) {
for (int k = 0; k < 256; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
/* 8-way butterfly: three stages (pairs 1, 2, 4 apart). */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
if (depth == 14) {
/* depth 14 (16384 floats): eight depth-11 sub-transforms, then the same
 * 8-way combine with sub-blocks 2048 floats apart. */
helper_float_20_recursive(buf + 0, 11);
helper_float_20_recursive(buf + 2048, 11);
helper_float_20_recursive(buf + 4096, 11);
helper_float_20_recursive(buf + 6144, 11);
helper_float_20_recursive(buf + 8192, 11);
helper_float_20_recursive(buf + 10240, 11);
helper_float_20_recursive(buf + 12288, 11);
helper_float_20_recursive(buf + 14336, 11);
for (int j = 0; j < 16384; j += 16384) {
for (int k = 0; k < 2048; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
/* 8-way butterfly, identical network to the depth-11 combine. */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
if (depth == 17) {
/* depth 17 (131072 floats): eight depth-14 sub-transforms, then the same
 * 8-way combine with sub-blocks 16384 floats apart. */
helper_float_20_recursive(buf + 0, 14);
helper_float_20_recursive(buf + 16384, 14);
helper_float_20_recursive(buf + 32768, 14);
helper_float_20_recursive(buf + 49152, 14);
helper_float_20_recursive(buf + 65536, 14);
helper_float_20_recursive(buf + 81920, 14);
helper_float_20_recursive(buf + 98304, 14);
helper_float_20_recursive(buf + 114688, 14);
for (int j = 0; j < 131072; j += 131072) {
for (int k = 0; k < 16384; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
/* 8-way butterfly, identical network to the depth-11 combine. */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
if (depth == 20) {
/* depth 20 (1048576 floats, top level): eight depth-17 sub-transforms,
 * then the final 8-way combine with sub-blocks 131072 floats apart. */
helper_float_20_recursive(buf + 0, 17);
helper_float_20_recursive(buf + 131072, 17);
helper_float_20_recursive(buf + 262144, 17);
helper_float_20_recursive(buf + 393216, 17);
helper_float_20_recursive(buf + 524288, 17);
helper_float_20_recursive(buf + 655360, 17);
helper_float_20_recursive(buf + 786432, 17);
helper_float_20_recursive(buf + 917504, 17);
for (int j = 0; j < 1048576; j += 1048576) {
for (int k = 0; k < 131072; k += 4) {
__asm__ volatile (
"movups (%0), %%xmm0\n"
"movups (%1), %%xmm1\n"
"movups (%2), %%xmm2\n"
"movups (%3), %%xmm3\n"
"movups (%4), %%xmm4\n"
"movups (%5), %%xmm5\n"
"movups (%6), %%xmm6\n"
"movups (%7), %%xmm7\n"
/* 8-way butterfly, identical network to the depth-11 combine. */
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm9\n"
"addps %%xmm1, %%xmm8\n"
"subps %%xmm1, %%xmm9\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm11\n"
"addps %%xmm3, %%xmm10\n"
"subps %%xmm3, %%xmm11\n"
"movaps %%xmm4, %%xmm12\n"
"movaps %%xmm4, %%xmm13\n"
"addps %%xmm5, %%xmm12\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm6, %%xmm14\n"
"movaps %%xmm6, %%xmm15\n"
"addps %%xmm7, %%xmm14\n"
"subps %%xmm7, %%xmm15\n"
"movaps %%xmm8, %%xmm0\n"
"movaps %%xmm8, %%xmm2\n"
"addps %%xmm10, %%xmm0\n"
"subps %%xmm10, %%xmm2\n"
"movaps %%xmm9, %%xmm1\n"
"movaps %%xmm9, %%xmm3\n"
"addps %%xmm11, %%xmm1\n"
"subps %%xmm11, %%xmm3\n"
"movaps %%xmm12, %%xmm4\n"
"movaps %%xmm12, %%xmm6\n"
"addps %%xmm14, %%xmm4\n"
"subps %%xmm14, %%xmm6\n"
"movaps %%xmm13, %%xmm5\n"
"movaps %%xmm13, %%xmm7\n"
"addps %%xmm15, %%xmm5\n"
"subps %%xmm15, %%xmm7\n"
"movaps %%xmm0, %%xmm8\n"
"movaps %%xmm0, %%xmm12\n"
"addps %%xmm4, %%xmm8\n"
"subps %%xmm4, %%xmm12\n"
"movaps %%xmm1, %%xmm9\n"
"movaps %%xmm1, %%xmm13\n"
"addps %%xmm5, %%xmm9\n"
"subps %%xmm5, %%xmm13\n"
"movaps %%xmm2, %%xmm10\n"
"movaps %%xmm2, %%xmm14\n"
"addps %%xmm6, %%xmm10\n"
"subps %%xmm6, %%xmm14\n"
"movaps %%xmm3, %%xmm11\n"
"movaps %%xmm3, %%xmm15\n"
"addps %%xmm7, %%xmm11\n"
"subps %%xmm7, %%xmm15\n"
"movups %%xmm8, (%0)\n"
"movups %%xmm9, (%1)\n"
"movups %%xmm10, (%2)\n"
"movups %%xmm11, (%3)\n"
"movups %%xmm12, (%4)\n"
"movups %%xmm13, (%5)\n"
"movups %%xmm14, (%6)\n"
"movups %%xmm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
);
}
}
return;
}
}
6430 void helper_float_20(float *buf);
/*
 * Entry point for the 2^20-point (1048576-element) in-place float transform.
 * Thin wrapper: delegates to the recursive helper with the full depth of 20.
 * `buf` must hold at least 2^20 floats; contents are overwritten with the
 * transformed values.
 */
void helper_float_20(float *buf) {
helper_float_20_recursive(buf, 20);
}
6434 void helper_float_21_recursive(float *buf, int depth);
helper_float_21_recursive(float * buf,int depth)6435 void helper_float_21_recursive(float *buf, int depth) {
6436 if (depth == 13) {
6437 for (int j = 0; j < 8192; j += 32) {
6438 for (int k = 0; k < 4; k += 4) {
6439 __asm__ volatile (
6440 "movups (%0), %%xmm0\n"
6441 "movups (%1), %%xmm1\n"
6442 "movups (%2), %%xmm2\n"
6443 "movups (%3), %%xmm3\n"
6444 "movups (%4), %%xmm4\n"
6445 "movups (%5), %%xmm5\n"
6446 "movups (%6), %%xmm6\n"
6447 "movups (%7), %%xmm7\n"
6448 "movaps %%xmm0, %%xmm8\n"
6449 "shufps $160, %%xmm8, %%xmm8\n"
6450 "shufps $245, %%xmm0, %%xmm0\n"
6451 "xorps %%xmm9, %%xmm9\n"
6452 "subps %%xmm0, %%xmm9\n"
6453 "addsubps %%xmm9, %%xmm8\n"
6454 "movaps %%xmm8, %%xmm0\n"
6455 "movaps %%xmm1, %%xmm8\n"
6456 "shufps $160, %%xmm8, %%xmm8\n"
6457 "shufps $245, %%xmm1, %%xmm1\n"
6458 "xorps %%xmm9, %%xmm9\n"
6459 "subps %%xmm1, %%xmm9\n"
6460 "addsubps %%xmm9, %%xmm8\n"
6461 "movaps %%xmm8, %%xmm1\n"
6462 "movaps %%xmm2, %%xmm8\n"
6463 "shufps $160, %%xmm8, %%xmm8\n"
6464 "shufps $245, %%xmm2, %%xmm2\n"
6465 "xorps %%xmm9, %%xmm9\n"
6466 "subps %%xmm2, %%xmm9\n"
6467 "addsubps %%xmm9, %%xmm8\n"
6468 "movaps %%xmm8, %%xmm2\n"
6469 "movaps %%xmm3, %%xmm8\n"
6470 "shufps $160, %%xmm8, %%xmm8\n"
6471 "shufps $245, %%xmm3, %%xmm3\n"
6472 "xorps %%xmm9, %%xmm9\n"
6473 "subps %%xmm3, %%xmm9\n"
6474 "addsubps %%xmm9, %%xmm8\n"
6475 "movaps %%xmm8, %%xmm3\n"
6476 "movaps %%xmm4, %%xmm8\n"
6477 "shufps $160, %%xmm8, %%xmm8\n"
6478 "shufps $245, %%xmm4, %%xmm4\n"
6479 "xorps %%xmm9, %%xmm9\n"
6480 "subps %%xmm4, %%xmm9\n"
6481 "addsubps %%xmm9, %%xmm8\n"
6482 "movaps %%xmm8, %%xmm4\n"
6483 "movaps %%xmm5, %%xmm8\n"
6484 "shufps $160, %%xmm8, %%xmm8\n"
6485 "shufps $245, %%xmm5, %%xmm5\n"
6486 "xorps %%xmm9, %%xmm9\n"
6487 "subps %%xmm5, %%xmm9\n"
6488 "addsubps %%xmm9, %%xmm8\n"
6489 "movaps %%xmm8, %%xmm5\n"
6490 "movaps %%xmm6, %%xmm8\n"
6491 "shufps $160, %%xmm8, %%xmm8\n"
6492 "shufps $245, %%xmm6, %%xmm6\n"
6493 "xorps %%xmm9, %%xmm9\n"
6494 "subps %%xmm6, %%xmm9\n"
6495 "addsubps %%xmm9, %%xmm8\n"
6496 "movaps %%xmm8, %%xmm6\n"
6497 "movaps %%xmm7, %%xmm8\n"
6498 "shufps $160, %%xmm8, %%xmm8\n"
6499 "shufps $245, %%xmm7, %%xmm7\n"
6500 "xorps %%xmm9, %%xmm9\n"
6501 "subps %%xmm7, %%xmm9\n"
6502 "addsubps %%xmm9, %%xmm8\n"
6503 "movaps %%xmm8, %%xmm7\n"
6504 "movaps %%xmm0, %%xmm8\n"
6505 "shufps $68, %%xmm8, %%xmm8\n"
6506 "xorps %%xmm9, %%xmm9\n"
6507 "movaps %%xmm0, %%xmm10\n"
6508 "shufps $14, %%xmm9, %%xmm10\n"
6509 "movaps %%xmm0, %%xmm11\n"
6510 "shufps $224, %%xmm11, %%xmm9\n"
6511 "addps %%xmm8, %%xmm10\n"
6512 "subps %%xmm9, %%xmm10\n"
6513 "movaps %%xmm10, %%xmm0\n"
6514 "movaps %%xmm1, %%xmm8\n"
6515 "shufps $68, %%xmm8, %%xmm8\n"
6516 "xorps %%xmm9, %%xmm9\n"
6517 "movaps %%xmm1, %%xmm10\n"
6518 "shufps $14, %%xmm9, %%xmm10\n"
6519 "movaps %%xmm1, %%xmm11\n"
6520 "shufps $224, %%xmm11, %%xmm9\n"
6521 "addps %%xmm8, %%xmm10\n"
6522 "subps %%xmm9, %%xmm10\n"
6523 "movaps %%xmm10, %%xmm1\n"
6524 "movaps %%xmm2, %%xmm8\n"
6525 "shufps $68, %%xmm8, %%xmm8\n"
6526 "xorps %%xmm9, %%xmm9\n"
6527 "movaps %%xmm2, %%xmm10\n"
6528 "shufps $14, %%xmm9, %%xmm10\n"
6529 "movaps %%xmm2, %%xmm11\n"
6530 "shufps $224, %%xmm11, %%xmm9\n"
6531 "addps %%xmm8, %%xmm10\n"
6532 "subps %%xmm9, %%xmm10\n"
6533 "movaps %%xmm10, %%xmm2\n"
6534 "movaps %%xmm3, %%xmm8\n"
6535 "shufps $68, %%xmm8, %%xmm8\n"
6536 "xorps %%xmm9, %%xmm9\n"
6537 "movaps %%xmm3, %%xmm10\n"
6538 "shufps $14, %%xmm9, %%xmm10\n"
6539 "movaps %%xmm3, %%xmm11\n"
6540 "shufps $224, %%xmm11, %%xmm9\n"
6541 "addps %%xmm8, %%xmm10\n"
6542 "subps %%xmm9, %%xmm10\n"
6543 "movaps %%xmm10, %%xmm3\n"
6544 "movaps %%xmm4, %%xmm8\n"
6545 "shufps $68, %%xmm8, %%xmm8\n"
6546 "xorps %%xmm9, %%xmm9\n"
6547 "movaps %%xmm4, %%xmm10\n"
6548 "shufps $14, %%xmm9, %%xmm10\n"
6549 "movaps %%xmm4, %%xmm11\n"
6550 "shufps $224, %%xmm11, %%xmm9\n"
6551 "addps %%xmm8, %%xmm10\n"
6552 "subps %%xmm9, %%xmm10\n"
6553 "movaps %%xmm10, %%xmm4\n"
6554 "movaps %%xmm5, %%xmm8\n"
6555 "shufps $68, %%xmm8, %%xmm8\n"
6556 "xorps %%xmm9, %%xmm9\n"
6557 "movaps %%xmm5, %%xmm10\n"
6558 "shufps $14, %%xmm9, %%xmm10\n"
6559 "movaps %%xmm5, %%xmm11\n"
6560 "shufps $224, %%xmm11, %%xmm9\n"
6561 "addps %%xmm8, %%xmm10\n"
6562 "subps %%xmm9, %%xmm10\n"
6563 "movaps %%xmm10, %%xmm5\n"
6564 "movaps %%xmm6, %%xmm8\n"
6565 "shufps $68, %%xmm8, %%xmm8\n"
6566 "xorps %%xmm9, %%xmm9\n"
6567 "movaps %%xmm6, %%xmm10\n"
6568 "shufps $14, %%xmm9, %%xmm10\n"
6569 "movaps %%xmm6, %%xmm11\n"
6570 "shufps $224, %%xmm11, %%xmm9\n"
6571 "addps %%xmm8, %%xmm10\n"
6572 "subps %%xmm9, %%xmm10\n"
6573 "movaps %%xmm10, %%xmm6\n"
6574 "movaps %%xmm7, %%xmm8\n"
6575 "shufps $68, %%xmm8, %%xmm8\n"
6576 "xorps %%xmm9, %%xmm9\n"
6577 "movaps %%xmm7, %%xmm10\n"
6578 "shufps $14, %%xmm9, %%xmm10\n"
6579 "movaps %%xmm7, %%xmm11\n"
6580 "shufps $224, %%xmm11, %%xmm9\n"
6581 "addps %%xmm8, %%xmm10\n"
6582 "subps %%xmm9, %%xmm10\n"
6583 "movaps %%xmm10, %%xmm7\n"
6584 "movaps %%xmm0, %%xmm8\n"
6585 "movaps %%xmm0, %%xmm9\n"
6586 "addps %%xmm1, %%xmm8\n"
6587 "subps %%xmm1, %%xmm9\n"
6588 "movaps %%xmm2, %%xmm10\n"
6589 "movaps %%xmm2, %%xmm11\n"
6590 "addps %%xmm3, %%xmm10\n"
6591 "subps %%xmm3, %%xmm11\n"
6592 "movaps %%xmm4, %%xmm12\n"
6593 "movaps %%xmm4, %%xmm13\n"
6594 "addps %%xmm5, %%xmm12\n"
6595 "subps %%xmm5, %%xmm13\n"
6596 "movaps %%xmm6, %%xmm14\n"
6597 "movaps %%xmm6, %%xmm15\n"
6598 "addps %%xmm7, %%xmm14\n"
6599 "subps %%xmm7, %%xmm15\n"
6600 "movaps %%xmm8, %%xmm0\n"
6601 "movaps %%xmm8, %%xmm2\n"
6602 "addps %%xmm10, %%xmm0\n"
6603 "subps %%xmm10, %%xmm2\n"
6604 "movaps %%xmm9, %%xmm1\n"
6605 "movaps %%xmm9, %%xmm3\n"
6606 "addps %%xmm11, %%xmm1\n"
6607 "subps %%xmm11, %%xmm3\n"
6608 "movaps %%xmm12, %%xmm4\n"
6609 "movaps %%xmm12, %%xmm6\n"
6610 "addps %%xmm14, %%xmm4\n"
6611 "subps %%xmm14, %%xmm6\n"
6612 "movaps %%xmm13, %%xmm5\n"
6613 "movaps %%xmm13, %%xmm7\n"
6614 "addps %%xmm15, %%xmm5\n"
6615 "subps %%xmm15, %%xmm7\n"
6616 "movaps %%xmm0, %%xmm8\n"
6617 "movaps %%xmm0, %%xmm12\n"
6618 "addps %%xmm4, %%xmm8\n"
6619 "subps %%xmm4, %%xmm12\n"
6620 "movaps %%xmm1, %%xmm9\n"
6621 "movaps %%xmm1, %%xmm13\n"
6622 "addps %%xmm5, %%xmm9\n"
6623 "subps %%xmm5, %%xmm13\n"
6624 "movaps %%xmm2, %%xmm10\n"
6625 "movaps %%xmm2, %%xmm14\n"
6626 "addps %%xmm6, %%xmm10\n"
6627 "subps %%xmm6, %%xmm14\n"
6628 "movaps %%xmm3, %%xmm11\n"
6629 "movaps %%xmm3, %%xmm15\n"
6630 "addps %%xmm7, %%xmm11\n"
6631 "subps %%xmm7, %%xmm15\n"
6632 "movups %%xmm8, (%0)\n"
6633 "movups %%xmm9, (%1)\n"
6634 "movups %%xmm10, (%2)\n"
6635 "movups %%xmm11, (%3)\n"
6636 "movups %%xmm12, (%4)\n"
6637 "movups %%xmm13, (%5)\n"
6638 "movups %%xmm14, (%6)\n"
6639 "movups %%xmm15, (%7)\n"
6640 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6641 );
6642 }
6643 }
6644 for (int j = 0; j < 8192; j += 256) {
6645 for (int k = 0; k < 32; k += 4) {
6646 __asm__ volatile (
6647 "movups (%0), %%xmm0\n"
6648 "movups (%1), %%xmm1\n"
6649 "movups (%2), %%xmm2\n"
6650 "movups (%3), %%xmm3\n"
6651 "movups (%4), %%xmm4\n"
6652 "movups (%5), %%xmm5\n"
6653 "movups (%6), %%xmm6\n"
6654 "movups (%7), %%xmm7\n"
6655 "movaps %%xmm0, %%xmm8\n"
6656 "movaps %%xmm0, %%xmm9\n"
6657 "addps %%xmm1, %%xmm8\n"
6658 "subps %%xmm1, %%xmm9\n"
6659 "movaps %%xmm2, %%xmm10\n"
6660 "movaps %%xmm2, %%xmm11\n"
6661 "addps %%xmm3, %%xmm10\n"
6662 "subps %%xmm3, %%xmm11\n"
6663 "movaps %%xmm4, %%xmm12\n"
6664 "movaps %%xmm4, %%xmm13\n"
6665 "addps %%xmm5, %%xmm12\n"
6666 "subps %%xmm5, %%xmm13\n"
6667 "movaps %%xmm6, %%xmm14\n"
6668 "movaps %%xmm6, %%xmm15\n"
6669 "addps %%xmm7, %%xmm14\n"
6670 "subps %%xmm7, %%xmm15\n"
6671 "movaps %%xmm8, %%xmm0\n"
6672 "movaps %%xmm8, %%xmm2\n"
6673 "addps %%xmm10, %%xmm0\n"
6674 "subps %%xmm10, %%xmm2\n"
6675 "movaps %%xmm9, %%xmm1\n"
6676 "movaps %%xmm9, %%xmm3\n"
6677 "addps %%xmm11, %%xmm1\n"
6678 "subps %%xmm11, %%xmm3\n"
6679 "movaps %%xmm12, %%xmm4\n"
6680 "movaps %%xmm12, %%xmm6\n"
6681 "addps %%xmm14, %%xmm4\n"
6682 "subps %%xmm14, %%xmm6\n"
6683 "movaps %%xmm13, %%xmm5\n"
6684 "movaps %%xmm13, %%xmm7\n"
6685 "addps %%xmm15, %%xmm5\n"
6686 "subps %%xmm15, %%xmm7\n"
6687 "movaps %%xmm0, %%xmm8\n"
6688 "movaps %%xmm0, %%xmm12\n"
6689 "addps %%xmm4, %%xmm8\n"
6690 "subps %%xmm4, %%xmm12\n"
6691 "movaps %%xmm1, %%xmm9\n"
6692 "movaps %%xmm1, %%xmm13\n"
6693 "addps %%xmm5, %%xmm9\n"
6694 "subps %%xmm5, %%xmm13\n"
6695 "movaps %%xmm2, %%xmm10\n"
6696 "movaps %%xmm2, %%xmm14\n"
6697 "addps %%xmm6, %%xmm10\n"
6698 "subps %%xmm6, %%xmm14\n"
6699 "movaps %%xmm3, %%xmm11\n"
6700 "movaps %%xmm3, %%xmm15\n"
6701 "addps %%xmm7, %%xmm11\n"
6702 "subps %%xmm7, %%xmm15\n"
6703 "movups %%xmm8, (%0)\n"
6704 "movups %%xmm9, (%1)\n"
6705 "movups %%xmm10, (%2)\n"
6706 "movups %%xmm11, (%3)\n"
6707 "movups %%xmm12, (%4)\n"
6708 "movups %%xmm13, (%5)\n"
6709 "movups %%xmm14, (%6)\n"
6710 "movups %%xmm15, (%7)\n"
6711 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6712 );
6713 }
6714 }
6715 for (int j = 0; j < 8192; j += 2048) {
6716 for (int k = 0; k < 256; k += 4) {
6717 __asm__ volatile (
6718 "movups (%0), %%xmm0\n"
6719 "movups (%1), %%xmm1\n"
6720 "movups (%2), %%xmm2\n"
6721 "movups (%3), %%xmm3\n"
6722 "movups (%4), %%xmm4\n"
6723 "movups (%5), %%xmm5\n"
6724 "movups (%6), %%xmm6\n"
6725 "movups (%7), %%xmm7\n"
6726 "movaps %%xmm0, %%xmm8\n"
6727 "movaps %%xmm0, %%xmm9\n"
6728 "addps %%xmm1, %%xmm8\n"
6729 "subps %%xmm1, %%xmm9\n"
6730 "movaps %%xmm2, %%xmm10\n"
6731 "movaps %%xmm2, %%xmm11\n"
6732 "addps %%xmm3, %%xmm10\n"
6733 "subps %%xmm3, %%xmm11\n"
6734 "movaps %%xmm4, %%xmm12\n"
6735 "movaps %%xmm4, %%xmm13\n"
6736 "addps %%xmm5, %%xmm12\n"
6737 "subps %%xmm5, %%xmm13\n"
6738 "movaps %%xmm6, %%xmm14\n"
6739 "movaps %%xmm6, %%xmm15\n"
6740 "addps %%xmm7, %%xmm14\n"
6741 "subps %%xmm7, %%xmm15\n"
6742 "movaps %%xmm8, %%xmm0\n"
6743 "movaps %%xmm8, %%xmm2\n"
6744 "addps %%xmm10, %%xmm0\n"
6745 "subps %%xmm10, %%xmm2\n"
6746 "movaps %%xmm9, %%xmm1\n"
6747 "movaps %%xmm9, %%xmm3\n"
6748 "addps %%xmm11, %%xmm1\n"
6749 "subps %%xmm11, %%xmm3\n"
6750 "movaps %%xmm12, %%xmm4\n"
6751 "movaps %%xmm12, %%xmm6\n"
6752 "addps %%xmm14, %%xmm4\n"
6753 "subps %%xmm14, %%xmm6\n"
6754 "movaps %%xmm13, %%xmm5\n"
6755 "movaps %%xmm13, %%xmm7\n"
6756 "addps %%xmm15, %%xmm5\n"
6757 "subps %%xmm15, %%xmm7\n"
6758 "movaps %%xmm0, %%xmm8\n"
6759 "movaps %%xmm0, %%xmm12\n"
6760 "addps %%xmm4, %%xmm8\n"
6761 "subps %%xmm4, %%xmm12\n"
6762 "movaps %%xmm1, %%xmm9\n"
6763 "movaps %%xmm1, %%xmm13\n"
6764 "addps %%xmm5, %%xmm9\n"
6765 "subps %%xmm5, %%xmm13\n"
6766 "movaps %%xmm2, %%xmm10\n"
6767 "movaps %%xmm2, %%xmm14\n"
6768 "addps %%xmm6, %%xmm10\n"
6769 "subps %%xmm6, %%xmm14\n"
6770 "movaps %%xmm3, %%xmm11\n"
6771 "movaps %%xmm3, %%xmm15\n"
6772 "addps %%xmm7, %%xmm11\n"
6773 "subps %%xmm7, %%xmm15\n"
6774 "movups %%xmm8, (%0)\n"
6775 "movups %%xmm9, (%1)\n"
6776 "movups %%xmm10, (%2)\n"
6777 "movups %%xmm11, (%3)\n"
6778 "movups %%xmm12, (%4)\n"
6779 "movups %%xmm13, (%5)\n"
6780 "movups %%xmm14, (%6)\n"
6781 "movups %%xmm15, (%7)\n"
6782 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6783 );
6784 }
6785 }
6786 for (int j = 0; j < 8192; j += 8192) {
6787 for (int k = 0; k < 2048; k += 4) {
6788 __asm__ volatile (
6789 "movups (%0), %%xmm0\n"
6790 "movups (%1), %%xmm1\n"
6791 "movups (%2), %%xmm2\n"
6792 "movups (%3), %%xmm3\n"
6793 "movaps %%xmm0, %%xmm8\n"
6794 "movaps %%xmm0, %%xmm9\n"
6795 "addps %%xmm1, %%xmm8\n"
6796 "subps %%xmm1, %%xmm9\n"
6797 "movaps %%xmm2, %%xmm10\n"
6798 "movaps %%xmm2, %%xmm11\n"
6799 "addps %%xmm3, %%xmm10\n"
6800 "subps %%xmm3, %%xmm11\n"
6801 "movaps %%xmm8, %%xmm0\n"
6802 "movaps %%xmm8, %%xmm2\n"
6803 "addps %%xmm10, %%xmm0\n"
6804 "subps %%xmm10, %%xmm2\n"
6805 "movaps %%xmm9, %%xmm1\n"
6806 "movaps %%xmm9, %%xmm3\n"
6807 "addps %%xmm11, %%xmm1\n"
6808 "subps %%xmm11, %%xmm3\n"
6809 "movups %%xmm0, (%0)\n"
6810 "movups %%xmm1, (%1)\n"
6811 "movups %%xmm2, (%2)\n"
6812 "movups %%xmm3, (%3)\n"
6813 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6814 );
6815 }
6816 }
6817 return;
6818 }
6819 if (depth == 16) {
6820 helper_float_21_recursive(buf + 0, 13);
6821 helper_float_21_recursive(buf + 8192, 13);
6822 helper_float_21_recursive(buf + 16384, 13);
6823 helper_float_21_recursive(buf + 24576, 13);
6824 helper_float_21_recursive(buf + 32768, 13);
6825 helper_float_21_recursive(buf + 40960, 13);
6826 helper_float_21_recursive(buf + 49152, 13);
6827 helper_float_21_recursive(buf + 57344, 13);
6828 for (int j = 0; j < 65536; j += 65536) {
6829 for (int k = 0; k < 8192; k += 4) {
6830 __asm__ volatile (
6831 "movups (%0), %%xmm0\n"
6832 "movups (%1), %%xmm1\n"
6833 "movups (%2), %%xmm2\n"
6834 "movups (%3), %%xmm3\n"
6835 "movups (%4), %%xmm4\n"
6836 "movups (%5), %%xmm5\n"
6837 "movups (%6), %%xmm6\n"
6838 "movups (%7), %%xmm7\n"
6839 "movaps %%xmm0, %%xmm8\n"
6840 "movaps %%xmm0, %%xmm9\n"
6841 "addps %%xmm1, %%xmm8\n"
6842 "subps %%xmm1, %%xmm9\n"
6843 "movaps %%xmm2, %%xmm10\n"
6844 "movaps %%xmm2, %%xmm11\n"
6845 "addps %%xmm3, %%xmm10\n"
6846 "subps %%xmm3, %%xmm11\n"
6847 "movaps %%xmm4, %%xmm12\n"
6848 "movaps %%xmm4, %%xmm13\n"
6849 "addps %%xmm5, %%xmm12\n"
6850 "subps %%xmm5, %%xmm13\n"
6851 "movaps %%xmm6, %%xmm14\n"
6852 "movaps %%xmm6, %%xmm15\n"
6853 "addps %%xmm7, %%xmm14\n"
6854 "subps %%xmm7, %%xmm15\n"
6855 "movaps %%xmm8, %%xmm0\n"
6856 "movaps %%xmm8, %%xmm2\n"
6857 "addps %%xmm10, %%xmm0\n"
6858 "subps %%xmm10, %%xmm2\n"
6859 "movaps %%xmm9, %%xmm1\n"
6860 "movaps %%xmm9, %%xmm3\n"
6861 "addps %%xmm11, %%xmm1\n"
6862 "subps %%xmm11, %%xmm3\n"
6863 "movaps %%xmm12, %%xmm4\n"
6864 "movaps %%xmm12, %%xmm6\n"
6865 "addps %%xmm14, %%xmm4\n"
6866 "subps %%xmm14, %%xmm6\n"
6867 "movaps %%xmm13, %%xmm5\n"
6868 "movaps %%xmm13, %%xmm7\n"
6869 "addps %%xmm15, %%xmm5\n"
6870 "subps %%xmm15, %%xmm7\n"
6871 "movaps %%xmm0, %%xmm8\n"
6872 "movaps %%xmm0, %%xmm12\n"
6873 "addps %%xmm4, %%xmm8\n"
6874 "subps %%xmm4, %%xmm12\n"
6875 "movaps %%xmm1, %%xmm9\n"
6876 "movaps %%xmm1, %%xmm13\n"
6877 "addps %%xmm5, %%xmm9\n"
6878 "subps %%xmm5, %%xmm13\n"
6879 "movaps %%xmm2, %%xmm10\n"
6880 "movaps %%xmm2, %%xmm14\n"
6881 "addps %%xmm6, %%xmm10\n"
6882 "subps %%xmm6, %%xmm14\n"
6883 "movaps %%xmm3, %%xmm11\n"
6884 "movaps %%xmm3, %%xmm15\n"
6885 "addps %%xmm7, %%xmm11\n"
6886 "subps %%xmm7, %%xmm15\n"
6887 "movups %%xmm8, (%0)\n"
6888 "movups %%xmm9, (%1)\n"
6889 "movups %%xmm10, (%2)\n"
6890 "movups %%xmm11, (%3)\n"
6891 "movups %%xmm12, (%4)\n"
6892 "movups %%xmm13, (%5)\n"
6893 "movups %%xmm14, (%6)\n"
6894 "movups %%xmm15, (%7)\n"
6895 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6896 );
6897 }
6898 }
6899 return;
6900 }
6901 if (depth == 19) {
6902 helper_float_21_recursive(buf + 0, 16);
6903 helper_float_21_recursive(buf + 65536, 16);
6904 helper_float_21_recursive(buf + 131072, 16);
6905 helper_float_21_recursive(buf + 196608, 16);
6906 helper_float_21_recursive(buf + 262144, 16);
6907 helper_float_21_recursive(buf + 327680, 16);
6908 helper_float_21_recursive(buf + 393216, 16);
6909 helper_float_21_recursive(buf + 458752, 16);
6910 for (int j = 0; j < 524288; j += 524288) {
6911 for (int k = 0; k < 65536; k += 4) {
6912 __asm__ volatile (
6913 "movups (%0), %%xmm0\n"
6914 "movups (%1), %%xmm1\n"
6915 "movups (%2), %%xmm2\n"
6916 "movups (%3), %%xmm3\n"
6917 "movups (%4), %%xmm4\n"
6918 "movups (%5), %%xmm5\n"
6919 "movups (%6), %%xmm6\n"
6920 "movups (%7), %%xmm7\n"
6921 "movaps %%xmm0, %%xmm8\n"
6922 "movaps %%xmm0, %%xmm9\n"
6923 "addps %%xmm1, %%xmm8\n"
6924 "subps %%xmm1, %%xmm9\n"
6925 "movaps %%xmm2, %%xmm10\n"
6926 "movaps %%xmm2, %%xmm11\n"
6927 "addps %%xmm3, %%xmm10\n"
6928 "subps %%xmm3, %%xmm11\n"
6929 "movaps %%xmm4, %%xmm12\n"
6930 "movaps %%xmm4, %%xmm13\n"
6931 "addps %%xmm5, %%xmm12\n"
6932 "subps %%xmm5, %%xmm13\n"
6933 "movaps %%xmm6, %%xmm14\n"
6934 "movaps %%xmm6, %%xmm15\n"
6935 "addps %%xmm7, %%xmm14\n"
6936 "subps %%xmm7, %%xmm15\n"
6937 "movaps %%xmm8, %%xmm0\n"
6938 "movaps %%xmm8, %%xmm2\n"
6939 "addps %%xmm10, %%xmm0\n"
6940 "subps %%xmm10, %%xmm2\n"
6941 "movaps %%xmm9, %%xmm1\n"
6942 "movaps %%xmm9, %%xmm3\n"
6943 "addps %%xmm11, %%xmm1\n"
6944 "subps %%xmm11, %%xmm3\n"
6945 "movaps %%xmm12, %%xmm4\n"
6946 "movaps %%xmm12, %%xmm6\n"
6947 "addps %%xmm14, %%xmm4\n"
6948 "subps %%xmm14, %%xmm6\n"
6949 "movaps %%xmm13, %%xmm5\n"
6950 "movaps %%xmm13, %%xmm7\n"
6951 "addps %%xmm15, %%xmm5\n"
6952 "subps %%xmm15, %%xmm7\n"
6953 "movaps %%xmm0, %%xmm8\n"
6954 "movaps %%xmm0, %%xmm12\n"
6955 "addps %%xmm4, %%xmm8\n"
6956 "subps %%xmm4, %%xmm12\n"
6957 "movaps %%xmm1, %%xmm9\n"
6958 "movaps %%xmm1, %%xmm13\n"
6959 "addps %%xmm5, %%xmm9\n"
6960 "subps %%xmm5, %%xmm13\n"
6961 "movaps %%xmm2, %%xmm10\n"
6962 "movaps %%xmm2, %%xmm14\n"
6963 "addps %%xmm6, %%xmm10\n"
6964 "subps %%xmm6, %%xmm14\n"
6965 "movaps %%xmm3, %%xmm11\n"
6966 "movaps %%xmm3, %%xmm15\n"
6967 "addps %%xmm7, %%xmm11\n"
6968 "subps %%xmm7, %%xmm15\n"
6969 "movups %%xmm8, (%0)\n"
6970 "movups %%xmm9, (%1)\n"
6971 "movups %%xmm10, (%2)\n"
6972 "movups %%xmm11, (%3)\n"
6973 "movups %%xmm12, (%4)\n"
6974 "movups %%xmm13, (%5)\n"
6975 "movups %%xmm14, (%6)\n"
6976 "movups %%xmm15, (%7)\n"
6977 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
6978 );
6979 }
6980 }
6981 return;
6982 }
6983 if (depth == 21) {
6984 helper_float_21_recursive(buf + 0, 19);
6985 helper_float_21_recursive(buf + 524288, 19);
6986 helper_float_21_recursive(buf + 1048576, 19);
6987 helper_float_21_recursive(buf + 1572864, 19);
6988 for (int j = 0; j < 2097152; j += 2097152) {
6989 for (int k = 0; k < 524288; k += 4) {
6990 __asm__ volatile (
6991 "movups (%0), %%xmm0\n"
6992 "movups (%1), %%xmm1\n"
6993 "movups (%2), %%xmm2\n"
6994 "movups (%3), %%xmm3\n"
6995 "movaps %%xmm0, %%xmm8\n"
6996 "movaps %%xmm0, %%xmm9\n"
6997 "addps %%xmm1, %%xmm8\n"
6998 "subps %%xmm1, %%xmm9\n"
6999 "movaps %%xmm2, %%xmm10\n"
7000 "movaps %%xmm2, %%xmm11\n"
7001 "addps %%xmm3, %%xmm10\n"
7002 "subps %%xmm3, %%xmm11\n"
7003 "movaps %%xmm8, %%xmm0\n"
7004 "movaps %%xmm8, %%xmm2\n"
7005 "addps %%xmm10, %%xmm0\n"
7006 "subps %%xmm10, %%xmm2\n"
7007 "movaps %%xmm9, %%xmm1\n"
7008 "movaps %%xmm9, %%xmm3\n"
7009 "addps %%xmm11, %%xmm1\n"
7010 "subps %%xmm11, %%xmm3\n"
7011 "movups %%xmm0, (%0)\n"
7012 "movups %%xmm1, (%1)\n"
7013 "movups %%xmm2, (%2)\n"
7014 "movups %%xmm3, (%3)\n"
7015 :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7016 );
7017 }
7018 }
7019 return;
7020 }
7021 }
/*
 * In-place transform of a 2^21 = 2,097,152-element float buffer.
 *
 * Entry point wrapping helper_float_21_recursive at its maximum depth (21);
 * the recursive helper dispatches on depth to SSE butterfly kernels that
 * combine buf[i] +/- buf[j] pairs across progressively larger strides.
 *
 * buf: pointer to 2,097,152 floats, overwritten with the transform result.
 *      Caller must provide the full 2^21-element array.
 *
 * NOTE(review): the add/sub butterfly structure and the fht.h include
 * suggest this is an un-normalized fast Hadamard transform (outputs scaled
 * by 2^21 relative to the orthonormal form) -- confirm against fht.h.
 */
void helper_float_21(float *buf);
void helper_float_21(float *buf) {
    /* Run the full-depth recursion over the entire buffer. */
    helper_float_21_recursive(buf, 21);
}
7026 void helper_float_22_recursive(float *buf, int depth);
helper_float_22_recursive(float * buf,int depth)7027 void helper_float_22_recursive(float *buf, int depth) {
7028 if (depth == 11) {
7029 for (int j = 0; j < 2048; j += 32) {
7030 for (int k = 0; k < 4; k += 4) {
7031 __asm__ volatile (
7032 "movups (%0), %%xmm0\n"
7033 "movups (%1), %%xmm1\n"
7034 "movups (%2), %%xmm2\n"
7035 "movups (%3), %%xmm3\n"
7036 "movups (%4), %%xmm4\n"
7037 "movups (%5), %%xmm5\n"
7038 "movups (%6), %%xmm6\n"
7039 "movups (%7), %%xmm7\n"
7040 "movaps %%xmm0, %%xmm8\n"
7041 "shufps $160, %%xmm8, %%xmm8\n"
7042 "shufps $245, %%xmm0, %%xmm0\n"
7043 "xorps %%xmm9, %%xmm9\n"
7044 "subps %%xmm0, %%xmm9\n"
7045 "addsubps %%xmm9, %%xmm8\n"
7046 "movaps %%xmm8, %%xmm0\n"
7047 "movaps %%xmm1, %%xmm8\n"
7048 "shufps $160, %%xmm8, %%xmm8\n"
7049 "shufps $245, %%xmm1, %%xmm1\n"
7050 "xorps %%xmm9, %%xmm9\n"
7051 "subps %%xmm1, %%xmm9\n"
7052 "addsubps %%xmm9, %%xmm8\n"
7053 "movaps %%xmm8, %%xmm1\n"
7054 "movaps %%xmm2, %%xmm8\n"
7055 "shufps $160, %%xmm8, %%xmm8\n"
7056 "shufps $245, %%xmm2, %%xmm2\n"
7057 "xorps %%xmm9, %%xmm9\n"
7058 "subps %%xmm2, %%xmm9\n"
7059 "addsubps %%xmm9, %%xmm8\n"
7060 "movaps %%xmm8, %%xmm2\n"
7061 "movaps %%xmm3, %%xmm8\n"
7062 "shufps $160, %%xmm8, %%xmm8\n"
7063 "shufps $245, %%xmm3, %%xmm3\n"
7064 "xorps %%xmm9, %%xmm9\n"
7065 "subps %%xmm3, %%xmm9\n"
7066 "addsubps %%xmm9, %%xmm8\n"
7067 "movaps %%xmm8, %%xmm3\n"
7068 "movaps %%xmm4, %%xmm8\n"
7069 "shufps $160, %%xmm8, %%xmm8\n"
7070 "shufps $245, %%xmm4, %%xmm4\n"
7071 "xorps %%xmm9, %%xmm9\n"
7072 "subps %%xmm4, %%xmm9\n"
7073 "addsubps %%xmm9, %%xmm8\n"
7074 "movaps %%xmm8, %%xmm4\n"
7075 "movaps %%xmm5, %%xmm8\n"
7076 "shufps $160, %%xmm8, %%xmm8\n"
7077 "shufps $245, %%xmm5, %%xmm5\n"
7078 "xorps %%xmm9, %%xmm9\n"
7079 "subps %%xmm5, %%xmm9\n"
7080 "addsubps %%xmm9, %%xmm8\n"
7081 "movaps %%xmm8, %%xmm5\n"
7082 "movaps %%xmm6, %%xmm8\n"
7083 "shufps $160, %%xmm8, %%xmm8\n"
7084 "shufps $245, %%xmm6, %%xmm6\n"
7085 "xorps %%xmm9, %%xmm9\n"
7086 "subps %%xmm6, %%xmm9\n"
7087 "addsubps %%xmm9, %%xmm8\n"
7088 "movaps %%xmm8, %%xmm6\n"
7089 "movaps %%xmm7, %%xmm8\n"
7090 "shufps $160, %%xmm8, %%xmm8\n"
7091 "shufps $245, %%xmm7, %%xmm7\n"
7092 "xorps %%xmm9, %%xmm9\n"
7093 "subps %%xmm7, %%xmm9\n"
7094 "addsubps %%xmm9, %%xmm8\n"
7095 "movaps %%xmm8, %%xmm7\n"
7096 "movaps %%xmm0, %%xmm8\n"
7097 "shufps $68, %%xmm8, %%xmm8\n"
7098 "xorps %%xmm9, %%xmm9\n"
7099 "movaps %%xmm0, %%xmm10\n"
7100 "shufps $14, %%xmm9, %%xmm10\n"
7101 "movaps %%xmm0, %%xmm11\n"
7102 "shufps $224, %%xmm11, %%xmm9\n"
7103 "addps %%xmm8, %%xmm10\n"
7104 "subps %%xmm9, %%xmm10\n"
7105 "movaps %%xmm10, %%xmm0\n"
7106 "movaps %%xmm1, %%xmm8\n"
7107 "shufps $68, %%xmm8, %%xmm8\n"
7108 "xorps %%xmm9, %%xmm9\n"
7109 "movaps %%xmm1, %%xmm10\n"
7110 "shufps $14, %%xmm9, %%xmm10\n"
7111 "movaps %%xmm1, %%xmm11\n"
7112 "shufps $224, %%xmm11, %%xmm9\n"
7113 "addps %%xmm8, %%xmm10\n"
7114 "subps %%xmm9, %%xmm10\n"
7115 "movaps %%xmm10, %%xmm1\n"
7116 "movaps %%xmm2, %%xmm8\n"
7117 "shufps $68, %%xmm8, %%xmm8\n"
7118 "xorps %%xmm9, %%xmm9\n"
7119 "movaps %%xmm2, %%xmm10\n"
7120 "shufps $14, %%xmm9, %%xmm10\n"
7121 "movaps %%xmm2, %%xmm11\n"
7122 "shufps $224, %%xmm11, %%xmm9\n"
7123 "addps %%xmm8, %%xmm10\n"
7124 "subps %%xmm9, %%xmm10\n"
7125 "movaps %%xmm10, %%xmm2\n"
7126 "movaps %%xmm3, %%xmm8\n"
7127 "shufps $68, %%xmm8, %%xmm8\n"
7128 "xorps %%xmm9, %%xmm9\n"
7129 "movaps %%xmm3, %%xmm10\n"
7130 "shufps $14, %%xmm9, %%xmm10\n"
7131 "movaps %%xmm3, %%xmm11\n"
7132 "shufps $224, %%xmm11, %%xmm9\n"
7133 "addps %%xmm8, %%xmm10\n"
7134 "subps %%xmm9, %%xmm10\n"
7135 "movaps %%xmm10, %%xmm3\n"
7136 "movaps %%xmm4, %%xmm8\n"
7137 "shufps $68, %%xmm8, %%xmm8\n"
7138 "xorps %%xmm9, %%xmm9\n"
7139 "movaps %%xmm4, %%xmm10\n"
7140 "shufps $14, %%xmm9, %%xmm10\n"
7141 "movaps %%xmm4, %%xmm11\n"
7142 "shufps $224, %%xmm11, %%xmm9\n"
7143 "addps %%xmm8, %%xmm10\n"
7144 "subps %%xmm9, %%xmm10\n"
7145 "movaps %%xmm10, %%xmm4\n"
7146 "movaps %%xmm5, %%xmm8\n"
7147 "shufps $68, %%xmm8, %%xmm8\n"
7148 "xorps %%xmm9, %%xmm9\n"
7149 "movaps %%xmm5, %%xmm10\n"
7150 "shufps $14, %%xmm9, %%xmm10\n"
7151 "movaps %%xmm5, %%xmm11\n"
7152 "shufps $224, %%xmm11, %%xmm9\n"
7153 "addps %%xmm8, %%xmm10\n"
7154 "subps %%xmm9, %%xmm10\n"
7155 "movaps %%xmm10, %%xmm5\n"
7156 "movaps %%xmm6, %%xmm8\n"
7157 "shufps $68, %%xmm8, %%xmm8\n"
7158 "xorps %%xmm9, %%xmm9\n"
7159 "movaps %%xmm6, %%xmm10\n"
7160 "shufps $14, %%xmm9, %%xmm10\n"
7161 "movaps %%xmm6, %%xmm11\n"
7162 "shufps $224, %%xmm11, %%xmm9\n"
7163 "addps %%xmm8, %%xmm10\n"
7164 "subps %%xmm9, %%xmm10\n"
7165 "movaps %%xmm10, %%xmm6\n"
7166 "movaps %%xmm7, %%xmm8\n"
7167 "shufps $68, %%xmm8, %%xmm8\n"
7168 "xorps %%xmm9, %%xmm9\n"
7169 "movaps %%xmm7, %%xmm10\n"
7170 "shufps $14, %%xmm9, %%xmm10\n"
7171 "movaps %%xmm7, %%xmm11\n"
7172 "shufps $224, %%xmm11, %%xmm9\n"
7173 "addps %%xmm8, %%xmm10\n"
7174 "subps %%xmm9, %%xmm10\n"
7175 "movaps %%xmm10, %%xmm7\n"
7176 "movaps %%xmm0, %%xmm8\n"
7177 "movaps %%xmm0, %%xmm9\n"
7178 "addps %%xmm1, %%xmm8\n"
7179 "subps %%xmm1, %%xmm9\n"
7180 "movaps %%xmm2, %%xmm10\n"
7181 "movaps %%xmm2, %%xmm11\n"
7182 "addps %%xmm3, %%xmm10\n"
7183 "subps %%xmm3, %%xmm11\n"
7184 "movaps %%xmm4, %%xmm12\n"
7185 "movaps %%xmm4, %%xmm13\n"
7186 "addps %%xmm5, %%xmm12\n"
7187 "subps %%xmm5, %%xmm13\n"
7188 "movaps %%xmm6, %%xmm14\n"
7189 "movaps %%xmm6, %%xmm15\n"
7190 "addps %%xmm7, %%xmm14\n"
7191 "subps %%xmm7, %%xmm15\n"
7192 "movaps %%xmm8, %%xmm0\n"
7193 "movaps %%xmm8, %%xmm2\n"
7194 "addps %%xmm10, %%xmm0\n"
7195 "subps %%xmm10, %%xmm2\n"
7196 "movaps %%xmm9, %%xmm1\n"
7197 "movaps %%xmm9, %%xmm3\n"
7198 "addps %%xmm11, %%xmm1\n"
7199 "subps %%xmm11, %%xmm3\n"
7200 "movaps %%xmm12, %%xmm4\n"
7201 "movaps %%xmm12, %%xmm6\n"
7202 "addps %%xmm14, %%xmm4\n"
7203 "subps %%xmm14, %%xmm6\n"
7204 "movaps %%xmm13, %%xmm5\n"
7205 "movaps %%xmm13, %%xmm7\n"
7206 "addps %%xmm15, %%xmm5\n"
7207 "subps %%xmm15, %%xmm7\n"
7208 "movaps %%xmm0, %%xmm8\n"
7209 "movaps %%xmm0, %%xmm12\n"
7210 "addps %%xmm4, %%xmm8\n"
7211 "subps %%xmm4, %%xmm12\n"
7212 "movaps %%xmm1, %%xmm9\n"
7213 "movaps %%xmm1, %%xmm13\n"
7214 "addps %%xmm5, %%xmm9\n"
7215 "subps %%xmm5, %%xmm13\n"
7216 "movaps %%xmm2, %%xmm10\n"
7217 "movaps %%xmm2, %%xmm14\n"
7218 "addps %%xmm6, %%xmm10\n"
7219 "subps %%xmm6, %%xmm14\n"
7220 "movaps %%xmm3, %%xmm11\n"
7221 "movaps %%xmm3, %%xmm15\n"
7222 "addps %%xmm7, %%xmm11\n"
7223 "subps %%xmm7, %%xmm15\n"
7224 "movups %%xmm8, (%0)\n"
7225 "movups %%xmm9, (%1)\n"
7226 "movups %%xmm10, (%2)\n"
7227 "movups %%xmm11, (%3)\n"
7228 "movups %%xmm12, (%4)\n"
7229 "movups %%xmm13, (%5)\n"
7230 "movups %%xmm14, (%6)\n"
7231 "movups %%xmm15, (%7)\n"
7232 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7233 );
7234 }
7235 }
7236 for (int j = 0; j < 2048; j += 256) {
7237 for (int k = 0; k < 32; k += 4) {
7238 __asm__ volatile (
7239 "movups (%0), %%xmm0\n"
7240 "movups (%1), %%xmm1\n"
7241 "movups (%2), %%xmm2\n"
7242 "movups (%3), %%xmm3\n"
7243 "movups (%4), %%xmm4\n"
7244 "movups (%5), %%xmm5\n"
7245 "movups (%6), %%xmm6\n"
7246 "movups (%7), %%xmm7\n"
7247 "movaps %%xmm0, %%xmm8\n"
7248 "movaps %%xmm0, %%xmm9\n"
7249 "addps %%xmm1, %%xmm8\n"
7250 "subps %%xmm1, %%xmm9\n"
7251 "movaps %%xmm2, %%xmm10\n"
7252 "movaps %%xmm2, %%xmm11\n"
7253 "addps %%xmm3, %%xmm10\n"
7254 "subps %%xmm3, %%xmm11\n"
7255 "movaps %%xmm4, %%xmm12\n"
7256 "movaps %%xmm4, %%xmm13\n"
7257 "addps %%xmm5, %%xmm12\n"
7258 "subps %%xmm5, %%xmm13\n"
7259 "movaps %%xmm6, %%xmm14\n"
7260 "movaps %%xmm6, %%xmm15\n"
7261 "addps %%xmm7, %%xmm14\n"
7262 "subps %%xmm7, %%xmm15\n"
7263 "movaps %%xmm8, %%xmm0\n"
7264 "movaps %%xmm8, %%xmm2\n"
7265 "addps %%xmm10, %%xmm0\n"
7266 "subps %%xmm10, %%xmm2\n"
7267 "movaps %%xmm9, %%xmm1\n"
7268 "movaps %%xmm9, %%xmm3\n"
7269 "addps %%xmm11, %%xmm1\n"
7270 "subps %%xmm11, %%xmm3\n"
7271 "movaps %%xmm12, %%xmm4\n"
7272 "movaps %%xmm12, %%xmm6\n"
7273 "addps %%xmm14, %%xmm4\n"
7274 "subps %%xmm14, %%xmm6\n"
7275 "movaps %%xmm13, %%xmm5\n"
7276 "movaps %%xmm13, %%xmm7\n"
7277 "addps %%xmm15, %%xmm5\n"
7278 "subps %%xmm15, %%xmm7\n"
7279 "movaps %%xmm0, %%xmm8\n"
7280 "movaps %%xmm0, %%xmm12\n"
7281 "addps %%xmm4, %%xmm8\n"
7282 "subps %%xmm4, %%xmm12\n"
7283 "movaps %%xmm1, %%xmm9\n"
7284 "movaps %%xmm1, %%xmm13\n"
7285 "addps %%xmm5, %%xmm9\n"
7286 "subps %%xmm5, %%xmm13\n"
7287 "movaps %%xmm2, %%xmm10\n"
7288 "movaps %%xmm2, %%xmm14\n"
7289 "addps %%xmm6, %%xmm10\n"
7290 "subps %%xmm6, %%xmm14\n"
7291 "movaps %%xmm3, %%xmm11\n"
7292 "movaps %%xmm3, %%xmm15\n"
7293 "addps %%xmm7, %%xmm11\n"
7294 "subps %%xmm7, %%xmm15\n"
7295 "movups %%xmm8, (%0)\n"
7296 "movups %%xmm9, (%1)\n"
7297 "movups %%xmm10, (%2)\n"
7298 "movups %%xmm11, (%3)\n"
7299 "movups %%xmm12, (%4)\n"
7300 "movups %%xmm13, (%5)\n"
7301 "movups %%xmm14, (%6)\n"
7302 "movups %%xmm15, (%7)\n"
7303 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7304 );
7305 }
7306 }
7307 for (int j = 0; j < 2048; j += 2048) {
7308 for (int k = 0; k < 256; k += 4) {
7309 __asm__ volatile (
7310 "movups (%0), %%xmm0\n"
7311 "movups (%1), %%xmm1\n"
7312 "movups (%2), %%xmm2\n"
7313 "movups (%3), %%xmm3\n"
7314 "movups (%4), %%xmm4\n"
7315 "movups (%5), %%xmm5\n"
7316 "movups (%6), %%xmm6\n"
7317 "movups (%7), %%xmm7\n"
7318 "movaps %%xmm0, %%xmm8\n"
7319 "movaps %%xmm0, %%xmm9\n"
7320 "addps %%xmm1, %%xmm8\n"
7321 "subps %%xmm1, %%xmm9\n"
7322 "movaps %%xmm2, %%xmm10\n"
7323 "movaps %%xmm2, %%xmm11\n"
7324 "addps %%xmm3, %%xmm10\n"
7325 "subps %%xmm3, %%xmm11\n"
7326 "movaps %%xmm4, %%xmm12\n"
7327 "movaps %%xmm4, %%xmm13\n"
7328 "addps %%xmm5, %%xmm12\n"
7329 "subps %%xmm5, %%xmm13\n"
7330 "movaps %%xmm6, %%xmm14\n"
7331 "movaps %%xmm6, %%xmm15\n"
7332 "addps %%xmm7, %%xmm14\n"
7333 "subps %%xmm7, %%xmm15\n"
7334 "movaps %%xmm8, %%xmm0\n"
7335 "movaps %%xmm8, %%xmm2\n"
7336 "addps %%xmm10, %%xmm0\n"
7337 "subps %%xmm10, %%xmm2\n"
7338 "movaps %%xmm9, %%xmm1\n"
7339 "movaps %%xmm9, %%xmm3\n"
7340 "addps %%xmm11, %%xmm1\n"
7341 "subps %%xmm11, %%xmm3\n"
7342 "movaps %%xmm12, %%xmm4\n"
7343 "movaps %%xmm12, %%xmm6\n"
7344 "addps %%xmm14, %%xmm4\n"
7345 "subps %%xmm14, %%xmm6\n"
7346 "movaps %%xmm13, %%xmm5\n"
7347 "movaps %%xmm13, %%xmm7\n"
7348 "addps %%xmm15, %%xmm5\n"
7349 "subps %%xmm15, %%xmm7\n"
7350 "movaps %%xmm0, %%xmm8\n"
7351 "movaps %%xmm0, %%xmm12\n"
7352 "addps %%xmm4, %%xmm8\n"
7353 "subps %%xmm4, %%xmm12\n"
7354 "movaps %%xmm1, %%xmm9\n"
7355 "movaps %%xmm1, %%xmm13\n"
7356 "addps %%xmm5, %%xmm9\n"
7357 "subps %%xmm5, %%xmm13\n"
7358 "movaps %%xmm2, %%xmm10\n"
7359 "movaps %%xmm2, %%xmm14\n"
7360 "addps %%xmm6, %%xmm10\n"
7361 "subps %%xmm6, %%xmm14\n"
7362 "movaps %%xmm3, %%xmm11\n"
7363 "movaps %%xmm3, %%xmm15\n"
7364 "addps %%xmm7, %%xmm11\n"
7365 "subps %%xmm7, %%xmm15\n"
7366 "movups %%xmm8, (%0)\n"
7367 "movups %%xmm9, (%1)\n"
7368 "movups %%xmm10, (%2)\n"
7369 "movups %%xmm11, (%3)\n"
7370 "movups %%xmm12, (%4)\n"
7371 "movups %%xmm13, (%5)\n"
7372 "movups %%xmm14, (%6)\n"
7373 "movups %%xmm15, (%7)\n"
7374 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7375 );
7376 }
7377 }
7378 return;
7379 }
7380 if (depth == 14) {
7381 helper_float_22_recursive(buf + 0, 11);
7382 helper_float_22_recursive(buf + 2048, 11);
7383 helper_float_22_recursive(buf + 4096, 11);
7384 helper_float_22_recursive(buf + 6144, 11);
7385 helper_float_22_recursive(buf + 8192, 11);
7386 helper_float_22_recursive(buf + 10240, 11);
7387 helper_float_22_recursive(buf + 12288, 11);
7388 helper_float_22_recursive(buf + 14336, 11);
7389 for (int j = 0; j < 16384; j += 16384) {
7390 for (int k = 0; k < 2048; k += 4) {
7391 __asm__ volatile (
7392 "movups (%0), %%xmm0\n"
7393 "movups (%1), %%xmm1\n"
7394 "movups (%2), %%xmm2\n"
7395 "movups (%3), %%xmm3\n"
7396 "movups (%4), %%xmm4\n"
7397 "movups (%5), %%xmm5\n"
7398 "movups (%6), %%xmm6\n"
7399 "movups (%7), %%xmm7\n"
7400 "movaps %%xmm0, %%xmm8\n"
7401 "movaps %%xmm0, %%xmm9\n"
7402 "addps %%xmm1, %%xmm8\n"
7403 "subps %%xmm1, %%xmm9\n"
7404 "movaps %%xmm2, %%xmm10\n"
7405 "movaps %%xmm2, %%xmm11\n"
7406 "addps %%xmm3, %%xmm10\n"
7407 "subps %%xmm3, %%xmm11\n"
7408 "movaps %%xmm4, %%xmm12\n"
7409 "movaps %%xmm4, %%xmm13\n"
7410 "addps %%xmm5, %%xmm12\n"
7411 "subps %%xmm5, %%xmm13\n"
7412 "movaps %%xmm6, %%xmm14\n"
7413 "movaps %%xmm6, %%xmm15\n"
7414 "addps %%xmm7, %%xmm14\n"
7415 "subps %%xmm7, %%xmm15\n"
7416 "movaps %%xmm8, %%xmm0\n"
7417 "movaps %%xmm8, %%xmm2\n"
7418 "addps %%xmm10, %%xmm0\n"
7419 "subps %%xmm10, %%xmm2\n"
7420 "movaps %%xmm9, %%xmm1\n"
7421 "movaps %%xmm9, %%xmm3\n"
7422 "addps %%xmm11, %%xmm1\n"
7423 "subps %%xmm11, %%xmm3\n"
7424 "movaps %%xmm12, %%xmm4\n"
7425 "movaps %%xmm12, %%xmm6\n"
7426 "addps %%xmm14, %%xmm4\n"
7427 "subps %%xmm14, %%xmm6\n"
7428 "movaps %%xmm13, %%xmm5\n"
7429 "movaps %%xmm13, %%xmm7\n"
7430 "addps %%xmm15, %%xmm5\n"
7431 "subps %%xmm15, %%xmm7\n"
7432 "movaps %%xmm0, %%xmm8\n"
7433 "movaps %%xmm0, %%xmm12\n"
7434 "addps %%xmm4, %%xmm8\n"
7435 "subps %%xmm4, %%xmm12\n"
7436 "movaps %%xmm1, %%xmm9\n"
7437 "movaps %%xmm1, %%xmm13\n"
7438 "addps %%xmm5, %%xmm9\n"
7439 "subps %%xmm5, %%xmm13\n"
7440 "movaps %%xmm2, %%xmm10\n"
7441 "movaps %%xmm2, %%xmm14\n"
7442 "addps %%xmm6, %%xmm10\n"
7443 "subps %%xmm6, %%xmm14\n"
7444 "movaps %%xmm3, %%xmm11\n"
7445 "movaps %%xmm3, %%xmm15\n"
7446 "addps %%xmm7, %%xmm11\n"
7447 "subps %%xmm7, %%xmm15\n"
7448 "movups %%xmm8, (%0)\n"
7449 "movups %%xmm9, (%1)\n"
7450 "movups %%xmm10, (%2)\n"
7451 "movups %%xmm11, (%3)\n"
7452 "movups %%xmm12, (%4)\n"
7453 "movups %%xmm13, (%5)\n"
7454 "movups %%xmm14, (%6)\n"
7455 "movups %%xmm15, (%7)\n"
7456 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7457 );
7458 }
7459 }
7460 return;
7461 }
7462 if (depth == 17) {
7463 helper_float_22_recursive(buf + 0, 14);
7464 helper_float_22_recursive(buf + 16384, 14);
7465 helper_float_22_recursive(buf + 32768, 14);
7466 helper_float_22_recursive(buf + 49152, 14);
7467 helper_float_22_recursive(buf + 65536, 14);
7468 helper_float_22_recursive(buf + 81920, 14);
7469 helper_float_22_recursive(buf + 98304, 14);
7470 helper_float_22_recursive(buf + 114688, 14);
7471 for (int j = 0; j < 131072; j += 131072) {
7472 for (int k = 0; k < 16384; k += 4) {
7473 __asm__ volatile (
7474 "movups (%0), %%xmm0\n"
7475 "movups (%1), %%xmm1\n"
7476 "movups (%2), %%xmm2\n"
7477 "movups (%3), %%xmm3\n"
7478 "movups (%4), %%xmm4\n"
7479 "movups (%5), %%xmm5\n"
7480 "movups (%6), %%xmm6\n"
7481 "movups (%7), %%xmm7\n"
7482 "movaps %%xmm0, %%xmm8\n"
7483 "movaps %%xmm0, %%xmm9\n"
7484 "addps %%xmm1, %%xmm8\n"
7485 "subps %%xmm1, %%xmm9\n"
7486 "movaps %%xmm2, %%xmm10\n"
7487 "movaps %%xmm2, %%xmm11\n"
7488 "addps %%xmm3, %%xmm10\n"
7489 "subps %%xmm3, %%xmm11\n"
7490 "movaps %%xmm4, %%xmm12\n"
7491 "movaps %%xmm4, %%xmm13\n"
7492 "addps %%xmm5, %%xmm12\n"
7493 "subps %%xmm5, %%xmm13\n"
7494 "movaps %%xmm6, %%xmm14\n"
7495 "movaps %%xmm6, %%xmm15\n"
7496 "addps %%xmm7, %%xmm14\n"
7497 "subps %%xmm7, %%xmm15\n"
7498 "movaps %%xmm8, %%xmm0\n"
7499 "movaps %%xmm8, %%xmm2\n"
7500 "addps %%xmm10, %%xmm0\n"
7501 "subps %%xmm10, %%xmm2\n"
7502 "movaps %%xmm9, %%xmm1\n"
7503 "movaps %%xmm9, %%xmm3\n"
7504 "addps %%xmm11, %%xmm1\n"
7505 "subps %%xmm11, %%xmm3\n"
7506 "movaps %%xmm12, %%xmm4\n"
7507 "movaps %%xmm12, %%xmm6\n"
7508 "addps %%xmm14, %%xmm4\n"
7509 "subps %%xmm14, %%xmm6\n"
7510 "movaps %%xmm13, %%xmm5\n"
7511 "movaps %%xmm13, %%xmm7\n"
7512 "addps %%xmm15, %%xmm5\n"
7513 "subps %%xmm15, %%xmm7\n"
7514 "movaps %%xmm0, %%xmm8\n"
7515 "movaps %%xmm0, %%xmm12\n"
7516 "addps %%xmm4, %%xmm8\n"
7517 "subps %%xmm4, %%xmm12\n"
7518 "movaps %%xmm1, %%xmm9\n"
7519 "movaps %%xmm1, %%xmm13\n"
7520 "addps %%xmm5, %%xmm9\n"
7521 "subps %%xmm5, %%xmm13\n"
7522 "movaps %%xmm2, %%xmm10\n"
7523 "movaps %%xmm2, %%xmm14\n"
7524 "addps %%xmm6, %%xmm10\n"
7525 "subps %%xmm6, %%xmm14\n"
7526 "movaps %%xmm3, %%xmm11\n"
7527 "movaps %%xmm3, %%xmm15\n"
7528 "addps %%xmm7, %%xmm11\n"
7529 "subps %%xmm7, %%xmm15\n"
7530 "movups %%xmm8, (%0)\n"
7531 "movups %%xmm9, (%1)\n"
7532 "movups %%xmm10, (%2)\n"
7533 "movups %%xmm11, (%3)\n"
7534 "movups %%xmm12, (%4)\n"
7535 "movups %%xmm13, (%5)\n"
7536 "movups %%xmm14, (%6)\n"
7537 "movups %%xmm15, (%7)\n"
7538 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7539 );
7540 }
7541 }
7542 return;
7543 }
7544 if (depth == 20) {
7545 helper_float_22_recursive(buf + 0, 17);
7546 helper_float_22_recursive(buf + 131072, 17);
7547 helper_float_22_recursive(buf + 262144, 17);
7548 helper_float_22_recursive(buf + 393216, 17);
7549 helper_float_22_recursive(buf + 524288, 17);
7550 helper_float_22_recursive(buf + 655360, 17);
7551 helper_float_22_recursive(buf + 786432, 17);
7552 helper_float_22_recursive(buf + 917504, 17);
7553 for (int j = 0; j < 1048576; j += 1048576) {
7554 for (int k = 0; k < 131072; k += 4) {
7555 __asm__ volatile (
7556 "movups (%0), %%xmm0\n"
7557 "movups (%1), %%xmm1\n"
7558 "movups (%2), %%xmm2\n"
7559 "movups (%3), %%xmm3\n"
7560 "movups (%4), %%xmm4\n"
7561 "movups (%5), %%xmm5\n"
7562 "movups (%6), %%xmm6\n"
7563 "movups (%7), %%xmm7\n"
7564 "movaps %%xmm0, %%xmm8\n"
7565 "movaps %%xmm0, %%xmm9\n"
7566 "addps %%xmm1, %%xmm8\n"
7567 "subps %%xmm1, %%xmm9\n"
7568 "movaps %%xmm2, %%xmm10\n"
7569 "movaps %%xmm2, %%xmm11\n"
7570 "addps %%xmm3, %%xmm10\n"
7571 "subps %%xmm3, %%xmm11\n"
7572 "movaps %%xmm4, %%xmm12\n"
7573 "movaps %%xmm4, %%xmm13\n"
7574 "addps %%xmm5, %%xmm12\n"
7575 "subps %%xmm5, %%xmm13\n"
7576 "movaps %%xmm6, %%xmm14\n"
7577 "movaps %%xmm6, %%xmm15\n"
7578 "addps %%xmm7, %%xmm14\n"
7579 "subps %%xmm7, %%xmm15\n"
7580 "movaps %%xmm8, %%xmm0\n"
7581 "movaps %%xmm8, %%xmm2\n"
7582 "addps %%xmm10, %%xmm0\n"
7583 "subps %%xmm10, %%xmm2\n"
7584 "movaps %%xmm9, %%xmm1\n"
7585 "movaps %%xmm9, %%xmm3\n"
7586 "addps %%xmm11, %%xmm1\n"
7587 "subps %%xmm11, %%xmm3\n"
7588 "movaps %%xmm12, %%xmm4\n"
7589 "movaps %%xmm12, %%xmm6\n"
7590 "addps %%xmm14, %%xmm4\n"
7591 "subps %%xmm14, %%xmm6\n"
7592 "movaps %%xmm13, %%xmm5\n"
7593 "movaps %%xmm13, %%xmm7\n"
7594 "addps %%xmm15, %%xmm5\n"
7595 "subps %%xmm15, %%xmm7\n"
7596 "movaps %%xmm0, %%xmm8\n"
7597 "movaps %%xmm0, %%xmm12\n"
7598 "addps %%xmm4, %%xmm8\n"
7599 "subps %%xmm4, %%xmm12\n"
7600 "movaps %%xmm1, %%xmm9\n"
7601 "movaps %%xmm1, %%xmm13\n"
7602 "addps %%xmm5, %%xmm9\n"
7603 "subps %%xmm5, %%xmm13\n"
7604 "movaps %%xmm2, %%xmm10\n"
7605 "movaps %%xmm2, %%xmm14\n"
7606 "addps %%xmm6, %%xmm10\n"
7607 "subps %%xmm6, %%xmm14\n"
7608 "movaps %%xmm3, %%xmm11\n"
7609 "movaps %%xmm3, %%xmm15\n"
7610 "addps %%xmm7, %%xmm11\n"
7611 "subps %%xmm7, %%xmm15\n"
7612 "movups %%xmm8, (%0)\n"
7613 "movups %%xmm9, (%1)\n"
7614 "movups %%xmm10, (%2)\n"
7615 "movups %%xmm11, (%3)\n"
7616 "movups %%xmm12, (%4)\n"
7617 "movups %%xmm13, (%5)\n"
7618 "movups %%xmm14, (%6)\n"
7619 "movups %%xmm15, (%7)\n"
7620 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7621 );
7622 }
7623 }
7624 return;
7625 }
7626 if (depth == 22) {
7627 helper_float_22_recursive(buf + 0, 20);
7628 helper_float_22_recursive(buf + 1048576, 20);
7629 helper_float_22_recursive(buf + 2097152, 20);
7630 helper_float_22_recursive(buf + 3145728, 20);
7631 for (int j = 0; j < 4194304; j += 4194304) {
7632 for (int k = 0; k < 1048576; k += 4) {
7633 __asm__ volatile (
7634 "movups (%0), %%xmm0\n"
7635 "movups (%1), %%xmm1\n"
7636 "movups (%2), %%xmm2\n"
7637 "movups (%3), %%xmm3\n"
7638 "movaps %%xmm0, %%xmm8\n"
7639 "movaps %%xmm0, %%xmm9\n"
7640 "addps %%xmm1, %%xmm8\n"
7641 "subps %%xmm1, %%xmm9\n"
7642 "movaps %%xmm2, %%xmm10\n"
7643 "movaps %%xmm2, %%xmm11\n"
7644 "addps %%xmm3, %%xmm10\n"
7645 "subps %%xmm3, %%xmm11\n"
7646 "movaps %%xmm8, %%xmm0\n"
7647 "movaps %%xmm8, %%xmm2\n"
7648 "addps %%xmm10, %%xmm0\n"
7649 "subps %%xmm10, %%xmm2\n"
7650 "movaps %%xmm9, %%xmm1\n"
7651 "movaps %%xmm9, %%xmm3\n"
7652 "addps %%xmm11, %%xmm1\n"
7653 "subps %%xmm11, %%xmm3\n"
7654 "movups %%xmm0, (%0)\n"
7655 "movups %%xmm1, (%1)\n"
7656 "movups %%xmm2, (%2)\n"
7657 "movups %%xmm3, (%3)\n"
7658 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
7659 );
7660 }
7661 }
7662 return;
7663 }
7664 }
void helper_float_22(float *buf);
/*
 * Public entry point for the size-2^22 (4,194,304-float) in-place transform
 * (a Hadamard transform, per fht.h -- NOTE(review): confirm against the header).
 * Simply invokes the recursive kernel at full depth 22; all real work happens
 * in helper_float_22_recursive.
 *
 * buf: pointer to at least 2^22 floats, transformed in place. The kernel uses
 *      unaligned movups loads/stores, so no particular alignment is required.
 */
void helper_float_22(float *buf) {
    helper_float_22_recursive(buf, 22);
}
void helper_float_23_recursive(float *buf, int depth);
/*
 * In-place unnormalized Fast Walsh-Hadamard transform of n = 2^depth floats:
 * on return, buf[i] = sum_j buf_in[j] * (-1)^popcount(i & j).
 *
 * buf:   pointer to at least 2^depth floats, transformed in place.
 * depth: log2 of the transform size. depth <= 0 is the identity transform
 *        (0 or 1 element) and returns immediately.
 *
 * This replaces the generated, fully unrolled x86 SSE inline-assembly version
 * with a portable iterative butterfly network.  Improvements over the original:
 *   - Generalized: the hard-coded size constants are gone, so ANY depth works.
 *     The original silently did nothing for depths outside
 *     {6, 9, 12, 15, 18, 21, 23} -- a trap for callers; now every depth is
 *     transformed correctly.
 *   - Portable: no GNU inline asm / SSE3 requirement; modern compilers
 *     auto-vectorize this add/sub loop at -O2.
 *   - Bitwise-compatible: the original applied radix-2 butterflies in strictly
 *     increasing stride order (strides 1 and 2 via addsubps/shufps tricks
 *     inside each vector, then radix-8 and radix-4 vector combines at strides
 *     4, 8, ..., 2^22).  This loop performs the exact same u+v / u-v pairings
 *     in the same stage order, so results match the old code bit for bit on
 *     the previously supported depths.
 *
 * The "recursive" name is kept for interface compatibility; recursion is no
 * longer needed because the stride loop covers every level directly.
 */
void helper_float_23_recursive(float *buf, int depth) {
    if (depth <= 0) {
        return;  /* identity transform; also guards against bad input */
    }
    const long n = 1L << depth;
    /* One pass per butterfly stage, smallest stride first (matches the
       evaluation order of the original unrolled kernel). */
    for (long stride = 1; stride < n; stride <<= 1) {
        /* Each block of 2*stride elements pairs i with i+stride. */
        for (long base = 0; base < n; base += stride << 1) {
            for (long i = base; i < base + stride; i++) {
                const float u = buf[i];
                const float v = buf[i + stride];
                buf[i] = u + v;           /* sum lane  */
                buf[i + stride] = u - v;  /* diff lane */
            }
        }
    }
}
void helper_float_23(float *buf);
/*
 * Public entry point for the size-2^23 (8,388,608-float) in-place transform
 * (a Hadamard transform, per fht.h -- NOTE(review): confirm against the header).
 * Delegates to the recursive kernel at full depth 23.
 *
 * buf: pointer to at least 2^23 floats, transformed in place. The kernel uses
 *      unaligned movups loads/stores, so no particular alignment is required.
 */
void helper_float_23(float *buf) {
    helper_float_23_recursive(buf, 23);
}
void helper_float_24_recursive(float *buf, int depth);

/*
 * One radix-2 Walsh-Hadamard butterfly pass over two 4-float vectors that
 * sit `stride` floats apart:
 *   (a[i], a[i+stride]) -> (a[i] + a[i+stride], a[i] - a[i+stride])
 * for i in [0, 4).  Unaligned loads/stores; clobbers are declared so the
 * compiler never caches xmm state or the touched memory across the asm.
 */
static inline void fht24_pass2(float *a, int stride) {
  __asm__ volatile (
      "movups (%0), %%xmm0\n"
      "movups (%1), %%xmm1\n"
      "movaps %%xmm0, %%xmm8\n"
      "movaps %%xmm0, %%xmm9\n"
      "addps %%xmm1, %%xmm8\n"
      "subps %%xmm1, %%xmm9\n"
      "movups %%xmm8, (%0)\n"
      "movups %%xmm9, (%1)\n"
      :: "r"(a), "r"(a + stride)
      : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
        "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13",
        "%xmm14", "%xmm15", "memory");
}

/*
 * Three consecutive radix-2 butterfly levels (one radix-8 combine) across
 * eight 4-float vectors spaced `stride` floats apart, i.e. the butterfly
 * strides stride, 2*stride and 4*stride done entirely in registers.
 * Result vector i is written back to a + i*stride.
 */
static inline void fht24_pass8(float *a, int stride) {
  __asm__ volatile (
      "movups (%0), %%xmm0\n"
      "movups (%1), %%xmm1\n"
      "movups (%2), %%xmm2\n"
      "movups (%3), %%xmm3\n"
      "movups (%4), %%xmm4\n"
      "movups (%5), %%xmm5\n"
      "movups (%6), %%xmm6\n"
      "movups (%7), %%xmm7\n"
      /* level 1: pairwise sums/differences of neighbouring vectors */
      "movaps %%xmm0, %%xmm8\n"
      "movaps %%xmm0, %%xmm9\n"
      "addps %%xmm1, %%xmm8\n"
      "subps %%xmm1, %%xmm9\n"
      "movaps %%xmm2, %%xmm10\n"
      "movaps %%xmm2, %%xmm11\n"
      "addps %%xmm3, %%xmm10\n"
      "subps %%xmm3, %%xmm11\n"
      "movaps %%xmm4, %%xmm12\n"
      "movaps %%xmm4, %%xmm13\n"
      "addps %%xmm5, %%xmm12\n"
      "subps %%xmm5, %%xmm13\n"
      "movaps %%xmm6, %%xmm14\n"
      "movaps %%xmm6, %%xmm15\n"
      "addps %%xmm7, %%xmm14\n"
      "subps %%xmm7, %%xmm15\n"
      /* level 2: combine vectors two apart */
      "movaps %%xmm8, %%xmm0\n"
      "movaps %%xmm8, %%xmm2\n"
      "addps %%xmm10, %%xmm0\n"
      "subps %%xmm10, %%xmm2\n"
      "movaps %%xmm9, %%xmm1\n"
      "movaps %%xmm9, %%xmm3\n"
      "addps %%xmm11, %%xmm1\n"
      "subps %%xmm11, %%xmm3\n"
      "movaps %%xmm12, %%xmm4\n"
      "movaps %%xmm12, %%xmm6\n"
      "addps %%xmm14, %%xmm4\n"
      "subps %%xmm14, %%xmm6\n"
      "movaps %%xmm13, %%xmm5\n"
      "movaps %%xmm13, %%xmm7\n"
      "addps %%xmm15, %%xmm5\n"
      "subps %%xmm15, %%xmm7\n"
      /* level 3: combine vectors four apart */
      "movaps %%xmm0, %%xmm8\n"
      "movaps %%xmm0, %%xmm12\n"
      "addps %%xmm4, %%xmm8\n"
      "subps %%xmm4, %%xmm12\n"
      "movaps %%xmm1, %%xmm9\n"
      "movaps %%xmm1, %%xmm13\n"
      "addps %%xmm5, %%xmm9\n"
      "subps %%xmm5, %%xmm13\n"
      "movaps %%xmm2, %%xmm10\n"
      "movaps %%xmm2, %%xmm14\n"
      "addps %%xmm6, %%xmm10\n"
      "subps %%xmm6, %%xmm14\n"
      "movaps %%xmm3, %%xmm11\n"
      "movaps %%xmm3, %%xmm15\n"
      "addps %%xmm7, %%xmm11\n"
      "subps %%xmm7, %%xmm15\n"
      "movups %%xmm8, (%0)\n"
      "movups %%xmm9, (%1)\n"
      "movups %%xmm10, (%2)\n"
      "movups %%xmm11, (%3)\n"
      "movups %%xmm12, (%4)\n"
      "movups %%xmm13, (%5)\n"
      "movups %%xmm14, (%6)\n"
      "movups %%xmm15, (%7)\n"
      :: "r"(a + 0 * stride), "r"(a + 1 * stride), "r"(a + 2 * stride),
         "r"(a + 3 * stride), "r"(a + 4 * stride), "r"(a + 5 * stride),
         "r"(a + 6 * stride), "r"(a + 7 * stride)
      : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
        "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13",
        "%xmm14", "%xmm15", "memory");
}

/*
 * Complete 32-point Walsh-Hadamard transform on a[0..31], in registers.
 * First the in-lane butterflies (stride 1 via the shufps/addsubps trick,
 * stride 2 via the $68/$14/$224 shuffle sequence) on each of the eight
 * 4-float vectors, then the radix-8 cross-register combine for strides
 * 4, 8 and 16.  The asm text is identical to the generated original.
 */
static inline void fht24_leaf32(float *a) {
  __asm__ volatile (
      "movups (%0), %%xmm0\n"
      "movups (%1), %%xmm1\n"
      "movups (%2), %%xmm2\n"
      "movups (%3), %%xmm3\n"
      "movups (%4), %%xmm4\n"
      "movups (%5), %%xmm5\n"
      "movups (%6), %%xmm6\n"
      "movups (%7), %%xmm7\n"
      /* stride-1 butterflies inside each vector */
      "movaps %%xmm0, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm0, %%xmm0\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm0, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm0\n"
      "movaps %%xmm1, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm1, %%xmm1\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm1, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm1\n"
      "movaps %%xmm2, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm2, %%xmm2\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm2, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm2\n"
      "movaps %%xmm3, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm3, %%xmm3\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm3, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm3\n"
      "movaps %%xmm4, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm4, %%xmm4\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm4, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm4\n"
      "movaps %%xmm5, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm5, %%xmm5\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm5, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm5\n"
      "movaps %%xmm6, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm6, %%xmm6\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm6, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm6\n"
      "movaps %%xmm7, %%xmm8\n"
      "shufps $160, %%xmm8, %%xmm8\n"
      "shufps $245, %%xmm7, %%xmm7\n"
      "xorps %%xmm9, %%xmm9\n"
      "subps %%xmm7, %%xmm9\n"
      "addsubps %%xmm9, %%xmm8\n"
      "movaps %%xmm8, %%xmm7\n"
      /* stride-2 butterflies inside each vector */
      "movaps %%xmm0, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm0, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm0, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm0\n"
      "movaps %%xmm1, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm1, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm1, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm1\n"
      "movaps %%xmm2, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm2, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm2, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm2\n"
      "movaps %%xmm3, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm3, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm3, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm3\n"
      "movaps %%xmm4, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm4, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm4, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm4\n"
      "movaps %%xmm5, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm5, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm5, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm5\n"
      "movaps %%xmm6, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm6, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm6, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm6\n"
      "movaps %%xmm7, %%xmm8\n"
      "shufps $68, %%xmm8, %%xmm8\n"
      "xorps %%xmm9, %%xmm9\n"
      "movaps %%xmm7, %%xmm10\n"
      "shufps $14, %%xmm9, %%xmm10\n"
      "movaps %%xmm7, %%xmm11\n"
      "shufps $224, %%xmm11, %%xmm9\n"
      "addps %%xmm8, %%xmm10\n"
      "subps %%xmm9, %%xmm10\n"
      "movaps %%xmm10, %%xmm7\n"
      /* strides 4, 8, 16: radix-8 combine across the eight vectors */
      "movaps %%xmm0, %%xmm8\n"
      "movaps %%xmm0, %%xmm9\n"
      "addps %%xmm1, %%xmm8\n"
      "subps %%xmm1, %%xmm9\n"
      "movaps %%xmm2, %%xmm10\n"
      "movaps %%xmm2, %%xmm11\n"
      "addps %%xmm3, %%xmm10\n"
      "subps %%xmm3, %%xmm11\n"
      "movaps %%xmm4, %%xmm12\n"
      "movaps %%xmm4, %%xmm13\n"
      "addps %%xmm5, %%xmm12\n"
      "subps %%xmm5, %%xmm13\n"
      "movaps %%xmm6, %%xmm14\n"
      "movaps %%xmm6, %%xmm15\n"
      "addps %%xmm7, %%xmm14\n"
      "subps %%xmm7, %%xmm15\n"
      "movaps %%xmm8, %%xmm0\n"
      "movaps %%xmm8, %%xmm2\n"
      "addps %%xmm10, %%xmm0\n"
      "subps %%xmm10, %%xmm2\n"
      "movaps %%xmm9, %%xmm1\n"
      "movaps %%xmm9, %%xmm3\n"
      "addps %%xmm11, %%xmm1\n"
      "subps %%xmm11, %%xmm3\n"
      "movaps %%xmm12, %%xmm4\n"
      "movaps %%xmm12, %%xmm6\n"
      "addps %%xmm14, %%xmm4\n"
      "subps %%xmm14, %%xmm6\n"
      "movaps %%xmm13, %%xmm5\n"
      "movaps %%xmm13, %%xmm7\n"
      "addps %%xmm15, %%xmm5\n"
      "subps %%xmm15, %%xmm7\n"
      "movaps %%xmm0, %%xmm8\n"
      "movaps %%xmm0, %%xmm12\n"
      "addps %%xmm4, %%xmm8\n"
      "subps %%xmm4, %%xmm12\n"
      "movaps %%xmm1, %%xmm9\n"
      "movaps %%xmm1, %%xmm13\n"
      "addps %%xmm5, %%xmm9\n"
      "subps %%xmm5, %%xmm13\n"
      "movaps %%xmm2, %%xmm10\n"
      "movaps %%xmm2, %%xmm14\n"
      "addps %%xmm6, %%xmm10\n"
      "subps %%xmm6, %%xmm14\n"
      "movaps %%xmm3, %%xmm11\n"
      "movaps %%xmm3, %%xmm15\n"
      "addps %%xmm7, %%xmm11\n"
      "subps %%xmm7, %%xmm15\n"
      "movups %%xmm8, (%0)\n"
      "movups %%xmm9, (%1)\n"
      "movups %%xmm10, (%2)\n"
      "movups %%xmm11, (%3)\n"
      "movups %%xmm12, (%4)\n"
      "movups %%xmm13, (%5)\n"
      "movups %%xmm14, (%6)\n"
      "movups %%xmm15, (%7)\n"
      :: "r"(a), "r"(a + 4), "r"(a + 8), "r"(a + 12),
         "r"(a + 16), "r"(a + 20), "r"(a + 24), "r"(a + 28)
      : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
        "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13",
        "%xmm14", "%xmm15", "memory");
}

/*
 * In-place unnormalized Walsh-Hadamard transform (Hadamard ordering) of
 * 2^depth floats starting at buf, using SSE3 inline assembly.
 *
 * The generator emits only the depths actually reached from the size-2^24
 * entry point: 15 is the iterative base case (butterfly strides 1..16384
 * done as one 32-point leaf plus radix-8 passes), and 18/21/24 each split
 * into eight sub-transforms followed by one radix-8 combine pass.  Any
 * other depth is never produced and is deliberately a no-op.
 *
 * buf must hold 2^depth floats; no alignment requirement (unaligned
 * loads/stores throughout).
 */
void helper_float_24_recursive(float *buf, int depth) {
  if (depth == 15) {
    /* Butterfly strides 1..16: full 32-point transforms in registers. */
    for (int j = 0; j < 32768; j += 32) {
      fht24_leaf32(buf + j);
    }
    /* Strides 32, 64, 128. */
    for (int j = 0; j < 32768; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        fht24_pass8(buf + j + k, 32);
      }
    }
    /* Strides 256, 512, 1024. */
    for (int j = 0; j < 32768; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        fht24_pass8(buf + j + k, 256);
      }
    }
    /* Strides 2048, 4096, 8192. */
    for (int j = 0; j < 32768; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        fht24_pass8(buf + j + k, 2048);
      }
    }
    /* Final stride 16384. */
    for (int k = 0; k < 16384; k += 4) {
      fht24_pass2(buf + k, 16384);
    }
    return;
  }
  if (depth == 18 || depth == 21 || depth == 24) {
    /* Transform the eight sub-blocks of 2^(depth-3) floats, then merge
     * them with one radix-8 pass (strides block, 2*block, 4*block). */
    const int sub = depth - 3;
    const int block = 1 << sub;
    for (int i = 0; i < 8; ++i) {
      helper_float_24_recursive(buf + i * block, sub);
    }
    for (int k = 0; k < block; k += 4) {
      fht24_pass8(buf + k, block);
    }
    return;
  }
  /* Depths other than 15/18/21/24 are never generated; leave buf as-is. */
}
void helper_float_24(float *buf);
/* Public entry point: in-place unnormalized Walsh-Hadamard transform of
 * 2^24 floats held in buf, delegating to the depth-dispatched kernel. */
void helper_float_24(float *buf) {
  enum { FHT24_LOG2_SIZE = 24 };
  helper_float_24_recursive(buf, FHT24_LOG2_SIZE);
}
9040 void helper_float_25_recursive(float *buf, int depth);
helper_float_25_recursive(float * buf,int depth)9041 void helper_float_25_recursive(float *buf, int depth) {
9042 if (depth == 8) {
9043 for (int j = 0; j < 256; j += 32) {
9044 for (int k = 0; k < 4; k += 4) {
9045 __asm__ volatile (
9046 "movups (%0), %%xmm0\n"
9047 "movups (%1), %%xmm1\n"
9048 "movups (%2), %%xmm2\n"
9049 "movups (%3), %%xmm3\n"
9050 "movups (%4), %%xmm4\n"
9051 "movups (%5), %%xmm5\n"
9052 "movups (%6), %%xmm6\n"
9053 "movups (%7), %%xmm7\n"
9054 "movaps %%xmm0, %%xmm8\n"
9055 "shufps $160, %%xmm8, %%xmm8\n"
9056 "shufps $245, %%xmm0, %%xmm0\n"
9057 "xorps %%xmm9, %%xmm9\n"
9058 "subps %%xmm0, %%xmm9\n"
9059 "addsubps %%xmm9, %%xmm8\n"
9060 "movaps %%xmm8, %%xmm0\n"
9061 "movaps %%xmm1, %%xmm8\n"
9062 "shufps $160, %%xmm8, %%xmm8\n"
9063 "shufps $245, %%xmm1, %%xmm1\n"
9064 "xorps %%xmm9, %%xmm9\n"
9065 "subps %%xmm1, %%xmm9\n"
9066 "addsubps %%xmm9, %%xmm8\n"
9067 "movaps %%xmm8, %%xmm1\n"
9068 "movaps %%xmm2, %%xmm8\n"
9069 "shufps $160, %%xmm8, %%xmm8\n"
9070 "shufps $245, %%xmm2, %%xmm2\n"
9071 "xorps %%xmm9, %%xmm9\n"
9072 "subps %%xmm2, %%xmm9\n"
9073 "addsubps %%xmm9, %%xmm8\n"
9074 "movaps %%xmm8, %%xmm2\n"
9075 "movaps %%xmm3, %%xmm8\n"
9076 "shufps $160, %%xmm8, %%xmm8\n"
9077 "shufps $245, %%xmm3, %%xmm3\n"
9078 "xorps %%xmm9, %%xmm9\n"
9079 "subps %%xmm3, %%xmm9\n"
9080 "addsubps %%xmm9, %%xmm8\n"
9081 "movaps %%xmm8, %%xmm3\n"
9082 "movaps %%xmm4, %%xmm8\n"
9083 "shufps $160, %%xmm8, %%xmm8\n"
9084 "shufps $245, %%xmm4, %%xmm4\n"
9085 "xorps %%xmm9, %%xmm9\n"
9086 "subps %%xmm4, %%xmm9\n"
9087 "addsubps %%xmm9, %%xmm8\n"
9088 "movaps %%xmm8, %%xmm4\n"
9089 "movaps %%xmm5, %%xmm8\n"
9090 "shufps $160, %%xmm8, %%xmm8\n"
9091 "shufps $245, %%xmm5, %%xmm5\n"
9092 "xorps %%xmm9, %%xmm9\n"
9093 "subps %%xmm5, %%xmm9\n"
9094 "addsubps %%xmm9, %%xmm8\n"
9095 "movaps %%xmm8, %%xmm5\n"
9096 "movaps %%xmm6, %%xmm8\n"
9097 "shufps $160, %%xmm8, %%xmm8\n"
9098 "shufps $245, %%xmm6, %%xmm6\n"
9099 "xorps %%xmm9, %%xmm9\n"
9100 "subps %%xmm6, %%xmm9\n"
9101 "addsubps %%xmm9, %%xmm8\n"
9102 "movaps %%xmm8, %%xmm6\n"
9103 "movaps %%xmm7, %%xmm8\n"
9104 "shufps $160, %%xmm8, %%xmm8\n"
9105 "shufps $245, %%xmm7, %%xmm7\n"
9106 "xorps %%xmm9, %%xmm9\n"
9107 "subps %%xmm7, %%xmm9\n"
9108 "addsubps %%xmm9, %%xmm8\n"
9109 "movaps %%xmm8, %%xmm7\n"
9110 "movaps %%xmm0, %%xmm8\n"
9111 "shufps $68, %%xmm8, %%xmm8\n"
9112 "xorps %%xmm9, %%xmm9\n"
9113 "movaps %%xmm0, %%xmm10\n"
9114 "shufps $14, %%xmm9, %%xmm10\n"
9115 "movaps %%xmm0, %%xmm11\n"
9116 "shufps $224, %%xmm11, %%xmm9\n"
9117 "addps %%xmm8, %%xmm10\n"
9118 "subps %%xmm9, %%xmm10\n"
9119 "movaps %%xmm10, %%xmm0\n"
9120 "movaps %%xmm1, %%xmm8\n"
9121 "shufps $68, %%xmm8, %%xmm8\n"
9122 "xorps %%xmm9, %%xmm9\n"
9123 "movaps %%xmm1, %%xmm10\n"
9124 "shufps $14, %%xmm9, %%xmm10\n"
9125 "movaps %%xmm1, %%xmm11\n"
9126 "shufps $224, %%xmm11, %%xmm9\n"
9127 "addps %%xmm8, %%xmm10\n"
9128 "subps %%xmm9, %%xmm10\n"
9129 "movaps %%xmm10, %%xmm1\n"
9130 "movaps %%xmm2, %%xmm8\n"
9131 "shufps $68, %%xmm8, %%xmm8\n"
9132 "xorps %%xmm9, %%xmm9\n"
9133 "movaps %%xmm2, %%xmm10\n"
9134 "shufps $14, %%xmm9, %%xmm10\n"
9135 "movaps %%xmm2, %%xmm11\n"
9136 "shufps $224, %%xmm11, %%xmm9\n"
9137 "addps %%xmm8, %%xmm10\n"
9138 "subps %%xmm9, %%xmm10\n"
9139 "movaps %%xmm10, %%xmm2\n"
9140 "movaps %%xmm3, %%xmm8\n"
9141 "shufps $68, %%xmm8, %%xmm8\n"
9142 "xorps %%xmm9, %%xmm9\n"
9143 "movaps %%xmm3, %%xmm10\n"
9144 "shufps $14, %%xmm9, %%xmm10\n"
9145 "movaps %%xmm3, %%xmm11\n"
9146 "shufps $224, %%xmm11, %%xmm9\n"
9147 "addps %%xmm8, %%xmm10\n"
9148 "subps %%xmm9, %%xmm10\n"
9149 "movaps %%xmm10, %%xmm3\n"
9150 "movaps %%xmm4, %%xmm8\n"
9151 "shufps $68, %%xmm8, %%xmm8\n"
9152 "xorps %%xmm9, %%xmm9\n"
9153 "movaps %%xmm4, %%xmm10\n"
9154 "shufps $14, %%xmm9, %%xmm10\n"
9155 "movaps %%xmm4, %%xmm11\n"
9156 "shufps $224, %%xmm11, %%xmm9\n"
9157 "addps %%xmm8, %%xmm10\n"
9158 "subps %%xmm9, %%xmm10\n"
9159 "movaps %%xmm10, %%xmm4\n"
9160 "movaps %%xmm5, %%xmm8\n"
9161 "shufps $68, %%xmm8, %%xmm8\n"
9162 "xorps %%xmm9, %%xmm9\n"
9163 "movaps %%xmm5, %%xmm10\n"
9164 "shufps $14, %%xmm9, %%xmm10\n"
9165 "movaps %%xmm5, %%xmm11\n"
9166 "shufps $224, %%xmm11, %%xmm9\n"
9167 "addps %%xmm8, %%xmm10\n"
9168 "subps %%xmm9, %%xmm10\n"
9169 "movaps %%xmm10, %%xmm5\n"
9170 "movaps %%xmm6, %%xmm8\n"
9171 "shufps $68, %%xmm8, %%xmm8\n"
9172 "xorps %%xmm9, %%xmm9\n"
9173 "movaps %%xmm6, %%xmm10\n"
9174 "shufps $14, %%xmm9, %%xmm10\n"
9175 "movaps %%xmm6, %%xmm11\n"
9176 "shufps $224, %%xmm11, %%xmm9\n"
9177 "addps %%xmm8, %%xmm10\n"
9178 "subps %%xmm9, %%xmm10\n"
9179 "movaps %%xmm10, %%xmm6\n"
9180 "movaps %%xmm7, %%xmm8\n"
9181 "shufps $68, %%xmm8, %%xmm8\n"
9182 "xorps %%xmm9, %%xmm9\n"
9183 "movaps %%xmm7, %%xmm10\n"
9184 "shufps $14, %%xmm9, %%xmm10\n"
9185 "movaps %%xmm7, %%xmm11\n"
9186 "shufps $224, %%xmm11, %%xmm9\n"
9187 "addps %%xmm8, %%xmm10\n"
9188 "subps %%xmm9, %%xmm10\n"
9189 "movaps %%xmm10, %%xmm7\n"
9190 "movaps %%xmm0, %%xmm8\n"
9191 "movaps %%xmm0, %%xmm9\n"
9192 "addps %%xmm1, %%xmm8\n"
9193 "subps %%xmm1, %%xmm9\n"
9194 "movaps %%xmm2, %%xmm10\n"
9195 "movaps %%xmm2, %%xmm11\n"
9196 "addps %%xmm3, %%xmm10\n"
9197 "subps %%xmm3, %%xmm11\n"
9198 "movaps %%xmm4, %%xmm12\n"
9199 "movaps %%xmm4, %%xmm13\n"
9200 "addps %%xmm5, %%xmm12\n"
9201 "subps %%xmm5, %%xmm13\n"
9202 "movaps %%xmm6, %%xmm14\n"
9203 "movaps %%xmm6, %%xmm15\n"
9204 "addps %%xmm7, %%xmm14\n"
9205 "subps %%xmm7, %%xmm15\n"
9206 "movaps %%xmm8, %%xmm0\n"
9207 "movaps %%xmm8, %%xmm2\n"
9208 "addps %%xmm10, %%xmm0\n"
9209 "subps %%xmm10, %%xmm2\n"
9210 "movaps %%xmm9, %%xmm1\n"
9211 "movaps %%xmm9, %%xmm3\n"
9212 "addps %%xmm11, %%xmm1\n"
9213 "subps %%xmm11, %%xmm3\n"
9214 "movaps %%xmm12, %%xmm4\n"
9215 "movaps %%xmm12, %%xmm6\n"
9216 "addps %%xmm14, %%xmm4\n"
9217 "subps %%xmm14, %%xmm6\n"
9218 "movaps %%xmm13, %%xmm5\n"
9219 "movaps %%xmm13, %%xmm7\n"
9220 "addps %%xmm15, %%xmm5\n"
9221 "subps %%xmm15, %%xmm7\n"
9222 "movaps %%xmm0, %%xmm8\n"
9223 "movaps %%xmm0, %%xmm12\n"
9224 "addps %%xmm4, %%xmm8\n"
9225 "subps %%xmm4, %%xmm12\n"
9226 "movaps %%xmm1, %%xmm9\n"
9227 "movaps %%xmm1, %%xmm13\n"
9228 "addps %%xmm5, %%xmm9\n"
9229 "subps %%xmm5, %%xmm13\n"
9230 "movaps %%xmm2, %%xmm10\n"
9231 "movaps %%xmm2, %%xmm14\n"
9232 "addps %%xmm6, %%xmm10\n"
9233 "subps %%xmm6, %%xmm14\n"
9234 "movaps %%xmm3, %%xmm11\n"
9235 "movaps %%xmm3, %%xmm15\n"
9236 "addps %%xmm7, %%xmm11\n"
9237 "subps %%xmm7, %%xmm15\n"
9238 "movups %%xmm8, (%0)\n"
9239 "movups %%xmm9, (%1)\n"
9240 "movups %%xmm10, (%2)\n"
9241 "movups %%xmm11, (%3)\n"
9242 "movups %%xmm12, (%4)\n"
9243 "movups %%xmm13, (%5)\n"
9244 "movups %%xmm14, (%6)\n"
9245 "movups %%xmm15, (%7)\n"
9246 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9247 );
9248 }
9249 }
9250 for (int j = 0; j < 256; j += 256) {
9251 for (int k = 0; k < 32; k += 4) {
9252 __asm__ volatile (
9253 "movups (%0), %%xmm0\n"
9254 "movups (%1), %%xmm1\n"
9255 "movups (%2), %%xmm2\n"
9256 "movups (%3), %%xmm3\n"
9257 "movups (%4), %%xmm4\n"
9258 "movups (%5), %%xmm5\n"
9259 "movups (%6), %%xmm6\n"
9260 "movups (%7), %%xmm7\n"
9261 "movaps %%xmm0, %%xmm8\n"
9262 "movaps %%xmm0, %%xmm9\n"
9263 "addps %%xmm1, %%xmm8\n"
9264 "subps %%xmm1, %%xmm9\n"
9265 "movaps %%xmm2, %%xmm10\n"
9266 "movaps %%xmm2, %%xmm11\n"
9267 "addps %%xmm3, %%xmm10\n"
9268 "subps %%xmm3, %%xmm11\n"
9269 "movaps %%xmm4, %%xmm12\n"
9270 "movaps %%xmm4, %%xmm13\n"
9271 "addps %%xmm5, %%xmm12\n"
9272 "subps %%xmm5, %%xmm13\n"
9273 "movaps %%xmm6, %%xmm14\n"
9274 "movaps %%xmm6, %%xmm15\n"
9275 "addps %%xmm7, %%xmm14\n"
9276 "subps %%xmm7, %%xmm15\n"
9277 "movaps %%xmm8, %%xmm0\n"
9278 "movaps %%xmm8, %%xmm2\n"
9279 "addps %%xmm10, %%xmm0\n"
9280 "subps %%xmm10, %%xmm2\n"
9281 "movaps %%xmm9, %%xmm1\n"
9282 "movaps %%xmm9, %%xmm3\n"
9283 "addps %%xmm11, %%xmm1\n"
9284 "subps %%xmm11, %%xmm3\n"
9285 "movaps %%xmm12, %%xmm4\n"
9286 "movaps %%xmm12, %%xmm6\n"
9287 "addps %%xmm14, %%xmm4\n"
9288 "subps %%xmm14, %%xmm6\n"
9289 "movaps %%xmm13, %%xmm5\n"
9290 "movaps %%xmm13, %%xmm7\n"
9291 "addps %%xmm15, %%xmm5\n"
9292 "subps %%xmm15, %%xmm7\n"
9293 "movaps %%xmm0, %%xmm8\n"
9294 "movaps %%xmm0, %%xmm12\n"
9295 "addps %%xmm4, %%xmm8\n"
9296 "subps %%xmm4, %%xmm12\n"
9297 "movaps %%xmm1, %%xmm9\n"
9298 "movaps %%xmm1, %%xmm13\n"
9299 "addps %%xmm5, %%xmm9\n"
9300 "subps %%xmm5, %%xmm13\n"
9301 "movaps %%xmm2, %%xmm10\n"
9302 "movaps %%xmm2, %%xmm14\n"
9303 "addps %%xmm6, %%xmm10\n"
9304 "subps %%xmm6, %%xmm14\n"
9305 "movaps %%xmm3, %%xmm11\n"
9306 "movaps %%xmm3, %%xmm15\n"
9307 "addps %%xmm7, %%xmm11\n"
9308 "subps %%xmm7, %%xmm15\n"
9309 "movups %%xmm8, (%0)\n"
9310 "movups %%xmm9, (%1)\n"
9311 "movups %%xmm10, (%2)\n"
9312 "movups %%xmm11, (%3)\n"
9313 "movups %%xmm12, (%4)\n"
9314 "movups %%xmm13, (%5)\n"
9315 "movups %%xmm14, (%6)\n"
9316 "movups %%xmm15, (%7)\n"
9317 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9318 );
9319 }
9320 }
9321 return;
9322 }
9323 if (depth == 11) {
9324 helper_float_25_recursive(buf + 0, 8);
9325 helper_float_25_recursive(buf + 256, 8);
9326 helper_float_25_recursive(buf + 512, 8);
9327 helper_float_25_recursive(buf + 768, 8);
9328 helper_float_25_recursive(buf + 1024, 8);
9329 helper_float_25_recursive(buf + 1280, 8);
9330 helper_float_25_recursive(buf + 1536, 8);
9331 helper_float_25_recursive(buf + 1792, 8);
9332 for (int j = 0; j < 2048; j += 2048) {
9333 for (int k = 0; k < 256; k += 4) {
9334 __asm__ volatile (
9335 "movups (%0), %%xmm0\n"
9336 "movups (%1), %%xmm1\n"
9337 "movups (%2), %%xmm2\n"
9338 "movups (%3), %%xmm3\n"
9339 "movups (%4), %%xmm4\n"
9340 "movups (%5), %%xmm5\n"
9341 "movups (%6), %%xmm6\n"
9342 "movups (%7), %%xmm7\n"
9343 "movaps %%xmm0, %%xmm8\n"
9344 "movaps %%xmm0, %%xmm9\n"
9345 "addps %%xmm1, %%xmm8\n"
9346 "subps %%xmm1, %%xmm9\n"
9347 "movaps %%xmm2, %%xmm10\n"
9348 "movaps %%xmm2, %%xmm11\n"
9349 "addps %%xmm3, %%xmm10\n"
9350 "subps %%xmm3, %%xmm11\n"
9351 "movaps %%xmm4, %%xmm12\n"
9352 "movaps %%xmm4, %%xmm13\n"
9353 "addps %%xmm5, %%xmm12\n"
9354 "subps %%xmm5, %%xmm13\n"
9355 "movaps %%xmm6, %%xmm14\n"
9356 "movaps %%xmm6, %%xmm15\n"
9357 "addps %%xmm7, %%xmm14\n"
9358 "subps %%xmm7, %%xmm15\n"
9359 "movaps %%xmm8, %%xmm0\n"
9360 "movaps %%xmm8, %%xmm2\n"
9361 "addps %%xmm10, %%xmm0\n"
9362 "subps %%xmm10, %%xmm2\n"
9363 "movaps %%xmm9, %%xmm1\n"
9364 "movaps %%xmm9, %%xmm3\n"
9365 "addps %%xmm11, %%xmm1\n"
9366 "subps %%xmm11, %%xmm3\n"
9367 "movaps %%xmm12, %%xmm4\n"
9368 "movaps %%xmm12, %%xmm6\n"
9369 "addps %%xmm14, %%xmm4\n"
9370 "subps %%xmm14, %%xmm6\n"
9371 "movaps %%xmm13, %%xmm5\n"
9372 "movaps %%xmm13, %%xmm7\n"
9373 "addps %%xmm15, %%xmm5\n"
9374 "subps %%xmm15, %%xmm7\n"
9375 "movaps %%xmm0, %%xmm8\n"
9376 "movaps %%xmm0, %%xmm12\n"
9377 "addps %%xmm4, %%xmm8\n"
9378 "subps %%xmm4, %%xmm12\n"
9379 "movaps %%xmm1, %%xmm9\n"
9380 "movaps %%xmm1, %%xmm13\n"
9381 "addps %%xmm5, %%xmm9\n"
9382 "subps %%xmm5, %%xmm13\n"
9383 "movaps %%xmm2, %%xmm10\n"
9384 "movaps %%xmm2, %%xmm14\n"
9385 "addps %%xmm6, %%xmm10\n"
9386 "subps %%xmm6, %%xmm14\n"
9387 "movaps %%xmm3, %%xmm11\n"
9388 "movaps %%xmm3, %%xmm15\n"
9389 "addps %%xmm7, %%xmm11\n"
9390 "subps %%xmm7, %%xmm15\n"
9391 "movups %%xmm8, (%0)\n"
9392 "movups %%xmm9, (%1)\n"
9393 "movups %%xmm10, (%2)\n"
9394 "movups %%xmm11, (%3)\n"
9395 "movups %%xmm12, (%4)\n"
9396 "movups %%xmm13, (%5)\n"
9397 "movups %%xmm14, (%6)\n"
9398 "movups %%xmm15, (%7)\n"
9399 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9400 );
9401 }
9402 }
9403 return;
9404 }
9405 if (depth == 14) {
9406 helper_float_25_recursive(buf + 0, 11);
9407 helper_float_25_recursive(buf + 2048, 11);
9408 helper_float_25_recursive(buf + 4096, 11);
9409 helper_float_25_recursive(buf + 6144, 11);
9410 helper_float_25_recursive(buf + 8192, 11);
9411 helper_float_25_recursive(buf + 10240, 11);
9412 helper_float_25_recursive(buf + 12288, 11);
9413 helper_float_25_recursive(buf + 14336, 11);
9414 for (int j = 0; j < 16384; j += 16384) {
9415 for (int k = 0; k < 2048; k += 4) {
9416 __asm__ volatile (
9417 "movups (%0), %%xmm0\n"
9418 "movups (%1), %%xmm1\n"
9419 "movups (%2), %%xmm2\n"
9420 "movups (%3), %%xmm3\n"
9421 "movups (%4), %%xmm4\n"
9422 "movups (%5), %%xmm5\n"
9423 "movups (%6), %%xmm6\n"
9424 "movups (%7), %%xmm7\n"
9425 "movaps %%xmm0, %%xmm8\n"
9426 "movaps %%xmm0, %%xmm9\n"
9427 "addps %%xmm1, %%xmm8\n"
9428 "subps %%xmm1, %%xmm9\n"
9429 "movaps %%xmm2, %%xmm10\n"
9430 "movaps %%xmm2, %%xmm11\n"
9431 "addps %%xmm3, %%xmm10\n"
9432 "subps %%xmm3, %%xmm11\n"
9433 "movaps %%xmm4, %%xmm12\n"
9434 "movaps %%xmm4, %%xmm13\n"
9435 "addps %%xmm5, %%xmm12\n"
9436 "subps %%xmm5, %%xmm13\n"
9437 "movaps %%xmm6, %%xmm14\n"
9438 "movaps %%xmm6, %%xmm15\n"
9439 "addps %%xmm7, %%xmm14\n"
9440 "subps %%xmm7, %%xmm15\n"
9441 "movaps %%xmm8, %%xmm0\n"
9442 "movaps %%xmm8, %%xmm2\n"
9443 "addps %%xmm10, %%xmm0\n"
9444 "subps %%xmm10, %%xmm2\n"
9445 "movaps %%xmm9, %%xmm1\n"
9446 "movaps %%xmm9, %%xmm3\n"
9447 "addps %%xmm11, %%xmm1\n"
9448 "subps %%xmm11, %%xmm3\n"
9449 "movaps %%xmm12, %%xmm4\n"
9450 "movaps %%xmm12, %%xmm6\n"
9451 "addps %%xmm14, %%xmm4\n"
9452 "subps %%xmm14, %%xmm6\n"
9453 "movaps %%xmm13, %%xmm5\n"
9454 "movaps %%xmm13, %%xmm7\n"
9455 "addps %%xmm15, %%xmm5\n"
9456 "subps %%xmm15, %%xmm7\n"
9457 "movaps %%xmm0, %%xmm8\n"
9458 "movaps %%xmm0, %%xmm12\n"
9459 "addps %%xmm4, %%xmm8\n"
9460 "subps %%xmm4, %%xmm12\n"
9461 "movaps %%xmm1, %%xmm9\n"
9462 "movaps %%xmm1, %%xmm13\n"
9463 "addps %%xmm5, %%xmm9\n"
9464 "subps %%xmm5, %%xmm13\n"
9465 "movaps %%xmm2, %%xmm10\n"
9466 "movaps %%xmm2, %%xmm14\n"
9467 "addps %%xmm6, %%xmm10\n"
9468 "subps %%xmm6, %%xmm14\n"
9469 "movaps %%xmm3, %%xmm11\n"
9470 "movaps %%xmm3, %%xmm15\n"
9471 "addps %%xmm7, %%xmm11\n"
9472 "subps %%xmm7, %%xmm15\n"
9473 "movups %%xmm8, (%0)\n"
9474 "movups %%xmm9, (%1)\n"
9475 "movups %%xmm10, (%2)\n"
9476 "movups %%xmm11, (%3)\n"
9477 "movups %%xmm12, (%4)\n"
9478 "movups %%xmm13, (%5)\n"
9479 "movups %%xmm14, (%6)\n"
9480 "movups %%xmm15, (%7)\n"
9481 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9482 );
9483 }
9484 }
9485 return;
9486 }
9487 if (depth == 17) {
9488 helper_float_25_recursive(buf + 0, 14);
9489 helper_float_25_recursive(buf + 16384, 14);
9490 helper_float_25_recursive(buf + 32768, 14);
9491 helper_float_25_recursive(buf + 49152, 14);
9492 helper_float_25_recursive(buf + 65536, 14);
9493 helper_float_25_recursive(buf + 81920, 14);
9494 helper_float_25_recursive(buf + 98304, 14);
9495 helper_float_25_recursive(buf + 114688, 14);
9496 for (int j = 0; j < 131072; j += 131072) {
9497 for (int k = 0; k < 16384; k += 4) {
9498 __asm__ volatile (
9499 "movups (%0), %%xmm0\n"
9500 "movups (%1), %%xmm1\n"
9501 "movups (%2), %%xmm2\n"
9502 "movups (%3), %%xmm3\n"
9503 "movups (%4), %%xmm4\n"
9504 "movups (%5), %%xmm5\n"
9505 "movups (%6), %%xmm6\n"
9506 "movups (%7), %%xmm7\n"
9507 "movaps %%xmm0, %%xmm8\n"
9508 "movaps %%xmm0, %%xmm9\n"
9509 "addps %%xmm1, %%xmm8\n"
9510 "subps %%xmm1, %%xmm9\n"
9511 "movaps %%xmm2, %%xmm10\n"
9512 "movaps %%xmm2, %%xmm11\n"
9513 "addps %%xmm3, %%xmm10\n"
9514 "subps %%xmm3, %%xmm11\n"
9515 "movaps %%xmm4, %%xmm12\n"
9516 "movaps %%xmm4, %%xmm13\n"
9517 "addps %%xmm5, %%xmm12\n"
9518 "subps %%xmm5, %%xmm13\n"
9519 "movaps %%xmm6, %%xmm14\n"
9520 "movaps %%xmm6, %%xmm15\n"
9521 "addps %%xmm7, %%xmm14\n"
9522 "subps %%xmm7, %%xmm15\n"
9523 "movaps %%xmm8, %%xmm0\n"
9524 "movaps %%xmm8, %%xmm2\n"
9525 "addps %%xmm10, %%xmm0\n"
9526 "subps %%xmm10, %%xmm2\n"
9527 "movaps %%xmm9, %%xmm1\n"
9528 "movaps %%xmm9, %%xmm3\n"
9529 "addps %%xmm11, %%xmm1\n"
9530 "subps %%xmm11, %%xmm3\n"
9531 "movaps %%xmm12, %%xmm4\n"
9532 "movaps %%xmm12, %%xmm6\n"
9533 "addps %%xmm14, %%xmm4\n"
9534 "subps %%xmm14, %%xmm6\n"
9535 "movaps %%xmm13, %%xmm5\n"
9536 "movaps %%xmm13, %%xmm7\n"
9537 "addps %%xmm15, %%xmm5\n"
9538 "subps %%xmm15, %%xmm7\n"
9539 "movaps %%xmm0, %%xmm8\n"
9540 "movaps %%xmm0, %%xmm12\n"
9541 "addps %%xmm4, %%xmm8\n"
9542 "subps %%xmm4, %%xmm12\n"
9543 "movaps %%xmm1, %%xmm9\n"
9544 "movaps %%xmm1, %%xmm13\n"
9545 "addps %%xmm5, %%xmm9\n"
9546 "subps %%xmm5, %%xmm13\n"
9547 "movaps %%xmm2, %%xmm10\n"
9548 "movaps %%xmm2, %%xmm14\n"
9549 "addps %%xmm6, %%xmm10\n"
9550 "subps %%xmm6, %%xmm14\n"
9551 "movaps %%xmm3, %%xmm11\n"
9552 "movaps %%xmm3, %%xmm15\n"
9553 "addps %%xmm7, %%xmm11\n"
9554 "subps %%xmm7, %%xmm15\n"
9555 "movups %%xmm8, (%0)\n"
9556 "movups %%xmm9, (%1)\n"
9557 "movups %%xmm10, (%2)\n"
9558 "movups %%xmm11, (%3)\n"
9559 "movups %%xmm12, (%4)\n"
9560 "movups %%xmm13, (%5)\n"
9561 "movups %%xmm14, (%6)\n"
9562 "movups %%xmm15, (%7)\n"
9563 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9564 );
9565 }
9566 }
9567 return;
9568 }
9569 if (depth == 20) {
9570 helper_float_25_recursive(buf + 0, 17);
9571 helper_float_25_recursive(buf + 131072, 17);
9572 helper_float_25_recursive(buf + 262144, 17);
9573 helper_float_25_recursive(buf + 393216, 17);
9574 helper_float_25_recursive(buf + 524288, 17);
9575 helper_float_25_recursive(buf + 655360, 17);
9576 helper_float_25_recursive(buf + 786432, 17);
9577 helper_float_25_recursive(buf + 917504, 17);
9578 for (int j = 0; j < 1048576; j += 1048576) {
9579 for (int k = 0; k < 131072; k += 4) {
9580 __asm__ volatile (
9581 "movups (%0), %%xmm0\n"
9582 "movups (%1), %%xmm1\n"
9583 "movups (%2), %%xmm2\n"
9584 "movups (%3), %%xmm3\n"
9585 "movups (%4), %%xmm4\n"
9586 "movups (%5), %%xmm5\n"
9587 "movups (%6), %%xmm6\n"
9588 "movups (%7), %%xmm7\n"
9589 "movaps %%xmm0, %%xmm8\n"
9590 "movaps %%xmm0, %%xmm9\n"
9591 "addps %%xmm1, %%xmm8\n"
9592 "subps %%xmm1, %%xmm9\n"
9593 "movaps %%xmm2, %%xmm10\n"
9594 "movaps %%xmm2, %%xmm11\n"
9595 "addps %%xmm3, %%xmm10\n"
9596 "subps %%xmm3, %%xmm11\n"
9597 "movaps %%xmm4, %%xmm12\n"
9598 "movaps %%xmm4, %%xmm13\n"
9599 "addps %%xmm5, %%xmm12\n"
9600 "subps %%xmm5, %%xmm13\n"
9601 "movaps %%xmm6, %%xmm14\n"
9602 "movaps %%xmm6, %%xmm15\n"
9603 "addps %%xmm7, %%xmm14\n"
9604 "subps %%xmm7, %%xmm15\n"
9605 "movaps %%xmm8, %%xmm0\n"
9606 "movaps %%xmm8, %%xmm2\n"
9607 "addps %%xmm10, %%xmm0\n"
9608 "subps %%xmm10, %%xmm2\n"
9609 "movaps %%xmm9, %%xmm1\n"
9610 "movaps %%xmm9, %%xmm3\n"
9611 "addps %%xmm11, %%xmm1\n"
9612 "subps %%xmm11, %%xmm3\n"
9613 "movaps %%xmm12, %%xmm4\n"
9614 "movaps %%xmm12, %%xmm6\n"
9615 "addps %%xmm14, %%xmm4\n"
9616 "subps %%xmm14, %%xmm6\n"
9617 "movaps %%xmm13, %%xmm5\n"
9618 "movaps %%xmm13, %%xmm7\n"
9619 "addps %%xmm15, %%xmm5\n"
9620 "subps %%xmm15, %%xmm7\n"
9621 "movaps %%xmm0, %%xmm8\n"
9622 "movaps %%xmm0, %%xmm12\n"
9623 "addps %%xmm4, %%xmm8\n"
9624 "subps %%xmm4, %%xmm12\n"
9625 "movaps %%xmm1, %%xmm9\n"
9626 "movaps %%xmm1, %%xmm13\n"
9627 "addps %%xmm5, %%xmm9\n"
9628 "subps %%xmm5, %%xmm13\n"
9629 "movaps %%xmm2, %%xmm10\n"
9630 "movaps %%xmm2, %%xmm14\n"
9631 "addps %%xmm6, %%xmm10\n"
9632 "subps %%xmm6, %%xmm14\n"
9633 "movaps %%xmm3, %%xmm11\n"
9634 "movaps %%xmm3, %%xmm15\n"
9635 "addps %%xmm7, %%xmm11\n"
9636 "subps %%xmm7, %%xmm15\n"
9637 "movups %%xmm8, (%0)\n"
9638 "movups %%xmm9, (%1)\n"
9639 "movups %%xmm10, (%2)\n"
9640 "movups %%xmm11, (%3)\n"
9641 "movups %%xmm12, (%4)\n"
9642 "movups %%xmm13, (%5)\n"
9643 "movups %%xmm14, (%6)\n"
9644 "movups %%xmm15, (%7)\n"
9645 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9646 );
9647 }
9648 }
9649 return;
9650 }
9651 if (depth == 23) {
9652 helper_float_25_recursive(buf + 0, 20);
9653 helper_float_25_recursive(buf + 1048576, 20);
9654 helper_float_25_recursive(buf + 2097152, 20);
9655 helper_float_25_recursive(buf + 3145728, 20);
9656 helper_float_25_recursive(buf + 4194304, 20);
9657 helper_float_25_recursive(buf + 5242880, 20);
9658 helper_float_25_recursive(buf + 6291456, 20);
9659 helper_float_25_recursive(buf + 7340032, 20);
9660 for (int j = 0; j < 8388608; j += 8388608) {
9661 for (int k = 0; k < 1048576; k += 4) {
9662 __asm__ volatile (
9663 "movups (%0), %%xmm0\n"
9664 "movups (%1), %%xmm1\n"
9665 "movups (%2), %%xmm2\n"
9666 "movups (%3), %%xmm3\n"
9667 "movups (%4), %%xmm4\n"
9668 "movups (%5), %%xmm5\n"
9669 "movups (%6), %%xmm6\n"
9670 "movups (%7), %%xmm7\n"
9671 "movaps %%xmm0, %%xmm8\n"
9672 "movaps %%xmm0, %%xmm9\n"
9673 "addps %%xmm1, %%xmm8\n"
9674 "subps %%xmm1, %%xmm9\n"
9675 "movaps %%xmm2, %%xmm10\n"
9676 "movaps %%xmm2, %%xmm11\n"
9677 "addps %%xmm3, %%xmm10\n"
9678 "subps %%xmm3, %%xmm11\n"
9679 "movaps %%xmm4, %%xmm12\n"
9680 "movaps %%xmm4, %%xmm13\n"
9681 "addps %%xmm5, %%xmm12\n"
9682 "subps %%xmm5, %%xmm13\n"
9683 "movaps %%xmm6, %%xmm14\n"
9684 "movaps %%xmm6, %%xmm15\n"
9685 "addps %%xmm7, %%xmm14\n"
9686 "subps %%xmm7, %%xmm15\n"
9687 "movaps %%xmm8, %%xmm0\n"
9688 "movaps %%xmm8, %%xmm2\n"
9689 "addps %%xmm10, %%xmm0\n"
9690 "subps %%xmm10, %%xmm2\n"
9691 "movaps %%xmm9, %%xmm1\n"
9692 "movaps %%xmm9, %%xmm3\n"
9693 "addps %%xmm11, %%xmm1\n"
9694 "subps %%xmm11, %%xmm3\n"
9695 "movaps %%xmm12, %%xmm4\n"
9696 "movaps %%xmm12, %%xmm6\n"
9697 "addps %%xmm14, %%xmm4\n"
9698 "subps %%xmm14, %%xmm6\n"
9699 "movaps %%xmm13, %%xmm5\n"
9700 "movaps %%xmm13, %%xmm7\n"
9701 "addps %%xmm15, %%xmm5\n"
9702 "subps %%xmm15, %%xmm7\n"
9703 "movaps %%xmm0, %%xmm8\n"
9704 "movaps %%xmm0, %%xmm12\n"
9705 "addps %%xmm4, %%xmm8\n"
9706 "subps %%xmm4, %%xmm12\n"
9707 "movaps %%xmm1, %%xmm9\n"
9708 "movaps %%xmm1, %%xmm13\n"
9709 "addps %%xmm5, %%xmm9\n"
9710 "subps %%xmm5, %%xmm13\n"
9711 "movaps %%xmm2, %%xmm10\n"
9712 "movaps %%xmm2, %%xmm14\n"
9713 "addps %%xmm6, %%xmm10\n"
9714 "subps %%xmm6, %%xmm14\n"
9715 "movaps %%xmm3, %%xmm11\n"
9716 "movaps %%xmm3, %%xmm15\n"
9717 "addps %%xmm7, %%xmm11\n"
9718 "subps %%xmm7, %%xmm15\n"
9719 "movups %%xmm8, (%0)\n"
9720 "movups %%xmm9, (%1)\n"
9721 "movups %%xmm10, (%2)\n"
9722 "movups %%xmm11, (%3)\n"
9723 "movups %%xmm12, (%4)\n"
9724 "movups %%xmm13, (%5)\n"
9725 "movups %%xmm14, (%6)\n"
9726 "movups %%xmm15, (%7)\n"
9727 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9728 );
9729 }
9730 }
9731 return;
9732 }
9733 if (depth == 25) {
9734 helper_float_25_recursive(buf + 0, 23);
9735 helper_float_25_recursive(buf + 8388608, 23);
9736 helper_float_25_recursive(buf + 16777216, 23);
9737 helper_float_25_recursive(buf + 25165824, 23);
9738 for (int j = 0; j < 33554432; j += 33554432) {
9739 for (int k = 0; k < 8388608; k += 4) {
9740 __asm__ volatile (
9741 "movups (%0), %%xmm0\n"
9742 "movups (%1), %%xmm1\n"
9743 "movups (%2), %%xmm2\n"
9744 "movups (%3), %%xmm3\n"
9745 "movaps %%xmm0, %%xmm8\n"
9746 "movaps %%xmm0, %%xmm9\n"
9747 "addps %%xmm1, %%xmm8\n"
9748 "subps %%xmm1, %%xmm9\n"
9749 "movaps %%xmm2, %%xmm10\n"
9750 "movaps %%xmm2, %%xmm11\n"
9751 "addps %%xmm3, %%xmm10\n"
9752 "subps %%xmm3, %%xmm11\n"
9753 "movaps %%xmm8, %%xmm0\n"
9754 "movaps %%xmm8, %%xmm2\n"
9755 "addps %%xmm10, %%xmm0\n"
9756 "subps %%xmm10, %%xmm2\n"
9757 "movaps %%xmm9, %%xmm1\n"
9758 "movaps %%xmm9, %%xmm3\n"
9759 "addps %%xmm11, %%xmm1\n"
9760 "subps %%xmm11, %%xmm3\n"
9761 "movups %%xmm0, (%0)\n"
9762 "movups %%xmm1, (%1)\n"
9763 "movups %%xmm2, (%2)\n"
9764 "movups %%xmm3, (%3)\n"
9765 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9766 );
9767 }
9768 }
9769 return;
9770 }
9771 }
void helper_float_25(float *buf);
// Public entry point: in-place, unnormalized fast (Walsh-)Hadamard transform
// over 2^25 = 33554432 floats.  Delegates to the recursive SSE implementation,
// starting at its top-level depth-25 case (which splits into four depth-23
// sub-transforms followed by a radix-4 combining pass).
// NOTE(review): buf is assumed to point to at least 2^25 floats; the recursive
// worker reads/writes the full range unconditionally — confirm at call sites.
void helper_float_25(float *buf) {
    helper_float_25_recursive(buf, 25);
}
9776 void helper_float_26_recursive(float *buf, int depth);
helper_float_26_recursive(float * buf,int depth)9777 void helper_float_26_recursive(float *buf, int depth) {
9778 if (depth == 5) {
9779 for (int j = 0; j < 32; j += 32) {
9780 for (int k = 0; k < 4; k += 4) {
9781 __asm__ volatile (
9782 "movups (%0), %%xmm0\n"
9783 "movups (%1), %%xmm1\n"
9784 "movups (%2), %%xmm2\n"
9785 "movups (%3), %%xmm3\n"
9786 "movups (%4), %%xmm4\n"
9787 "movups (%5), %%xmm5\n"
9788 "movups (%6), %%xmm6\n"
9789 "movups (%7), %%xmm7\n"
9790 "movaps %%xmm0, %%xmm8\n"
9791 "shufps $160, %%xmm8, %%xmm8\n"
9792 "shufps $245, %%xmm0, %%xmm0\n"
9793 "xorps %%xmm9, %%xmm9\n"
9794 "subps %%xmm0, %%xmm9\n"
9795 "addsubps %%xmm9, %%xmm8\n"
9796 "movaps %%xmm8, %%xmm0\n"
9797 "movaps %%xmm1, %%xmm8\n"
9798 "shufps $160, %%xmm8, %%xmm8\n"
9799 "shufps $245, %%xmm1, %%xmm1\n"
9800 "xorps %%xmm9, %%xmm9\n"
9801 "subps %%xmm1, %%xmm9\n"
9802 "addsubps %%xmm9, %%xmm8\n"
9803 "movaps %%xmm8, %%xmm1\n"
9804 "movaps %%xmm2, %%xmm8\n"
9805 "shufps $160, %%xmm8, %%xmm8\n"
9806 "shufps $245, %%xmm2, %%xmm2\n"
9807 "xorps %%xmm9, %%xmm9\n"
9808 "subps %%xmm2, %%xmm9\n"
9809 "addsubps %%xmm9, %%xmm8\n"
9810 "movaps %%xmm8, %%xmm2\n"
9811 "movaps %%xmm3, %%xmm8\n"
9812 "shufps $160, %%xmm8, %%xmm8\n"
9813 "shufps $245, %%xmm3, %%xmm3\n"
9814 "xorps %%xmm9, %%xmm9\n"
9815 "subps %%xmm3, %%xmm9\n"
9816 "addsubps %%xmm9, %%xmm8\n"
9817 "movaps %%xmm8, %%xmm3\n"
9818 "movaps %%xmm4, %%xmm8\n"
9819 "shufps $160, %%xmm8, %%xmm8\n"
9820 "shufps $245, %%xmm4, %%xmm4\n"
9821 "xorps %%xmm9, %%xmm9\n"
9822 "subps %%xmm4, %%xmm9\n"
9823 "addsubps %%xmm9, %%xmm8\n"
9824 "movaps %%xmm8, %%xmm4\n"
9825 "movaps %%xmm5, %%xmm8\n"
9826 "shufps $160, %%xmm8, %%xmm8\n"
9827 "shufps $245, %%xmm5, %%xmm5\n"
9828 "xorps %%xmm9, %%xmm9\n"
9829 "subps %%xmm5, %%xmm9\n"
9830 "addsubps %%xmm9, %%xmm8\n"
9831 "movaps %%xmm8, %%xmm5\n"
9832 "movaps %%xmm6, %%xmm8\n"
9833 "shufps $160, %%xmm8, %%xmm8\n"
9834 "shufps $245, %%xmm6, %%xmm6\n"
9835 "xorps %%xmm9, %%xmm9\n"
9836 "subps %%xmm6, %%xmm9\n"
9837 "addsubps %%xmm9, %%xmm8\n"
9838 "movaps %%xmm8, %%xmm6\n"
9839 "movaps %%xmm7, %%xmm8\n"
9840 "shufps $160, %%xmm8, %%xmm8\n"
9841 "shufps $245, %%xmm7, %%xmm7\n"
9842 "xorps %%xmm9, %%xmm9\n"
9843 "subps %%xmm7, %%xmm9\n"
9844 "addsubps %%xmm9, %%xmm8\n"
9845 "movaps %%xmm8, %%xmm7\n"
9846 "movaps %%xmm0, %%xmm8\n"
9847 "shufps $68, %%xmm8, %%xmm8\n"
9848 "xorps %%xmm9, %%xmm9\n"
9849 "movaps %%xmm0, %%xmm10\n"
9850 "shufps $14, %%xmm9, %%xmm10\n"
9851 "movaps %%xmm0, %%xmm11\n"
9852 "shufps $224, %%xmm11, %%xmm9\n"
9853 "addps %%xmm8, %%xmm10\n"
9854 "subps %%xmm9, %%xmm10\n"
9855 "movaps %%xmm10, %%xmm0\n"
9856 "movaps %%xmm1, %%xmm8\n"
9857 "shufps $68, %%xmm8, %%xmm8\n"
9858 "xorps %%xmm9, %%xmm9\n"
9859 "movaps %%xmm1, %%xmm10\n"
9860 "shufps $14, %%xmm9, %%xmm10\n"
9861 "movaps %%xmm1, %%xmm11\n"
9862 "shufps $224, %%xmm11, %%xmm9\n"
9863 "addps %%xmm8, %%xmm10\n"
9864 "subps %%xmm9, %%xmm10\n"
9865 "movaps %%xmm10, %%xmm1\n"
9866 "movaps %%xmm2, %%xmm8\n"
9867 "shufps $68, %%xmm8, %%xmm8\n"
9868 "xorps %%xmm9, %%xmm9\n"
9869 "movaps %%xmm2, %%xmm10\n"
9870 "shufps $14, %%xmm9, %%xmm10\n"
9871 "movaps %%xmm2, %%xmm11\n"
9872 "shufps $224, %%xmm11, %%xmm9\n"
9873 "addps %%xmm8, %%xmm10\n"
9874 "subps %%xmm9, %%xmm10\n"
9875 "movaps %%xmm10, %%xmm2\n"
9876 "movaps %%xmm3, %%xmm8\n"
9877 "shufps $68, %%xmm8, %%xmm8\n"
9878 "xorps %%xmm9, %%xmm9\n"
9879 "movaps %%xmm3, %%xmm10\n"
9880 "shufps $14, %%xmm9, %%xmm10\n"
9881 "movaps %%xmm3, %%xmm11\n"
9882 "shufps $224, %%xmm11, %%xmm9\n"
9883 "addps %%xmm8, %%xmm10\n"
9884 "subps %%xmm9, %%xmm10\n"
9885 "movaps %%xmm10, %%xmm3\n"
9886 "movaps %%xmm4, %%xmm8\n"
9887 "shufps $68, %%xmm8, %%xmm8\n"
9888 "xorps %%xmm9, %%xmm9\n"
9889 "movaps %%xmm4, %%xmm10\n"
9890 "shufps $14, %%xmm9, %%xmm10\n"
9891 "movaps %%xmm4, %%xmm11\n"
9892 "shufps $224, %%xmm11, %%xmm9\n"
9893 "addps %%xmm8, %%xmm10\n"
9894 "subps %%xmm9, %%xmm10\n"
9895 "movaps %%xmm10, %%xmm4\n"
9896 "movaps %%xmm5, %%xmm8\n"
9897 "shufps $68, %%xmm8, %%xmm8\n"
9898 "xorps %%xmm9, %%xmm9\n"
9899 "movaps %%xmm5, %%xmm10\n"
9900 "shufps $14, %%xmm9, %%xmm10\n"
9901 "movaps %%xmm5, %%xmm11\n"
9902 "shufps $224, %%xmm11, %%xmm9\n"
9903 "addps %%xmm8, %%xmm10\n"
9904 "subps %%xmm9, %%xmm10\n"
9905 "movaps %%xmm10, %%xmm5\n"
9906 "movaps %%xmm6, %%xmm8\n"
9907 "shufps $68, %%xmm8, %%xmm8\n"
9908 "xorps %%xmm9, %%xmm9\n"
9909 "movaps %%xmm6, %%xmm10\n"
9910 "shufps $14, %%xmm9, %%xmm10\n"
9911 "movaps %%xmm6, %%xmm11\n"
9912 "shufps $224, %%xmm11, %%xmm9\n"
9913 "addps %%xmm8, %%xmm10\n"
9914 "subps %%xmm9, %%xmm10\n"
9915 "movaps %%xmm10, %%xmm6\n"
9916 "movaps %%xmm7, %%xmm8\n"
9917 "shufps $68, %%xmm8, %%xmm8\n"
9918 "xorps %%xmm9, %%xmm9\n"
9919 "movaps %%xmm7, %%xmm10\n"
9920 "shufps $14, %%xmm9, %%xmm10\n"
9921 "movaps %%xmm7, %%xmm11\n"
9922 "shufps $224, %%xmm11, %%xmm9\n"
9923 "addps %%xmm8, %%xmm10\n"
9924 "subps %%xmm9, %%xmm10\n"
9925 "movaps %%xmm10, %%xmm7\n"
9926 "movaps %%xmm0, %%xmm8\n"
9927 "movaps %%xmm0, %%xmm9\n"
9928 "addps %%xmm1, %%xmm8\n"
9929 "subps %%xmm1, %%xmm9\n"
9930 "movaps %%xmm2, %%xmm10\n"
9931 "movaps %%xmm2, %%xmm11\n"
9932 "addps %%xmm3, %%xmm10\n"
9933 "subps %%xmm3, %%xmm11\n"
9934 "movaps %%xmm4, %%xmm12\n"
9935 "movaps %%xmm4, %%xmm13\n"
9936 "addps %%xmm5, %%xmm12\n"
9937 "subps %%xmm5, %%xmm13\n"
9938 "movaps %%xmm6, %%xmm14\n"
9939 "movaps %%xmm6, %%xmm15\n"
9940 "addps %%xmm7, %%xmm14\n"
9941 "subps %%xmm7, %%xmm15\n"
9942 "movaps %%xmm8, %%xmm0\n"
9943 "movaps %%xmm8, %%xmm2\n"
9944 "addps %%xmm10, %%xmm0\n"
9945 "subps %%xmm10, %%xmm2\n"
9946 "movaps %%xmm9, %%xmm1\n"
9947 "movaps %%xmm9, %%xmm3\n"
9948 "addps %%xmm11, %%xmm1\n"
9949 "subps %%xmm11, %%xmm3\n"
9950 "movaps %%xmm12, %%xmm4\n"
9951 "movaps %%xmm12, %%xmm6\n"
9952 "addps %%xmm14, %%xmm4\n"
9953 "subps %%xmm14, %%xmm6\n"
9954 "movaps %%xmm13, %%xmm5\n"
9955 "movaps %%xmm13, %%xmm7\n"
9956 "addps %%xmm15, %%xmm5\n"
9957 "subps %%xmm15, %%xmm7\n"
9958 "movaps %%xmm0, %%xmm8\n"
9959 "movaps %%xmm0, %%xmm12\n"
9960 "addps %%xmm4, %%xmm8\n"
9961 "subps %%xmm4, %%xmm12\n"
9962 "movaps %%xmm1, %%xmm9\n"
9963 "movaps %%xmm1, %%xmm13\n"
9964 "addps %%xmm5, %%xmm9\n"
9965 "subps %%xmm5, %%xmm13\n"
9966 "movaps %%xmm2, %%xmm10\n"
9967 "movaps %%xmm2, %%xmm14\n"
9968 "addps %%xmm6, %%xmm10\n"
9969 "subps %%xmm6, %%xmm14\n"
9970 "movaps %%xmm3, %%xmm11\n"
9971 "movaps %%xmm3, %%xmm15\n"
9972 "addps %%xmm7, %%xmm11\n"
9973 "subps %%xmm7, %%xmm15\n"
9974 "movups %%xmm8, (%0)\n"
9975 "movups %%xmm9, (%1)\n"
9976 "movups %%xmm10, (%2)\n"
9977 "movups %%xmm11, (%3)\n"
9978 "movups %%xmm12, (%4)\n"
9979 "movups %%xmm13, (%5)\n"
9980 "movups %%xmm14, (%6)\n"
9981 "movups %%xmm15, (%7)\n"
9982 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
9983 );
9984 }
9985 }
9986 return;
9987 }
9988 if (depth == 8) {
9989 helper_float_26_recursive(buf + 0, 5);
9990 helper_float_26_recursive(buf + 32, 5);
9991 helper_float_26_recursive(buf + 64, 5);
9992 helper_float_26_recursive(buf + 96, 5);
9993 helper_float_26_recursive(buf + 128, 5);
9994 helper_float_26_recursive(buf + 160, 5);
9995 helper_float_26_recursive(buf + 192, 5);
9996 helper_float_26_recursive(buf + 224, 5);
9997 for (int j = 0; j < 256; j += 256) {
9998 for (int k = 0; k < 32; k += 4) {
9999 __asm__ volatile (
10000 "movups (%0), %%xmm0\n"
10001 "movups (%1), %%xmm1\n"
10002 "movups (%2), %%xmm2\n"
10003 "movups (%3), %%xmm3\n"
10004 "movups (%4), %%xmm4\n"
10005 "movups (%5), %%xmm5\n"
10006 "movups (%6), %%xmm6\n"
10007 "movups (%7), %%xmm7\n"
10008 "movaps %%xmm0, %%xmm8\n"
10009 "movaps %%xmm0, %%xmm9\n"
10010 "addps %%xmm1, %%xmm8\n"
10011 "subps %%xmm1, %%xmm9\n"
10012 "movaps %%xmm2, %%xmm10\n"
10013 "movaps %%xmm2, %%xmm11\n"
10014 "addps %%xmm3, %%xmm10\n"
10015 "subps %%xmm3, %%xmm11\n"
10016 "movaps %%xmm4, %%xmm12\n"
10017 "movaps %%xmm4, %%xmm13\n"
10018 "addps %%xmm5, %%xmm12\n"
10019 "subps %%xmm5, %%xmm13\n"
10020 "movaps %%xmm6, %%xmm14\n"
10021 "movaps %%xmm6, %%xmm15\n"
10022 "addps %%xmm7, %%xmm14\n"
10023 "subps %%xmm7, %%xmm15\n"
10024 "movaps %%xmm8, %%xmm0\n"
10025 "movaps %%xmm8, %%xmm2\n"
10026 "addps %%xmm10, %%xmm0\n"
10027 "subps %%xmm10, %%xmm2\n"
10028 "movaps %%xmm9, %%xmm1\n"
10029 "movaps %%xmm9, %%xmm3\n"
10030 "addps %%xmm11, %%xmm1\n"
10031 "subps %%xmm11, %%xmm3\n"
10032 "movaps %%xmm12, %%xmm4\n"
10033 "movaps %%xmm12, %%xmm6\n"
10034 "addps %%xmm14, %%xmm4\n"
10035 "subps %%xmm14, %%xmm6\n"
10036 "movaps %%xmm13, %%xmm5\n"
10037 "movaps %%xmm13, %%xmm7\n"
10038 "addps %%xmm15, %%xmm5\n"
10039 "subps %%xmm15, %%xmm7\n"
10040 "movaps %%xmm0, %%xmm8\n"
10041 "movaps %%xmm0, %%xmm12\n"
10042 "addps %%xmm4, %%xmm8\n"
10043 "subps %%xmm4, %%xmm12\n"
10044 "movaps %%xmm1, %%xmm9\n"
10045 "movaps %%xmm1, %%xmm13\n"
10046 "addps %%xmm5, %%xmm9\n"
10047 "subps %%xmm5, %%xmm13\n"
10048 "movaps %%xmm2, %%xmm10\n"
10049 "movaps %%xmm2, %%xmm14\n"
10050 "addps %%xmm6, %%xmm10\n"
10051 "subps %%xmm6, %%xmm14\n"
10052 "movaps %%xmm3, %%xmm11\n"
10053 "movaps %%xmm3, %%xmm15\n"
10054 "addps %%xmm7, %%xmm11\n"
10055 "subps %%xmm7, %%xmm15\n"
10056 "movups %%xmm8, (%0)\n"
10057 "movups %%xmm9, (%1)\n"
10058 "movups %%xmm10, (%2)\n"
10059 "movups %%xmm11, (%3)\n"
10060 "movups %%xmm12, (%4)\n"
10061 "movups %%xmm13, (%5)\n"
10062 "movups %%xmm14, (%6)\n"
10063 "movups %%xmm15, (%7)\n"
10064 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10065 );
10066 }
10067 }
10068 return;
10069 }
10070 if (depth == 11) {
10071 helper_float_26_recursive(buf + 0, 8);
10072 helper_float_26_recursive(buf + 256, 8);
10073 helper_float_26_recursive(buf + 512, 8);
10074 helper_float_26_recursive(buf + 768, 8);
10075 helper_float_26_recursive(buf + 1024, 8);
10076 helper_float_26_recursive(buf + 1280, 8);
10077 helper_float_26_recursive(buf + 1536, 8);
10078 helper_float_26_recursive(buf + 1792, 8);
10079 for (int j = 0; j < 2048; j += 2048) {
10080 for (int k = 0; k < 256; k += 4) {
10081 __asm__ volatile (
10082 "movups (%0), %%xmm0\n"
10083 "movups (%1), %%xmm1\n"
10084 "movups (%2), %%xmm2\n"
10085 "movups (%3), %%xmm3\n"
10086 "movups (%4), %%xmm4\n"
10087 "movups (%5), %%xmm5\n"
10088 "movups (%6), %%xmm6\n"
10089 "movups (%7), %%xmm7\n"
10090 "movaps %%xmm0, %%xmm8\n"
10091 "movaps %%xmm0, %%xmm9\n"
10092 "addps %%xmm1, %%xmm8\n"
10093 "subps %%xmm1, %%xmm9\n"
10094 "movaps %%xmm2, %%xmm10\n"
10095 "movaps %%xmm2, %%xmm11\n"
10096 "addps %%xmm3, %%xmm10\n"
10097 "subps %%xmm3, %%xmm11\n"
10098 "movaps %%xmm4, %%xmm12\n"
10099 "movaps %%xmm4, %%xmm13\n"
10100 "addps %%xmm5, %%xmm12\n"
10101 "subps %%xmm5, %%xmm13\n"
10102 "movaps %%xmm6, %%xmm14\n"
10103 "movaps %%xmm6, %%xmm15\n"
10104 "addps %%xmm7, %%xmm14\n"
10105 "subps %%xmm7, %%xmm15\n"
10106 "movaps %%xmm8, %%xmm0\n"
10107 "movaps %%xmm8, %%xmm2\n"
10108 "addps %%xmm10, %%xmm0\n"
10109 "subps %%xmm10, %%xmm2\n"
10110 "movaps %%xmm9, %%xmm1\n"
10111 "movaps %%xmm9, %%xmm3\n"
10112 "addps %%xmm11, %%xmm1\n"
10113 "subps %%xmm11, %%xmm3\n"
10114 "movaps %%xmm12, %%xmm4\n"
10115 "movaps %%xmm12, %%xmm6\n"
10116 "addps %%xmm14, %%xmm4\n"
10117 "subps %%xmm14, %%xmm6\n"
10118 "movaps %%xmm13, %%xmm5\n"
10119 "movaps %%xmm13, %%xmm7\n"
10120 "addps %%xmm15, %%xmm5\n"
10121 "subps %%xmm15, %%xmm7\n"
10122 "movaps %%xmm0, %%xmm8\n"
10123 "movaps %%xmm0, %%xmm12\n"
10124 "addps %%xmm4, %%xmm8\n"
10125 "subps %%xmm4, %%xmm12\n"
10126 "movaps %%xmm1, %%xmm9\n"
10127 "movaps %%xmm1, %%xmm13\n"
10128 "addps %%xmm5, %%xmm9\n"
10129 "subps %%xmm5, %%xmm13\n"
10130 "movaps %%xmm2, %%xmm10\n"
10131 "movaps %%xmm2, %%xmm14\n"
10132 "addps %%xmm6, %%xmm10\n"
10133 "subps %%xmm6, %%xmm14\n"
10134 "movaps %%xmm3, %%xmm11\n"
10135 "movaps %%xmm3, %%xmm15\n"
10136 "addps %%xmm7, %%xmm11\n"
10137 "subps %%xmm7, %%xmm15\n"
10138 "movups %%xmm8, (%0)\n"
10139 "movups %%xmm9, (%1)\n"
10140 "movups %%xmm10, (%2)\n"
10141 "movups %%xmm11, (%3)\n"
10142 "movups %%xmm12, (%4)\n"
10143 "movups %%xmm13, (%5)\n"
10144 "movups %%xmm14, (%6)\n"
10145 "movups %%xmm15, (%7)\n"
10146 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10147 );
10148 }
10149 }
10150 return;
10151 }
10152 if (depth == 14) {
10153 helper_float_26_recursive(buf + 0, 11);
10154 helper_float_26_recursive(buf + 2048, 11);
10155 helper_float_26_recursive(buf + 4096, 11);
10156 helper_float_26_recursive(buf + 6144, 11);
10157 helper_float_26_recursive(buf + 8192, 11);
10158 helper_float_26_recursive(buf + 10240, 11);
10159 helper_float_26_recursive(buf + 12288, 11);
10160 helper_float_26_recursive(buf + 14336, 11);
10161 for (int j = 0; j < 16384; j += 16384) {
10162 for (int k = 0; k < 2048; k += 4) {
10163 __asm__ volatile (
10164 "movups (%0), %%xmm0\n"
10165 "movups (%1), %%xmm1\n"
10166 "movups (%2), %%xmm2\n"
10167 "movups (%3), %%xmm3\n"
10168 "movups (%4), %%xmm4\n"
10169 "movups (%5), %%xmm5\n"
10170 "movups (%6), %%xmm6\n"
10171 "movups (%7), %%xmm7\n"
10172 "movaps %%xmm0, %%xmm8\n"
10173 "movaps %%xmm0, %%xmm9\n"
10174 "addps %%xmm1, %%xmm8\n"
10175 "subps %%xmm1, %%xmm9\n"
10176 "movaps %%xmm2, %%xmm10\n"
10177 "movaps %%xmm2, %%xmm11\n"
10178 "addps %%xmm3, %%xmm10\n"
10179 "subps %%xmm3, %%xmm11\n"
10180 "movaps %%xmm4, %%xmm12\n"
10181 "movaps %%xmm4, %%xmm13\n"
10182 "addps %%xmm5, %%xmm12\n"
10183 "subps %%xmm5, %%xmm13\n"
10184 "movaps %%xmm6, %%xmm14\n"
10185 "movaps %%xmm6, %%xmm15\n"
10186 "addps %%xmm7, %%xmm14\n"
10187 "subps %%xmm7, %%xmm15\n"
10188 "movaps %%xmm8, %%xmm0\n"
10189 "movaps %%xmm8, %%xmm2\n"
10190 "addps %%xmm10, %%xmm0\n"
10191 "subps %%xmm10, %%xmm2\n"
10192 "movaps %%xmm9, %%xmm1\n"
10193 "movaps %%xmm9, %%xmm3\n"
10194 "addps %%xmm11, %%xmm1\n"
10195 "subps %%xmm11, %%xmm3\n"
10196 "movaps %%xmm12, %%xmm4\n"
10197 "movaps %%xmm12, %%xmm6\n"
10198 "addps %%xmm14, %%xmm4\n"
10199 "subps %%xmm14, %%xmm6\n"
10200 "movaps %%xmm13, %%xmm5\n"
10201 "movaps %%xmm13, %%xmm7\n"
10202 "addps %%xmm15, %%xmm5\n"
10203 "subps %%xmm15, %%xmm7\n"
10204 "movaps %%xmm0, %%xmm8\n"
10205 "movaps %%xmm0, %%xmm12\n"
10206 "addps %%xmm4, %%xmm8\n"
10207 "subps %%xmm4, %%xmm12\n"
10208 "movaps %%xmm1, %%xmm9\n"
10209 "movaps %%xmm1, %%xmm13\n"
10210 "addps %%xmm5, %%xmm9\n"
10211 "subps %%xmm5, %%xmm13\n"
10212 "movaps %%xmm2, %%xmm10\n"
10213 "movaps %%xmm2, %%xmm14\n"
10214 "addps %%xmm6, %%xmm10\n"
10215 "subps %%xmm6, %%xmm14\n"
10216 "movaps %%xmm3, %%xmm11\n"
10217 "movaps %%xmm3, %%xmm15\n"
10218 "addps %%xmm7, %%xmm11\n"
10219 "subps %%xmm7, %%xmm15\n"
10220 "movups %%xmm8, (%0)\n"
10221 "movups %%xmm9, (%1)\n"
10222 "movups %%xmm10, (%2)\n"
10223 "movups %%xmm11, (%3)\n"
10224 "movups %%xmm12, (%4)\n"
10225 "movups %%xmm13, (%5)\n"
10226 "movups %%xmm14, (%6)\n"
10227 "movups %%xmm15, (%7)\n"
10228 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10229 );
10230 }
10231 }
10232 return;
10233 }
10234 if (depth == 17) {
10235 helper_float_26_recursive(buf + 0, 14);
10236 helper_float_26_recursive(buf + 16384, 14);
10237 helper_float_26_recursive(buf + 32768, 14);
10238 helper_float_26_recursive(buf + 49152, 14);
10239 helper_float_26_recursive(buf + 65536, 14);
10240 helper_float_26_recursive(buf + 81920, 14);
10241 helper_float_26_recursive(buf + 98304, 14);
10242 helper_float_26_recursive(buf + 114688, 14);
10243 for (int j = 0; j < 131072; j += 131072) {
10244 for (int k = 0; k < 16384; k += 4) {
10245 __asm__ volatile (
10246 "movups (%0), %%xmm0\n"
10247 "movups (%1), %%xmm1\n"
10248 "movups (%2), %%xmm2\n"
10249 "movups (%3), %%xmm3\n"
10250 "movups (%4), %%xmm4\n"
10251 "movups (%5), %%xmm5\n"
10252 "movups (%6), %%xmm6\n"
10253 "movups (%7), %%xmm7\n"
10254 "movaps %%xmm0, %%xmm8\n"
10255 "movaps %%xmm0, %%xmm9\n"
10256 "addps %%xmm1, %%xmm8\n"
10257 "subps %%xmm1, %%xmm9\n"
10258 "movaps %%xmm2, %%xmm10\n"
10259 "movaps %%xmm2, %%xmm11\n"
10260 "addps %%xmm3, %%xmm10\n"
10261 "subps %%xmm3, %%xmm11\n"
10262 "movaps %%xmm4, %%xmm12\n"
10263 "movaps %%xmm4, %%xmm13\n"
10264 "addps %%xmm5, %%xmm12\n"
10265 "subps %%xmm5, %%xmm13\n"
10266 "movaps %%xmm6, %%xmm14\n"
10267 "movaps %%xmm6, %%xmm15\n"
10268 "addps %%xmm7, %%xmm14\n"
10269 "subps %%xmm7, %%xmm15\n"
10270 "movaps %%xmm8, %%xmm0\n"
10271 "movaps %%xmm8, %%xmm2\n"
10272 "addps %%xmm10, %%xmm0\n"
10273 "subps %%xmm10, %%xmm2\n"
10274 "movaps %%xmm9, %%xmm1\n"
10275 "movaps %%xmm9, %%xmm3\n"
10276 "addps %%xmm11, %%xmm1\n"
10277 "subps %%xmm11, %%xmm3\n"
10278 "movaps %%xmm12, %%xmm4\n"
10279 "movaps %%xmm12, %%xmm6\n"
10280 "addps %%xmm14, %%xmm4\n"
10281 "subps %%xmm14, %%xmm6\n"
10282 "movaps %%xmm13, %%xmm5\n"
10283 "movaps %%xmm13, %%xmm7\n"
10284 "addps %%xmm15, %%xmm5\n"
10285 "subps %%xmm15, %%xmm7\n"
10286 "movaps %%xmm0, %%xmm8\n"
10287 "movaps %%xmm0, %%xmm12\n"
10288 "addps %%xmm4, %%xmm8\n"
10289 "subps %%xmm4, %%xmm12\n"
10290 "movaps %%xmm1, %%xmm9\n"
10291 "movaps %%xmm1, %%xmm13\n"
10292 "addps %%xmm5, %%xmm9\n"
10293 "subps %%xmm5, %%xmm13\n"
10294 "movaps %%xmm2, %%xmm10\n"
10295 "movaps %%xmm2, %%xmm14\n"
10296 "addps %%xmm6, %%xmm10\n"
10297 "subps %%xmm6, %%xmm14\n"
10298 "movaps %%xmm3, %%xmm11\n"
10299 "movaps %%xmm3, %%xmm15\n"
10300 "addps %%xmm7, %%xmm11\n"
10301 "subps %%xmm7, %%xmm15\n"
10302 "movups %%xmm8, (%0)\n"
10303 "movups %%xmm9, (%1)\n"
10304 "movups %%xmm10, (%2)\n"
10305 "movups %%xmm11, (%3)\n"
10306 "movups %%xmm12, (%4)\n"
10307 "movups %%xmm13, (%5)\n"
10308 "movups %%xmm14, (%6)\n"
10309 "movups %%xmm15, (%7)\n"
10310 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10311 );
10312 }
10313 }
10314 return;
10315 }
10316 if (depth == 20) {
10317 helper_float_26_recursive(buf + 0, 17);
10318 helper_float_26_recursive(buf + 131072, 17);
10319 helper_float_26_recursive(buf + 262144, 17);
10320 helper_float_26_recursive(buf + 393216, 17);
10321 helper_float_26_recursive(buf + 524288, 17);
10322 helper_float_26_recursive(buf + 655360, 17);
10323 helper_float_26_recursive(buf + 786432, 17);
10324 helper_float_26_recursive(buf + 917504, 17);
10325 for (int j = 0; j < 1048576; j += 1048576) {
10326 for (int k = 0; k < 131072; k += 4) {
10327 __asm__ volatile (
10328 "movups (%0), %%xmm0\n"
10329 "movups (%1), %%xmm1\n"
10330 "movups (%2), %%xmm2\n"
10331 "movups (%3), %%xmm3\n"
10332 "movups (%4), %%xmm4\n"
10333 "movups (%5), %%xmm5\n"
10334 "movups (%6), %%xmm6\n"
10335 "movups (%7), %%xmm7\n"
10336 "movaps %%xmm0, %%xmm8\n"
10337 "movaps %%xmm0, %%xmm9\n"
10338 "addps %%xmm1, %%xmm8\n"
10339 "subps %%xmm1, %%xmm9\n"
10340 "movaps %%xmm2, %%xmm10\n"
10341 "movaps %%xmm2, %%xmm11\n"
10342 "addps %%xmm3, %%xmm10\n"
10343 "subps %%xmm3, %%xmm11\n"
10344 "movaps %%xmm4, %%xmm12\n"
10345 "movaps %%xmm4, %%xmm13\n"
10346 "addps %%xmm5, %%xmm12\n"
10347 "subps %%xmm5, %%xmm13\n"
10348 "movaps %%xmm6, %%xmm14\n"
10349 "movaps %%xmm6, %%xmm15\n"
10350 "addps %%xmm7, %%xmm14\n"
10351 "subps %%xmm7, %%xmm15\n"
10352 "movaps %%xmm8, %%xmm0\n"
10353 "movaps %%xmm8, %%xmm2\n"
10354 "addps %%xmm10, %%xmm0\n"
10355 "subps %%xmm10, %%xmm2\n"
10356 "movaps %%xmm9, %%xmm1\n"
10357 "movaps %%xmm9, %%xmm3\n"
10358 "addps %%xmm11, %%xmm1\n"
10359 "subps %%xmm11, %%xmm3\n"
10360 "movaps %%xmm12, %%xmm4\n"
10361 "movaps %%xmm12, %%xmm6\n"
10362 "addps %%xmm14, %%xmm4\n"
10363 "subps %%xmm14, %%xmm6\n"
10364 "movaps %%xmm13, %%xmm5\n"
10365 "movaps %%xmm13, %%xmm7\n"
10366 "addps %%xmm15, %%xmm5\n"
10367 "subps %%xmm15, %%xmm7\n"
10368 "movaps %%xmm0, %%xmm8\n"
10369 "movaps %%xmm0, %%xmm12\n"
10370 "addps %%xmm4, %%xmm8\n"
10371 "subps %%xmm4, %%xmm12\n"
10372 "movaps %%xmm1, %%xmm9\n"
10373 "movaps %%xmm1, %%xmm13\n"
10374 "addps %%xmm5, %%xmm9\n"
10375 "subps %%xmm5, %%xmm13\n"
10376 "movaps %%xmm2, %%xmm10\n"
10377 "movaps %%xmm2, %%xmm14\n"
10378 "addps %%xmm6, %%xmm10\n"
10379 "subps %%xmm6, %%xmm14\n"
10380 "movaps %%xmm3, %%xmm11\n"
10381 "movaps %%xmm3, %%xmm15\n"
10382 "addps %%xmm7, %%xmm11\n"
10383 "subps %%xmm7, %%xmm15\n"
10384 "movups %%xmm8, (%0)\n"
10385 "movups %%xmm9, (%1)\n"
10386 "movups %%xmm10, (%2)\n"
10387 "movups %%xmm11, (%3)\n"
10388 "movups %%xmm12, (%4)\n"
10389 "movups %%xmm13, (%5)\n"
10390 "movups %%xmm14, (%6)\n"
10391 "movups %%xmm15, (%7)\n"
10392 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10393 );
10394 }
10395 }
10396 return;
10397 }
10398 if (depth == 23) {
10399 helper_float_26_recursive(buf + 0, 20);
10400 helper_float_26_recursive(buf + 1048576, 20);
10401 helper_float_26_recursive(buf + 2097152, 20);
10402 helper_float_26_recursive(buf + 3145728, 20);
10403 helper_float_26_recursive(buf + 4194304, 20);
10404 helper_float_26_recursive(buf + 5242880, 20);
10405 helper_float_26_recursive(buf + 6291456, 20);
10406 helper_float_26_recursive(buf + 7340032, 20);
10407 for (int j = 0; j < 8388608; j += 8388608) {
10408 for (int k = 0; k < 1048576; k += 4) {
10409 __asm__ volatile (
10410 "movups (%0), %%xmm0\n"
10411 "movups (%1), %%xmm1\n"
10412 "movups (%2), %%xmm2\n"
10413 "movups (%3), %%xmm3\n"
10414 "movups (%4), %%xmm4\n"
10415 "movups (%5), %%xmm5\n"
10416 "movups (%6), %%xmm6\n"
10417 "movups (%7), %%xmm7\n"
10418 "movaps %%xmm0, %%xmm8\n"
10419 "movaps %%xmm0, %%xmm9\n"
10420 "addps %%xmm1, %%xmm8\n"
10421 "subps %%xmm1, %%xmm9\n"
10422 "movaps %%xmm2, %%xmm10\n"
10423 "movaps %%xmm2, %%xmm11\n"
10424 "addps %%xmm3, %%xmm10\n"
10425 "subps %%xmm3, %%xmm11\n"
10426 "movaps %%xmm4, %%xmm12\n"
10427 "movaps %%xmm4, %%xmm13\n"
10428 "addps %%xmm5, %%xmm12\n"
10429 "subps %%xmm5, %%xmm13\n"
10430 "movaps %%xmm6, %%xmm14\n"
10431 "movaps %%xmm6, %%xmm15\n"
10432 "addps %%xmm7, %%xmm14\n"
10433 "subps %%xmm7, %%xmm15\n"
10434 "movaps %%xmm8, %%xmm0\n"
10435 "movaps %%xmm8, %%xmm2\n"
10436 "addps %%xmm10, %%xmm0\n"
10437 "subps %%xmm10, %%xmm2\n"
10438 "movaps %%xmm9, %%xmm1\n"
10439 "movaps %%xmm9, %%xmm3\n"
10440 "addps %%xmm11, %%xmm1\n"
10441 "subps %%xmm11, %%xmm3\n"
10442 "movaps %%xmm12, %%xmm4\n"
10443 "movaps %%xmm12, %%xmm6\n"
10444 "addps %%xmm14, %%xmm4\n"
10445 "subps %%xmm14, %%xmm6\n"
10446 "movaps %%xmm13, %%xmm5\n"
10447 "movaps %%xmm13, %%xmm7\n"
10448 "addps %%xmm15, %%xmm5\n"
10449 "subps %%xmm15, %%xmm7\n"
10450 "movaps %%xmm0, %%xmm8\n"
10451 "movaps %%xmm0, %%xmm12\n"
10452 "addps %%xmm4, %%xmm8\n"
10453 "subps %%xmm4, %%xmm12\n"
10454 "movaps %%xmm1, %%xmm9\n"
10455 "movaps %%xmm1, %%xmm13\n"
10456 "addps %%xmm5, %%xmm9\n"
10457 "subps %%xmm5, %%xmm13\n"
10458 "movaps %%xmm2, %%xmm10\n"
10459 "movaps %%xmm2, %%xmm14\n"
10460 "addps %%xmm6, %%xmm10\n"
10461 "subps %%xmm6, %%xmm14\n"
10462 "movaps %%xmm3, %%xmm11\n"
10463 "movaps %%xmm3, %%xmm15\n"
10464 "addps %%xmm7, %%xmm11\n"
10465 "subps %%xmm7, %%xmm15\n"
10466 "movups %%xmm8, (%0)\n"
10467 "movups %%xmm9, (%1)\n"
10468 "movups %%xmm10, (%2)\n"
10469 "movups %%xmm11, (%3)\n"
10470 "movups %%xmm12, (%4)\n"
10471 "movups %%xmm13, (%5)\n"
10472 "movups %%xmm14, (%6)\n"
10473 "movups %%xmm15, (%7)\n"
10474 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10475 );
10476 }
10477 }
10478 return;
10479 }
10480 if (depth == 26) {
10481 helper_float_26_recursive(buf + 0, 23);
10482 helper_float_26_recursive(buf + 8388608, 23);
10483 helper_float_26_recursive(buf + 16777216, 23);
10484 helper_float_26_recursive(buf + 25165824, 23);
10485 helper_float_26_recursive(buf + 33554432, 23);
10486 helper_float_26_recursive(buf + 41943040, 23);
10487 helper_float_26_recursive(buf + 50331648, 23);
10488 helper_float_26_recursive(buf + 58720256, 23);
10489 for (int j = 0; j < 67108864; j += 67108864) {
10490 for (int k = 0; k < 8388608; k += 4) {
10491 __asm__ volatile (
10492 "movups (%0), %%xmm0\n"
10493 "movups (%1), %%xmm1\n"
10494 "movups (%2), %%xmm2\n"
10495 "movups (%3), %%xmm3\n"
10496 "movups (%4), %%xmm4\n"
10497 "movups (%5), %%xmm5\n"
10498 "movups (%6), %%xmm6\n"
10499 "movups (%7), %%xmm7\n"
10500 "movaps %%xmm0, %%xmm8\n"
10501 "movaps %%xmm0, %%xmm9\n"
10502 "addps %%xmm1, %%xmm8\n"
10503 "subps %%xmm1, %%xmm9\n"
10504 "movaps %%xmm2, %%xmm10\n"
10505 "movaps %%xmm2, %%xmm11\n"
10506 "addps %%xmm3, %%xmm10\n"
10507 "subps %%xmm3, %%xmm11\n"
10508 "movaps %%xmm4, %%xmm12\n"
10509 "movaps %%xmm4, %%xmm13\n"
10510 "addps %%xmm5, %%xmm12\n"
10511 "subps %%xmm5, %%xmm13\n"
10512 "movaps %%xmm6, %%xmm14\n"
10513 "movaps %%xmm6, %%xmm15\n"
10514 "addps %%xmm7, %%xmm14\n"
10515 "subps %%xmm7, %%xmm15\n"
10516 "movaps %%xmm8, %%xmm0\n"
10517 "movaps %%xmm8, %%xmm2\n"
10518 "addps %%xmm10, %%xmm0\n"
10519 "subps %%xmm10, %%xmm2\n"
10520 "movaps %%xmm9, %%xmm1\n"
10521 "movaps %%xmm9, %%xmm3\n"
10522 "addps %%xmm11, %%xmm1\n"
10523 "subps %%xmm11, %%xmm3\n"
10524 "movaps %%xmm12, %%xmm4\n"
10525 "movaps %%xmm12, %%xmm6\n"
10526 "addps %%xmm14, %%xmm4\n"
10527 "subps %%xmm14, %%xmm6\n"
10528 "movaps %%xmm13, %%xmm5\n"
10529 "movaps %%xmm13, %%xmm7\n"
10530 "addps %%xmm15, %%xmm5\n"
10531 "subps %%xmm15, %%xmm7\n"
10532 "movaps %%xmm0, %%xmm8\n"
10533 "movaps %%xmm0, %%xmm12\n"
10534 "addps %%xmm4, %%xmm8\n"
10535 "subps %%xmm4, %%xmm12\n"
10536 "movaps %%xmm1, %%xmm9\n"
10537 "movaps %%xmm1, %%xmm13\n"
10538 "addps %%xmm5, %%xmm9\n"
10539 "subps %%xmm5, %%xmm13\n"
10540 "movaps %%xmm2, %%xmm10\n"
10541 "movaps %%xmm2, %%xmm14\n"
10542 "addps %%xmm6, %%xmm10\n"
10543 "subps %%xmm6, %%xmm14\n"
10544 "movaps %%xmm3, %%xmm11\n"
10545 "movaps %%xmm3, %%xmm15\n"
10546 "addps %%xmm7, %%xmm11\n"
10547 "subps %%xmm7, %%xmm15\n"
10548 "movups %%xmm8, (%0)\n"
10549 "movups %%xmm9, (%1)\n"
10550 "movups %%xmm10, (%2)\n"
10551 "movups %%xmm11, (%3)\n"
10552 "movups %%xmm12, (%4)\n"
10553 "movups %%xmm13, (%5)\n"
10554 "movups %%xmm14, (%6)\n"
10555 "movups %%xmm15, (%7)\n"
10556 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10557 );
10558 }
10559 }
10560 return;
10561 }
10562 }
void helper_float_26(float *buf);
/*
 * Public entry point for the length-2^26 in-place float transform
 * (a Hadamard-style butterfly network, per the fht.h include and the
 * add/sub butterfly structure of the helpers in this file).
 *
 * buf must point to 2^26 = 67108864 floats; the result overwrites buf.
 * All real work is delegated to the recursive driver at full depth.
 */
void helper_float_26(float *buf) {
    const int full_depth = 26; /* log2 of the transform length */
    helper_float_26_recursive(buf, full_depth);
}
10567 void helper_float_27_recursive(float *buf, int depth);
helper_float_27_recursive(float * buf,int depth)10568 void helper_float_27_recursive(float *buf, int depth) {
10569 if (depth == 12) {
10570 for (int j = 0; j < 4096; j += 32) {
10571 for (int k = 0; k < 4; k += 4) {
10572 __asm__ volatile (
10573 "movups (%0), %%xmm0\n"
10574 "movups (%1), %%xmm1\n"
10575 "movups (%2), %%xmm2\n"
10576 "movups (%3), %%xmm3\n"
10577 "movups (%4), %%xmm4\n"
10578 "movups (%5), %%xmm5\n"
10579 "movups (%6), %%xmm6\n"
10580 "movups (%7), %%xmm7\n"
10581 "movaps %%xmm0, %%xmm8\n"
10582 "shufps $160, %%xmm8, %%xmm8\n"
10583 "shufps $245, %%xmm0, %%xmm0\n"
10584 "xorps %%xmm9, %%xmm9\n"
10585 "subps %%xmm0, %%xmm9\n"
10586 "addsubps %%xmm9, %%xmm8\n"
10587 "movaps %%xmm8, %%xmm0\n"
10588 "movaps %%xmm1, %%xmm8\n"
10589 "shufps $160, %%xmm8, %%xmm8\n"
10590 "shufps $245, %%xmm1, %%xmm1\n"
10591 "xorps %%xmm9, %%xmm9\n"
10592 "subps %%xmm1, %%xmm9\n"
10593 "addsubps %%xmm9, %%xmm8\n"
10594 "movaps %%xmm8, %%xmm1\n"
10595 "movaps %%xmm2, %%xmm8\n"
10596 "shufps $160, %%xmm8, %%xmm8\n"
10597 "shufps $245, %%xmm2, %%xmm2\n"
10598 "xorps %%xmm9, %%xmm9\n"
10599 "subps %%xmm2, %%xmm9\n"
10600 "addsubps %%xmm9, %%xmm8\n"
10601 "movaps %%xmm8, %%xmm2\n"
10602 "movaps %%xmm3, %%xmm8\n"
10603 "shufps $160, %%xmm8, %%xmm8\n"
10604 "shufps $245, %%xmm3, %%xmm3\n"
10605 "xorps %%xmm9, %%xmm9\n"
10606 "subps %%xmm3, %%xmm9\n"
10607 "addsubps %%xmm9, %%xmm8\n"
10608 "movaps %%xmm8, %%xmm3\n"
10609 "movaps %%xmm4, %%xmm8\n"
10610 "shufps $160, %%xmm8, %%xmm8\n"
10611 "shufps $245, %%xmm4, %%xmm4\n"
10612 "xorps %%xmm9, %%xmm9\n"
10613 "subps %%xmm4, %%xmm9\n"
10614 "addsubps %%xmm9, %%xmm8\n"
10615 "movaps %%xmm8, %%xmm4\n"
10616 "movaps %%xmm5, %%xmm8\n"
10617 "shufps $160, %%xmm8, %%xmm8\n"
10618 "shufps $245, %%xmm5, %%xmm5\n"
10619 "xorps %%xmm9, %%xmm9\n"
10620 "subps %%xmm5, %%xmm9\n"
10621 "addsubps %%xmm9, %%xmm8\n"
10622 "movaps %%xmm8, %%xmm5\n"
10623 "movaps %%xmm6, %%xmm8\n"
10624 "shufps $160, %%xmm8, %%xmm8\n"
10625 "shufps $245, %%xmm6, %%xmm6\n"
10626 "xorps %%xmm9, %%xmm9\n"
10627 "subps %%xmm6, %%xmm9\n"
10628 "addsubps %%xmm9, %%xmm8\n"
10629 "movaps %%xmm8, %%xmm6\n"
10630 "movaps %%xmm7, %%xmm8\n"
10631 "shufps $160, %%xmm8, %%xmm8\n"
10632 "shufps $245, %%xmm7, %%xmm7\n"
10633 "xorps %%xmm9, %%xmm9\n"
10634 "subps %%xmm7, %%xmm9\n"
10635 "addsubps %%xmm9, %%xmm8\n"
10636 "movaps %%xmm8, %%xmm7\n"
10637 "movaps %%xmm0, %%xmm8\n"
10638 "shufps $68, %%xmm8, %%xmm8\n"
10639 "xorps %%xmm9, %%xmm9\n"
10640 "movaps %%xmm0, %%xmm10\n"
10641 "shufps $14, %%xmm9, %%xmm10\n"
10642 "movaps %%xmm0, %%xmm11\n"
10643 "shufps $224, %%xmm11, %%xmm9\n"
10644 "addps %%xmm8, %%xmm10\n"
10645 "subps %%xmm9, %%xmm10\n"
10646 "movaps %%xmm10, %%xmm0\n"
10647 "movaps %%xmm1, %%xmm8\n"
10648 "shufps $68, %%xmm8, %%xmm8\n"
10649 "xorps %%xmm9, %%xmm9\n"
10650 "movaps %%xmm1, %%xmm10\n"
10651 "shufps $14, %%xmm9, %%xmm10\n"
10652 "movaps %%xmm1, %%xmm11\n"
10653 "shufps $224, %%xmm11, %%xmm9\n"
10654 "addps %%xmm8, %%xmm10\n"
10655 "subps %%xmm9, %%xmm10\n"
10656 "movaps %%xmm10, %%xmm1\n"
10657 "movaps %%xmm2, %%xmm8\n"
10658 "shufps $68, %%xmm8, %%xmm8\n"
10659 "xorps %%xmm9, %%xmm9\n"
10660 "movaps %%xmm2, %%xmm10\n"
10661 "shufps $14, %%xmm9, %%xmm10\n"
10662 "movaps %%xmm2, %%xmm11\n"
10663 "shufps $224, %%xmm11, %%xmm9\n"
10664 "addps %%xmm8, %%xmm10\n"
10665 "subps %%xmm9, %%xmm10\n"
10666 "movaps %%xmm10, %%xmm2\n"
10667 "movaps %%xmm3, %%xmm8\n"
10668 "shufps $68, %%xmm8, %%xmm8\n"
10669 "xorps %%xmm9, %%xmm9\n"
10670 "movaps %%xmm3, %%xmm10\n"
10671 "shufps $14, %%xmm9, %%xmm10\n"
10672 "movaps %%xmm3, %%xmm11\n"
10673 "shufps $224, %%xmm11, %%xmm9\n"
10674 "addps %%xmm8, %%xmm10\n"
10675 "subps %%xmm9, %%xmm10\n"
10676 "movaps %%xmm10, %%xmm3\n"
10677 "movaps %%xmm4, %%xmm8\n"
10678 "shufps $68, %%xmm8, %%xmm8\n"
10679 "xorps %%xmm9, %%xmm9\n"
10680 "movaps %%xmm4, %%xmm10\n"
10681 "shufps $14, %%xmm9, %%xmm10\n"
10682 "movaps %%xmm4, %%xmm11\n"
10683 "shufps $224, %%xmm11, %%xmm9\n"
10684 "addps %%xmm8, %%xmm10\n"
10685 "subps %%xmm9, %%xmm10\n"
10686 "movaps %%xmm10, %%xmm4\n"
10687 "movaps %%xmm5, %%xmm8\n"
10688 "shufps $68, %%xmm8, %%xmm8\n"
10689 "xorps %%xmm9, %%xmm9\n"
10690 "movaps %%xmm5, %%xmm10\n"
10691 "shufps $14, %%xmm9, %%xmm10\n"
10692 "movaps %%xmm5, %%xmm11\n"
10693 "shufps $224, %%xmm11, %%xmm9\n"
10694 "addps %%xmm8, %%xmm10\n"
10695 "subps %%xmm9, %%xmm10\n"
10696 "movaps %%xmm10, %%xmm5\n"
10697 "movaps %%xmm6, %%xmm8\n"
10698 "shufps $68, %%xmm8, %%xmm8\n"
10699 "xorps %%xmm9, %%xmm9\n"
10700 "movaps %%xmm6, %%xmm10\n"
10701 "shufps $14, %%xmm9, %%xmm10\n"
10702 "movaps %%xmm6, %%xmm11\n"
10703 "shufps $224, %%xmm11, %%xmm9\n"
10704 "addps %%xmm8, %%xmm10\n"
10705 "subps %%xmm9, %%xmm10\n"
10706 "movaps %%xmm10, %%xmm6\n"
10707 "movaps %%xmm7, %%xmm8\n"
10708 "shufps $68, %%xmm8, %%xmm8\n"
10709 "xorps %%xmm9, %%xmm9\n"
10710 "movaps %%xmm7, %%xmm10\n"
10711 "shufps $14, %%xmm9, %%xmm10\n"
10712 "movaps %%xmm7, %%xmm11\n"
10713 "shufps $224, %%xmm11, %%xmm9\n"
10714 "addps %%xmm8, %%xmm10\n"
10715 "subps %%xmm9, %%xmm10\n"
10716 "movaps %%xmm10, %%xmm7\n"
10717 "movaps %%xmm0, %%xmm8\n"
10718 "movaps %%xmm0, %%xmm9\n"
10719 "addps %%xmm1, %%xmm8\n"
10720 "subps %%xmm1, %%xmm9\n"
10721 "movaps %%xmm2, %%xmm10\n"
10722 "movaps %%xmm2, %%xmm11\n"
10723 "addps %%xmm3, %%xmm10\n"
10724 "subps %%xmm3, %%xmm11\n"
10725 "movaps %%xmm4, %%xmm12\n"
10726 "movaps %%xmm4, %%xmm13\n"
10727 "addps %%xmm5, %%xmm12\n"
10728 "subps %%xmm5, %%xmm13\n"
10729 "movaps %%xmm6, %%xmm14\n"
10730 "movaps %%xmm6, %%xmm15\n"
10731 "addps %%xmm7, %%xmm14\n"
10732 "subps %%xmm7, %%xmm15\n"
10733 "movaps %%xmm8, %%xmm0\n"
10734 "movaps %%xmm8, %%xmm2\n"
10735 "addps %%xmm10, %%xmm0\n"
10736 "subps %%xmm10, %%xmm2\n"
10737 "movaps %%xmm9, %%xmm1\n"
10738 "movaps %%xmm9, %%xmm3\n"
10739 "addps %%xmm11, %%xmm1\n"
10740 "subps %%xmm11, %%xmm3\n"
10741 "movaps %%xmm12, %%xmm4\n"
10742 "movaps %%xmm12, %%xmm6\n"
10743 "addps %%xmm14, %%xmm4\n"
10744 "subps %%xmm14, %%xmm6\n"
10745 "movaps %%xmm13, %%xmm5\n"
10746 "movaps %%xmm13, %%xmm7\n"
10747 "addps %%xmm15, %%xmm5\n"
10748 "subps %%xmm15, %%xmm7\n"
10749 "movaps %%xmm0, %%xmm8\n"
10750 "movaps %%xmm0, %%xmm12\n"
10751 "addps %%xmm4, %%xmm8\n"
10752 "subps %%xmm4, %%xmm12\n"
10753 "movaps %%xmm1, %%xmm9\n"
10754 "movaps %%xmm1, %%xmm13\n"
10755 "addps %%xmm5, %%xmm9\n"
10756 "subps %%xmm5, %%xmm13\n"
10757 "movaps %%xmm2, %%xmm10\n"
10758 "movaps %%xmm2, %%xmm14\n"
10759 "addps %%xmm6, %%xmm10\n"
10760 "subps %%xmm6, %%xmm14\n"
10761 "movaps %%xmm3, %%xmm11\n"
10762 "movaps %%xmm3, %%xmm15\n"
10763 "addps %%xmm7, %%xmm11\n"
10764 "subps %%xmm7, %%xmm15\n"
10765 "movups %%xmm8, (%0)\n"
10766 "movups %%xmm9, (%1)\n"
10767 "movups %%xmm10, (%2)\n"
10768 "movups %%xmm11, (%3)\n"
10769 "movups %%xmm12, (%4)\n"
10770 "movups %%xmm13, (%5)\n"
10771 "movups %%xmm14, (%6)\n"
10772 "movups %%xmm15, (%7)\n"
10773 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10774 );
10775 }
10776 }
10777 for (int j = 0; j < 4096; j += 256) {
10778 for (int k = 0; k < 32; k += 4) {
10779 __asm__ volatile (
10780 "movups (%0), %%xmm0\n"
10781 "movups (%1), %%xmm1\n"
10782 "movups (%2), %%xmm2\n"
10783 "movups (%3), %%xmm3\n"
10784 "movups (%4), %%xmm4\n"
10785 "movups (%5), %%xmm5\n"
10786 "movups (%6), %%xmm6\n"
10787 "movups (%7), %%xmm7\n"
10788 "movaps %%xmm0, %%xmm8\n"
10789 "movaps %%xmm0, %%xmm9\n"
10790 "addps %%xmm1, %%xmm8\n"
10791 "subps %%xmm1, %%xmm9\n"
10792 "movaps %%xmm2, %%xmm10\n"
10793 "movaps %%xmm2, %%xmm11\n"
10794 "addps %%xmm3, %%xmm10\n"
10795 "subps %%xmm3, %%xmm11\n"
10796 "movaps %%xmm4, %%xmm12\n"
10797 "movaps %%xmm4, %%xmm13\n"
10798 "addps %%xmm5, %%xmm12\n"
10799 "subps %%xmm5, %%xmm13\n"
10800 "movaps %%xmm6, %%xmm14\n"
10801 "movaps %%xmm6, %%xmm15\n"
10802 "addps %%xmm7, %%xmm14\n"
10803 "subps %%xmm7, %%xmm15\n"
10804 "movaps %%xmm8, %%xmm0\n"
10805 "movaps %%xmm8, %%xmm2\n"
10806 "addps %%xmm10, %%xmm0\n"
10807 "subps %%xmm10, %%xmm2\n"
10808 "movaps %%xmm9, %%xmm1\n"
10809 "movaps %%xmm9, %%xmm3\n"
10810 "addps %%xmm11, %%xmm1\n"
10811 "subps %%xmm11, %%xmm3\n"
10812 "movaps %%xmm12, %%xmm4\n"
10813 "movaps %%xmm12, %%xmm6\n"
10814 "addps %%xmm14, %%xmm4\n"
10815 "subps %%xmm14, %%xmm6\n"
10816 "movaps %%xmm13, %%xmm5\n"
10817 "movaps %%xmm13, %%xmm7\n"
10818 "addps %%xmm15, %%xmm5\n"
10819 "subps %%xmm15, %%xmm7\n"
10820 "movaps %%xmm0, %%xmm8\n"
10821 "movaps %%xmm0, %%xmm12\n"
10822 "addps %%xmm4, %%xmm8\n"
10823 "subps %%xmm4, %%xmm12\n"
10824 "movaps %%xmm1, %%xmm9\n"
10825 "movaps %%xmm1, %%xmm13\n"
10826 "addps %%xmm5, %%xmm9\n"
10827 "subps %%xmm5, %%xmm13\n"
10828 "movaps %%xmm2, %%xmm10\n"
10829 "movaps %%xmm2, %%xmm14\n"
10830 "addps %%xmm6, %%xmm10\n"
10831 "subps %%xmm6, %%xmm14\n"
10832 "movaps %%xmm3, %%xmm11\n"
10833 "movaps %%xmm3, %%xmm15\n"
10834 "addps %%xmm7, %%xmm11\n"
10835 "subps %%xmm7, %%xmm15\n"
10836 "movups %%xmm8, (%0)\n"
10837 "movups %%xmm9, (%1)\n"
10838 "movups %%xmm10, (%2)\n"
10839 "movups %%xmm11, (%3)\n"
10840 "movups %%xmm12, (%4)\n"
10841 "movups %%xmm13, (%5)\n"
10842 "movups %%xmm14, (%6)\n"
10843 "movups %%xmm15, (%7)\n"
10844 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10845 );
10846 }
10847 }
10848 for (int j = 0; j < 4096; j += 2048) {
10849 for (int k = 0; k < 256; k += 4) {
10850 __asm__ volatile (
10851 "movups (%0), %%xmm0\n"
10852 "movups (%1), %%xmm1\n"
10853 "movups (%2), %%xmm2\n"
10854 "movups (%3), %%xmm3\n"
10855 "movups (%4), %%xmm4\n"
10856 "movups (%5), %%xmm5\n"
10857 "movups (%6), %%xmm6\n"
10858 "movups (%7), %%xmm7\n"
10859 "movaps %%xmm0, %%xmm8\n"
10860 "movaps %%xmm0, %%xmm9\n"
10861 "addps %%xmm1, %%xmm8\n"
10862 "subps %%xmm1, %%xmm9\n"
10863 "movaps %%xmm2, %%xmm10\n"
10864 "movaps %%xmm2, %%xmm11\n"
10865 "addps %%xmm3, %%xmm10\n"
10866 "subps %%xmm3, %%xmm11\n"
10867 "movaps %%xmm4, %%xmm12\n"
10868 "movaps %%xmm4, %%xmm13\n"
10869 "addps %%xmm5, %%xmm12\n"
10870 "subps %%xmm5, %%xmm13\n"
10871 "movaps %%xmm6, %%xmm14\n"
10872 "movaps %%xmm6, %%xmm15\n"
10873 "addps %%xmm7, %%xmm14\n"
10874 "subps %%xmm7, %%xmm15\n"
10875 "movaps %%xmm8, %%xmm0\n"
10876 "movaps %%xmm8, %%xmm2\n"
10877 "addps %%xmm10, %%xmm0\n"
10878 "subps %%xmm10, %%xmm2\n"
10879 "movaps %%xmm9, %%xmm1\n"
10880 "movaps %%xmm9, %%xmm3\n"
10881 "addps %%xmm11, %%xmm1\n"
10882 "subps %%xmm11, %%xmm3\n"
10883 "movaps %%xmm12, %%xmm4\n"
10884 "movaps %%xmm12, %%xmm6\n"
10885 "addps %%xmm14, %%xmm4\n"
10886 "subps %%xmm14, %%xmm6\n"
10887 "movaps %%xmm13, %%xmm5\n"
10888 "movaps %%xmm13, %%xmm7\n"
10889 "addps %%xmm15, %%xmm5\n"
10890 "subps %%xmm15, %%xmm7\n"
10891 "movaps %%xmm0, %%xmm8\n"
10892 "movaps %%xmm0, %%xmm12\n"
10893 "addps %%xmm4, %%xmm8\n"
10894 "subps %%xmm4, %%xmm12\n"
10895 "movaps %%xmm1, %%xmm9\n"
10896 "movaps %%xmm1, %%xmm13\n"
10897 "addps %%xmm5, %%xmm9\n"
10898 "subps %%xmm5, %%xmm13\n"
10899 "movaps %%xmm2, %%xmm10\n"
10900 "movaps %%xmm2, %%xmm14\n"
10901 "addps %%xmm6, %%xmm10\n"
10902 "subps %%xmm6, %%xmm14\n"
10903 "movaps %%xmm3, %%xmm11\n"
10904 "movaps %%xmm3, %%xmm15\n"
10905 "addps %%xmm7, %%xmm11\n"
10906 "subps %%xmm7, %%xmm15\n"
10907 "movups %%xmm8, (%0)\n"
10908 "movups %%xmm9, (%1)\n"
10909 "movups %%xmm10, (%2)\n"
10910 "movups %%xmm11, (%3)\n"
10911 "movups %%xmm12, (%4)\n"
10912 "movups %%xmm13, (%5)\n"
10913 "movups %%xmm14, (%6)\n"
10914 "movups %%xmm15, (%7)\n"
10915 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10916 );
10917 }
10918 }
10919 for (int j = 0; j < 4096; j += 4096) {
10920 for (int k = 0; k < 2048; k += 4) {
10921 __asm__ volatile (
10922 "movups (%0), %%xmm0\n"
10923 "movups (%1), %%xmm1\n"
10924 "movaps %%xmm0, %%xmm8\n"
10925 "movaps %%xmm0, %%xmm9\n"
10926 "addps %%xmm1, %%xmm8\n"
10927 "subps %%xmm1, %%xmm9\n"
10928 "movups %%xmm8, (%0)\n"
10929 "movups %%xmm9, (%1)\n"
10930 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
10931 );
10932 }
10933 }
10934 return;
10935 }
10936 if (depth == 15) {
10937 helper_float_27_recursive(buf + 0, 12);
10938 helper_float_27_recursive(buf + 4096, 12);
10939 helper_float_27_recursive(buf + 8192, 12);
10940 helper_float_27_recursive(buf + 12288, 12);
10941 helper_float_27_recursive(buf + 16384, 12);
10942 helper_float_27_recursive(buf + 20480, 12);
10943 helper_float_27_recursive(buf + 24576, 12);
10944 helper_float_27_recursive(buf + 28672, 12);
10945 for (int j = 0; j < 32768; j += 32768) {
10946 for (int k = 0; k < 4096; k += 4) {
10947 __asm__ volatile (
10948 "movups (%0), %%xmm0\n"
10949 "movups (%1), %%xmm1\n"
10950 "movups (%2), %%xmm2\n"
10951 "movups (%3), %%xmm3\n"
10952 "movups (%4), %%xmm4\n"
10953 "movups (%5), %%xmm5\n"
10954 "movups (%6), %%xmm6\n"
10955 "movups (%7), %%xmm7\n"
10956 "movaps %%xmm0, %%xmm8\n"
10957 "movaps %%xmm0, %%xmm9\n"
10958 "addps %%xmm1, %%xmm8\n"
10959 "subps %%xmm1, %%xmm9\n"
10960 "movaps %%xmm2, %%xmm10\n"
10961 "movaps %%xmm2, %%xmm11\n"
10962 "addps %%xmm3, %%xmm10\n"
10963 "subps %%xmm3, %%xmm11\n"
10964 "movaps %%xmm4, %%xmm12\n"
10965 "movaps %%xmm4, %%xmm13\n"
10966 "addps %%xmm5, %%xmm12\n"
10967 "subps %%xmm5, %%xmm13\n"
10968 "movaps %%xmm6, %%xmm14\n"
10969 "movaps %%xmm6, %%xmm15\n"
10970 "addps %%xmm7, %%xmm14\n"
10971 "subps %%xmm7, %%xmm15\n"
10972 "movaps %%xmm8, %%xmm0\n"
10973 "movaps %%xmm8, %%xmm2\n"
10974 "addps %%xmm10, %%xmm0\n"
10975 "subps %%xmm10, %%xmm2\n"
10976 "movaps %%xmm9, %%xmm1\n"
10977 "movaps %%xmm9, %%xmm3\n"
10978 "addps %%xmm11, %%xmm1\n"
10979 "subps %%xmm11, %%xmm3\n"
10980 "movaps %%xmm12, %%xmm4\n"
10981 "movaps %%xmm12, %%xmm6\n"
10982 "addps %%xmm14, %%xmm4\n"
10983 "subps %%xmm14, %%xmm6\n"
10984 "movaps %%xmm13, %%xmm5\n"
10985 "movaps %%xmm13, %%xmm7\n"
10986 "addps %%xmm15, %%xmm5\n"
10987 "subps %%xmm15, %%xmm7\n"
10988 "movaps %%xmm0, %%xmm8\n"
10989 "movaps %%xmm0, %%xmm12\n"
10990 "addps %%xmm4, %%xmm8\n"
10991 "subps %%xmm4, %%xmm12\n"
10992 "movaps %%xmm1, %%xmm9\n"
10993 "movaps %%xmm1, %%xmm13\n"
10994 "addps %%xmm5, %%xmm9\n"
10995 "subps %%xmm5, %%xmm13\n"
10996 "movaps %%xmm2, %%xmm10\n"
10997 "movaps %%xmm2, %%xmm14\n"
10998 "addps %%xmm6, %%xmm10\n"
10999 "subps %%xmm6, %%xmm14\n"
11000 "movaps %%xmm3, %%xmm11\n"
11001 "movaps %%xmm3, %%xmm15\n"
11002 "addps %%xmm7, %%xmm11\n"
11003 "subps %%xmm7, %%xmm15\n"
11004 "movups %%xmm8, (%0)\n"
11005 "movups %%xmm9, (%1)\n"
11006 "movups %%xmm10, (%2)\n"
11007 "movups %%xmm11, (%3)\n"
11008 "movups %%xmm12, (%4)\n"
11009 "movups %%xmm13, (%5)\n"
11010 "movups %%xmm14, (%6)\n"
11011 "movups %%xmm15, (%7)\n"
11012 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11013 );
11014 }
11015 }
11016 return;
11017 }
11018 if (depth == 18) {
11019 helper_float_27_recursive(buf + 0, 15);
11020 helper_float_27_recursive(buf + 32768, 15);
11021 helper_float_27_recursive(buf + 65536, 15);
11022 helper_float_27_recursive(buf + 98304, 15);
11023 helper_float_27_recursive(buf + 131072, 15);
11024 helper_float_27_recursive(buf + 163840, 15);
11025 helper_float_27_recursive(buf + 196608, 15);
11026 helper_float_27_recursive(buf + 229376, 15);
11027 for (int j = 0; j < 262144; j += 262144) {
11028 for (int k = 0; k < 32768; k += 4) {
11029 __asm__ volatile (
11030 "movups (%0), %%xmm0\n"
11031 "movups (%1), %%xmm1\n"
11032 "movups (%2), %%xmm2\n"
11033 "movups (%3), %%xmm3\n"
11034 "movups (%4), %%xmm4\n"
11035 "movups (%5), %%xmm5\n"
11036 "movups (%6), %%xmm6\n"
11037 "movups (%7), %%xmm7\n"
11038 "movaps %%xmm0, %%xmm8\n"
11039 "movaps %%xmm0, %%xmm9\n"
11040 "addps %%xmm1, %%xmm8\n"
11041 "subps %%xmm1, %%xmm9\n"
11042 "movaps %%xmm2, %%xmm10\n"
11043 "movaps %%xmm2, %%xmm11\n"
11044 "addps %%xmm3, %%xmm10\n"
11045 "subps %%xmm3, %%xmm11\n"
11046 "movaps %%xmm4, %%xmm12\n"
11047 "movaps %%xmm4, %%xmm13\n"
11048 "addps %%xmm5, %%xmm12\n"
11049 "subps %%xmm5, %%xmm13\n"
11050 "movaps %%xmm6, %%xmm14\n"
11051 "movaps %%xmm6, %%xmm15\n"
11052 "addps %%xmm7, %%xmm14\n"
11053 "subps %%xmm7, %%xmm15\n"
11054 "movaps %%xmm8, %%xmm0\n"
11055 "movaps %%xmm8, %%xmm2\n"
11056 "addps %%xmm10, %%xmm0\n"
11057 "subps %%xmm10, %%xmm2\n"
11058 "movaps %%xmm9, %%xmm1\n"
11059 "movaps %%xmm9, %%xmm3\n"
11060 "addps %%xmm11, %%xmm1\n"
11061 "subps %%xmm11, %%xmm3\n"
11062 "movaps %%xmm12, %%xmm4\n"
11063 "movaps %%xmm12, %%xmm6\n"
11064 "addps %%xmm14, %%xmm4\n"
11065 "subps %%xmm14, %%xmm6\n"
11066 "movaps %%xmm13, %%xmm5\n"
11067 "movaps %%xmm13, %%xmm7\n"
11068 "addps %%xmm15, %%xmm5\n"
11069 "subps %%xmm15, %%xmm7\n"
11070 "movaps %%xmm0, %%xmm8\n"
11071 "movaps %%xmm0, %%xmm12\n"
11072 "addps %%xmm4, %%xmm8\n"
11073 "subps %%xmm4, %%xmm12\n"
11074 "movaps %%xmm1, %%xmm9\n"
11075 "movaps %%xmm1, %%xmm13\n"
11076 "addps %%xmm5, %%xmm9\n"
11077 "subps %%xmm5, %%xmm13\n"
11078 "movaps %%xmm2, %%xmm10\n"
11079 "movaps %%xmm2, %%xmm14\n"
11080 "addps %%xmm6, %%xmm10\n"
11081 "subps %%xmm6, %%xmm14\n"
11082 "movaps %%xmm3, %%xmm11\n"
11083 "movaps %%xmm3, %%xmm15\n"
11084 "addps %%xmm7, %%xmm11\n"
11085 "subps %%xmm7, %%xmm15\n"
11086 "movups %%xmm8, (%0)\n"
11087 "movups %%xmm9, (%1)\n"
11088 "movups %%xmm10, (%2)\n"
11089 "movups %%xmm11, (%3)\n"
11090 "movups %%xmm12, (%4)\n"
11091 "movups %%xmm13, (%5)\n"
11092 "movups %%xmm14, (%6)\n"
11093 "movups %%xmm15, (%7)\n"
11094 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11095 );
11096 }
11097 }
11098 return;
11099 }
11100 if (depth == 21) {
11101 helper_float_27_recursive(buf + 0, 18);
11102 helper_float_27_recursive(buf + 262144, 18);
11103 helper_float_27_recursive(buf + 524288, 18);
11104 helper_float_27_recursive(buf + 786432, 18);
11105 helper_float_27_recursive(buf + 1048576, 18);
11106 helper_float_27_recursive(buf + 1310720, 18);
11107 helper_float_27_recursive(buf + 1572864, 18);
11108 helper_float_27_recursive(buf + 1835008, 18);
11109 for (int j = 0; j < 2097152; j += 2097152) {
11110 for (int k = 0; k < 262144; k += 4) {
11111 __asm__ volatile (
11112 "movups (%0), %%xmm0\n"
11113 "movups (%1), %%xmm1\n"
11114 "movups (%2), %%xmm2\n"
11115 "movups (%3), %%xmm3\n"
11116 "movups (%4), %%xmm4\n"
11117 "movups (%5), %%xmm5\n"
11118 "movups (%6), %%xmm6\n"
11119 "movups (%7), %%xmm7\n"
11120 "movaps %%xmm0, %%xmm8\n"
11121 "movaps %%xmm0, %%xmm9\n"
11122 "addps %%xmm1, %%xmm8\n"
11123 "subps %%xmm1, %%xmm9\n"
11124 "movaps %%xmm2, %%xmm10\n"
11125 "movaps %%xmm2, %%xmm11\n"
11126 "addps %%xmm3, %%xmm10\n"
11127 "subps %%xmm3, %%xmm11\n"
11128 "movaps %%xmm4, %%xmm12\n"
11129 "movaps %%xmm4, %%xmm13\n"
11130 "addps %%xmm5, %%xmm12\n"
11131 "subps %%xmm5, %%xmm13\n"
11132 "movaps %%xmm6, %%xmm14\n"
11133 "movaps %%xmm6, %%xmm15\n"
11134 "addps %%xmm7, %%xmm14\n"
11135 "subps %%xmm7, %%xmm15\n"
11136 "movaps %%xmm8, %%xmm0\n"
11137 "movaps %%xmm8, %%xmm2\n"
11138 "addps %%xmm10, %%xmm0\n"
11139 "subps %%xmm10, %%xmm2\n"
11140 "movaps %%xmm9, %%xmm1\n"
11141 "movaps %%xmm9, %%xmm3\n"
11142 "addps %%xmm11, %%xmm1\n"
11143 "subps %%xmm11, %%xmm3\n"
11144 "movaps %%xmm12, %%xmm4\n"
11145 "movaps %%xmm12, %%xmm6\n"
11146 "addps %%xmm14, %%xmm4\n"
11147 "subps %%xmm14, %%xmm6\n"
11148 "movaps %%xmm13, %%xmm5\n"
11149 "movaps %%xmm13, %%xmm7\n"
11150 "addps %%xmm15, %%xmm5\n"
11151 "subps %%xmm15, %%xmm7\n"
11152 "movaps %%xmm0, %%xmm8\n"
11153 "movaps %%xmm0, %%xmm12\n"
11154 "addps %%xmm4, %%xmm8\n"
11155 "subps %%xmm4, %%xmm12\n"
11156 "movaps %%xmm1, %%xmm9\n"
11157 "movaps %%xmm1, %%xmm13\n"
11158 "addps %%xmm5, %%xmm9\n"
11159 "subps %%xmm5, %%xmm13\n"
11160 "movaps %%xmm2, %%xmm10\n"
11161 "movaps %%xmm2, %%xmm14\n"
11162 "addps %%xmm6, %%xmm10\n"
11163 "subps %%xmm6, %%xmm14\n"
11164 "movaps %%xmm3, %%xmm11\n"
11165 "movaps %%xmm3, %%xmm15\n"
11166 "addps %%xmm7, %%xmm11\n"
11167 "subps %%xmm7, %%xmm15\n"
11168 "movups %%xmm8, (%0)\n"
11169 "movups %%xmm9, (%1)\n"
11170 "movups %%xmm10, (%2)\n"
11171 "movups %%xmm11, (%3)\n"
11172 "movups %%xmm12, (%4)\n"
11173 "movups %%xmm13, (%5)\n"
11174 "movups %%xmm14, (%6)\n"
11175 "movups %%xmm15, (%7)\n"
11176 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11177 );
11178 }
11179 }
11180 return;
11181 }
11182 if (depth == 24) {
11183 helper_float_27_recursive(buf + 0, 21);
11184 helper_float_27_recursive(buf + 2097152, 21);
11185 helper_float_27_recursive(buf + 4194304, 21);
11186 helper_float_27_recursive(buf + 6291456, 21);
11187 helper_float_27_recursive(buf + 8388608, 21);
11188 helper_float_27_recursive(buf + 10485760, 21);
11189 helper_float_27_recursive(buf + 12582912, 21);
11190 helper_float_27_recursive(buf + 14680064, 21);
11191 for (int j = 0; j < 16777216; j += 16777216) {
11192 for (int k = 0; k < 2097152; k += 4) {
11193 __asm__ volatile (
11194 "movups (%0), %%xmm0\n"
11195 "movups (%1), %%xmm1\n"
11196 "movups (%2), %%xmm2\n"
11197 "movups (%3), %%xmm3\n"
11198 "movups (%4), %%xmm4\n"
11199 "movups (%5), %%xmm5\n"
11200 "movups (%6), %%xmm6\n"
11201 "movups (%7), %%xmm7\n"
11202 "movaps %%xmm0, %%xmm8\n"
11203 "movaps %%xmm0, %%xmm9\n"
11204 "addps %%xmm1, %%xmm8\n"
11205 "subps %%xmm1, %%xmm9\n"
11206 "movaps %%xmm2, %%xmm10\n"
11207 "movaps %%xmm2, %%xmm11\n"
11208 "addps %%xmm3, %%xmm10\n"
11209 "subps %%xmm3, %%xmm11\n"
11210 "movaps %%xmm4, %%xmm12\n"
11211 "movaps %%xmm4, %%xmm13\n"
11212 "addps %%xmm5, %%xmm12\n"
11213 "subps %%xmm5, %%xmm13\n"
11214 "movaps %%xmm6, %%xmm14\n"
11215 "movaps %%xmm6, %%xmm15\n"
11216 "addps %%xmm7, %%xmm14\n"
11217 "subps %%xmm7, %%xmm15\n"
11218 "movaps %%xmm8, %%xmm0\n"
11219 "movaps %%xmm8, %%xmm2\n"
11220 "addps %%xmm10, %%xmm0\n"
11221 "subps %%xmm10, %%xmm2\n"
11222 "movaps %%xmm9, %%xmm1\n"
11223 "movaps %%xmm9, %%xmm3\n"
11224 "addps %%xmm11, %%xmm1\n"
11225 "subps %%xmm11, %%xmm3\n"
11226 "movaps %%xmm12, %%xmm4\n"
11227 "movaps %%xmm12, %%xmm6\n"
11228 "addps %%xmm14, %%xmm4\n"
11229 "subps %%xmm14, %%xmm6\n"
11230 "movaps %%xmm13, %%xmm5\n"
11231 "movaps %%xmm13, %%xmm7\n"
11232 "addps %%xmm15, %%xmm5\n"
11233 "subps %%xmm15, %%xmm7\n"
11234 "movaps %%xmm0, %%xmm8\n"
11235 "movaps %%xmm0, %%xmm12\n"
11236 "addps %%xmm4, %%xmm8\n"
11237 "subps %%xmm4, %%xmm12\n"
11238 "movaps %%xmm1, %%xmm9\n"
11239 "movaps %%xmm1, %%xmm13\n"
11240 "addps %%xmm5, %%xmm9\n"
11241 "subps %%xmm5, %%xmm13\n"
11242 "movaps %%xmm2, %%xmm10\n"
11243 "movaps %%xmm2, %%xmm14\n"
11244 "addps %%xmm6, %%xmm10\n"
11245 "subps %%xmm6, %%xmm14\n"
11246 "movaps %%xmm3, %%xmm11\n"
11247 "movaps %%xmm3, %%xmm15\n"
11248 "addps %%xmm7, %%xmm11\n"
11249 "subps %%xmm7, %%xmm15\n"
11250 "movups %%xmm8, (%0)\n"
11251 "movups %%xmm9, (%1)\n"
11252 "movups %%xmm10, (%2)\n"
11253 "movups %%xmm11, (%3)\n"
11254 "movups %%xmm12, (%4)\n"
11255 "movups %%xmm13, (%5)\n"
11256 "movups %%xmm14, (%6)\n"
11257 "movups %%xmm15, (%7)\n"
11258 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11259 );
11260 }
11261 }
11262 return;
11263 }
11264 if (depth == 27) {
11265 helper_float_27_recursive(buf + 0, 24);
11266 helper_float_27_recursive(buf + 16777216, 24);
11267 helper_float_27_recursive(buf + 33554432, 24);
11268 helper_float_27_recursive(buf + 50331648, 24);
11269 helper_float_27_recursive(buf + 67108864, 24);
11270 helper_float_27_recursive(buf + 83886080, 24);
11271 helper_float_27_recursive(buf + 100663296, 24);
11272 helper_float_27_recursive(buf + 117440512, 24);
11273 for (int j = 0; j < 134217728; j += 134217728) {
11274 for (int k = 0; k < 16777216; k += 4) {
11275 __asm__ volatile (
11276 "movups (%0), %%xmm0\n"
11277 "movups (%1), %%xmm1\n"
11278 "movups (%2), %%xmm2\n"
11279 "movups (%3), %%xmm3\n"
11280 "movups (%4), %%xmm4\n"
11281 "movups (%5), %%xmm5\n"
11282 "movups (%6), %%xmm6\n"
11283 "movups (%7), %%xmm7\n"
11284 "movaps %%xmm0, %%xmm8\n"
11285 "movaps %%xmm0, %%xmm9\n"
11286 "addps %%xmm1, %%xmm8\n"
11287 "subps %%xmm1, %%xmm9\n"
11288 "movaps %%xmm2, %%xmm10\n"
11289 "movaps %%xmm2, %%xmm11\n"
11290 "addps %%xmm3, %%xmm10\n"
11291 "subps %%xmm3, %%xmm11\n"
11292 "movaps %%xmm4, %%xmm12\n"
11293 "movaps %%xmm4, %%xmm13\n"
11294 "addps %%xmm5, %%xmm12\n"
11295 "subps %%xmm5, %%xmm13\n"
11296 "movaps %%xmm6, %%xmm14\n"
11297 "movaps %%xmm6, %%xmm15\n"
11298 "addps %%xmm7, %%xmm14\n"
11299 "subps %%xmm7, %%xmm15\n"
11300 "movaps %%xmm8, %%xmm0\n"
11301 "movaps %%xmm8, %%xmm2\n"
11302 "addps %%xmm10, %%xmm0\n"
11303 "subps %%xmm10, %%xmm2\n"
11304 "movaps %%xmm9, %%xmm1\n"
11305 "movaps %%xmm9, %%xmm3\n"
11306 "addps %%xmm11, %%xmm1\n"
11307 "subps %%xmm11, %%xmm3\n"
11308 "movaps %%xmm12, %%xmm4\n"
11309 "movaps %%xmm12, %%xmm6\n"
11310 "addps %%xmm14, %%xmm4\n"
11311 "subps %%xmm14, %%xmm6\n"
11312 "movaps %%xmm13, %%xmm5\n"
11313 "movaps %%xmm13, %%xmm7\n"
11314 "addps %%xmm15, %%xmm5\n"
11315 "subps %%xmm15, %%xmm7\n"
11316 "movaps %%xmm0, %%xmm8\n"
11317 "movaps %%xmm0, %%xmm12\n"
11318 "addps %%xmm4, %%xmm8\n"
11319 "subps %%xmm4, %%xmm12\n"
11320 "movaps %%xmm1, %%xmm9\n"
11321 "movaps %%xmm1, %%xmm13\n"
11322 "addps %%xmm5, %%xmm9\n"
11323 "subps %%xmm5, %%xmm13\n"
11324 "movaps %%xmm2, %%xmm10\n"
11325 "movaps %%xmm2, %%xmm14\n"
11326 "addps %%xmm6, %%xmm10\n"
11327 "subps %%xmm6, %%xmm14\n"
11328 "movaps %%xmm3, %%xmm11\n"
11329 "movaps %%xmm3, %%xmm15\n"
11330 "addps %%xmm7, %%xmm11\n"
11331 "subps %%xmm7, %%xmm15\n"
11332 "movups %%xmm8, (%0)\n"
11333 "movups %%xmm9, (%1)\n"
11334 "movups %%xmm10, (%2)\n"
11335 "movups %%xmm11, (%3)\n"
11336 "movups %%xmm12, (%4)\n"
11337 "movups %%xmm13, (%5)\n"
11338 "movups %%xmm14, (%6)\n"
11339 "movups %%xmm15, (%7)\n"
11340 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11341 );
11342 }
11343 }
11344 return;
11345 }
11346 }
void helper_float_27(float *buf);
/*
 * Entry point for the size-2^27 (134217728-element) in-place float
 * Hadamard-style transform.
 *
 * buf: pointer to the 2^27 floats to transform in place. The SSE
 *      kernels in the recursive driver load with movups, so no
 *      alignment requirement is visible here — but 16-byte alignment
 *      presumably helps; confirm against the allocator used by callers.
 *
 * Simply dispatches to the recursive driver at full depth (27 = log2
 * of the transform length); the driver bottoms out at its smallest
 * depth case and combines the halves with the butterfly asm kernels.
 */
void helper_float_27(float *buf) {
    helper_float_27_recursive(buf, 27);
}
11351 void helper_float_28_recursive(float *buf, int depth);
helper_float_28_recursive(float * buf,int depth)11352 void helper_float_28_recursive(float *buf, int depth) {
11353 if (depth == 16) {
11354 for (int j = 0; j < 65536; j += 32) {
11355 for (int k = 0; k < 4; k += 4) {
11356 __asm__ volatile (
11357 "movups (%0), %%xmm0\n"
11358 "movups (%1), %%xmm1\n"
11359 "movups (%2), %%xmm2\n"
11360 "movups (%3), %%xmm3\n"
11361 "movups (%4), %%xmm4\n"
11362 "movups (%5), %%xmm5\n"
11363 "movups (%6), %%xmm6\n"
11364 "movups (%7), %%xmm7\n"
11365 "movaps %%xmm0, %%xmm8\n"
11366 "shufps $160, %%xmm8, %%xmm8\n"
11367 "shufps $245, %%xmm0, %%xmm0\n"
11368 "xorps %%xmm9, %%xmm9\n"
11369 "subps %%xmm0, %%xmm9\n"
11370 "addsubps %%xmm9, %%xmm8\n"
11371 "movaps %%xmm8, %%xmm0\n"
11372 "movaps %%xmm1, %%xmm8\n"
11373 "shufps $160, %%xmm8, %%xmm8\n"
11374 "shufps $245, %%xmm1, %%xmm1\n"
11375 "xorps %%xmm9, %%xmm9\n"
11376 "subps %%xmm1, %%xmm9\n"
11377 "addsubps %%xmm9, %%xmm8\n"
11378 "movaps %%xmm8, %%xmm1\n"
11379 "movaps %%xmm2, %%xmm8\n"
11380 "shufps $160, %%xmm8, %%xmm8\n"
11381 "shufps $245, %%xmm2, %%xmm2\n"
11382 "xorps %%xmm9, %%xmm9\n"
11383 "subps %%xmm2, %%xmm9\n"
11384 "addsubps %%xmm9, %%xmm8\n"
11385 "movaps %%xmm8, %%xmm2\n"
11386 "movaps %%xmm3, %%xmm8\n"
11387 "shufps $160, %%xmm8, %%xmm8\n"
11388 "shufps $245, %%xmm3, %%xmm3\n"
11389 "xorps %%xmm9, %%xmm9\n"
11390 "subps %%xmm3, %%xmm9\n"
11391 "addsubps %%xmm9, %%xmm8\n"
11392 "movaps %%xmm8, %%xmm3\n"
11393 "movaps %%xmm4, %%xmm8\n"
11394 "shufps $160, %%xmm8, %%xmm8\n"
11395 "shufps $245, %%xmm4, %%xmm4\n"
11396 "xorps %%xmm9, %%xmm9\n"
11397 "subps %%xmm4, %%xmm9\n"
11398 "addsubps %%xmm9, %%xmm8\n"
11399 "movaps %%xmm8, %%xmm4\n"
11400 "movaps %%xmm5, %%xmm8\n"
11401 "shufps $160, %%xmm8, %%xmm8\n"
11402 "shufps $245, %%xmm5, %%xmm5\n"
11403 "xorps %%xmm9, %%xmm9\n"
11404 "subps %%xmm5, %%xmm9\n"
11405 "addsubps %%xmm9, %%xmm8\n"
11406 "movaps %%xmm8, %%xmm5\n"
11407 "movaps %%xmm6, %%xmm8\n"
11408 "shufps $160, %%xmm8, %%xmm8\n"
11409 "shufps $245, %%xmm6, %%xmm6\n"
11410 "xorps %%xmm9, %%xmm9\n"
11411 "subps %%xmm6, %%xmm9\n"
11412 "addsubps %%xmm9, %%xmm8\n"
11413 "movaps %%xmm8, %%xmm6\n"
11414 "movaps %%xmm7, %%xmm8\n"
11415 "shufps $160, %%xmm8, %%xmm8\n"
11416 "shufps $245, %%xmm7, %%xmm7\n"
11417 "xorps %%xmm9, %%xmm9\n"
11418 "subps %%xmm7, %%xmm9\n"
11419 "addsubps %%xmm9, %%xmm8\n"
11420 "movaps %%xmm8, %%xmm7\n"
11421 "movaps %%xmm0, %%xmm8\n"
11422 "shufps $68, %%xmm8, %%xmm8\n"
11423 "xorps %%xmm9, %%xmm9\n"
11424 "movaps %%xmm0, %%xmm10\n"
11425 "shufps $14, %%xmm9, %%xmm10\n"
11426 "movaps %%xmm0, %%xmm11\n"
11427 "shufps $224, %%xmm11, %%xmm9\n"
11428 "addps %%xmm8, %%xmm10\n"
11429 "subps %%xmm9, %%xmm10\n"
11430 "movaps %%xmm10, %%xmm0\n"
11431 "movaps %%xmm1, %%xmm8\n"
11432 "shufps $68, %%xmm8, %%xmm8\n"
11433 "xorps %%xmm9, %%xmm9\n"
11434 "movaps %%xmm1, %%xmm10\n"
11435 "shufps $14, %%xmm9, %%xmm10\n"
11436 "movaps %%xmm1, %%xmm11\n"
11437 "shufps $224, %%xmm11, %%xmm9\n"
11438 "addps %%xmm8, %%xmm10\n"
11439 "subps %%xmm9, %%xmm10\n"
11440 "movaps %%xmm10, %%xmm1\n"
11441 "movaps %%xmm2, %%xmm8\n"
11442 "shufps $68, %%xmm8, %%xmm8\n"
11443 "xorps %%xmm9, %%xmm9\n"
11444 "movaps %%xmm2, %%xmm10\n"
11445 "shufps $14, %%xmm9, %%xmm10\n"
11446 "movaps %%xmm2, %%xmm11\n"
11447 "shufps $224, %%xmm11, %%xmm9\n"
11448 "addps %%xmm8, %%xmm10\n"
11449 "subps %%xmm9, %%xmm10\n"
11450 "movaps %%xmm10, %%xmm2\n"
11451 "movaps %%xmm3, %%xmm8\n"
11452 "shufps $68, %%xmm8, %%xmm8\n"
11453 "xorps %%xmm9, %%xmm9\n"
11454 "movaps %%xmm3, %%xmm10\n"
11455 "shufps $14, %%xmm9, %%xmm10\n"
11456 "movaps %%xmm3, %%xmm11\n"
11457 "shufps $224, %%xmm11, %%xmm9\n"
11458 "addps %%xmm8, %%xmm10\n"
11459 "subps %%xmm9, %%xmm10\n"
11460 "movaps %%xmm10, %%xmm3\n"
11461 "movaps %%xmm4, %%xmm8\n"
11462 "shufps $68, %%xmm8, %%xmm8\n"
11463 "xorps %%xmm9, %%xmm9\n"
11464 "movaps %%xmm4, %%xmm10\n"
11465 "shufps $14, %%xmm9, %%xmm10\n"
11466 "movaps %%xmm4, %%xmm11\n"
11467 "shufps $224, %%xmm11, %%xmm9\n"
11468 "addps %%xmm8, %%xmm10\n"
11469 "subps %%xmm9, %%xmm10\n"
11470 "movaps %%xmm10, %%xmm4\n"
11471 "movaps %%xmm5, %%xmm8\n"
11472 "shufps $68, %%xmm8, %%xmm8\n"
11473 "xorps %%xmm9, %%xmm9\n"
11474 "movaps %%xmm5, %%xmm10\n"
11475 "shufps $14, %%xmm9, %%xmm10\n"
11476 "movaps %%xmm5, %%xmm11\n"
11477 "shufps $224, %%xmm11, %%xmm9\n"
11478 "addps %%xmm8, %%xmm10\n"
11479 "subps %%xmm9, %%xmm10\n"
11480 "movaps %%xmm10, %%xmm5\n"
11481 "movaps %%xmm6, %%xmm8\n"
11482 "shufps $68, %%xmm8, %%xmm8\n"
11483 "xorps %%xmm9, %%xmm9\n"
11484 "movaps %%xmm6, %%xmm10\n"
11485 "shufps $14, %%xmm9, %%xmm10\n"
11486 "movaps %%xmm6, %%xmm11\n"
11487 "shufps $224, %%xmm11, %%xmm9\n"
11488 "addps %%xmm8, %%xmm10\n"
11489 "subps %%xmm9, %%xmm10\n"
11490 "movaps %%xmm10, %%xmm6\n"
11491 "movaps %%xmm7, %%xmm8\n"
11492 "shufps $68, %%xmm8, %%xmm8\n"
11493 "xorps %%xmm9, %%xmm9\n"
11494 "movaps %%xmm7, %%xmm10\n"
11495 "shufps $14, %%xmm9, %%xmm10\n"
11496 "movaps %%xmm7, %%xmm11\n"
11497 "shufps $224, %%xmm11, %%xmm9\n"
11498 "addps %%xmm8, %%xmm10\n"
11499 "subps %%xmm9, %%xmm10\n"
11500 "movaps %%xmm10, %%xmm7\n"
11501 "movaps %%xmm0, %%xmm8\n"
11502 "movaps %%xmm0, %%xmm9\n"
11503 "addps %%xmm1, %%xmm8\n"
11504 "subps %%xmm1, %%xmm9\n"
11505 "movaps %%xmm2, %%xmm10\n"
11506 "movaps %%xmm2, %%xmm11\n"
11507 "addps %%xmm3, %%xmm10\n"
11508 "subps %%xmm3, %%xmm11\n"
11509 "movaps %%xmm4, %%xmm12\n"
11510 "movaps %%xmm4, %%xmm13\n"
11511 "addps %%xmm5, %%xmm12\n"
11512 "subps %%xmm5, %%xmm13\n"
11513 "movaps %%xmm6, %%xmm14\n"
11514 "movaps %%xmm6, %%xmm15\n"
11515 "addps %%xmm7, %%xmm14\n"
11516 "subps %%xmm7, %%xmm15\n"
11517 "movaps %%xmm8, %%xmm0\n"
11518 "movaps %%xmm8, %%xmm2\n"
11519 "addps %%xmm10, %%xmm0\n"
11520 "subps %%xmm10, %%xmm2\n"
11521 "movaps %%xmm9, %%xmm1\n"
11522 "movaps %%xmm9, %%xmm3\n"
11523 "addps %%xmm11, %%xmm1\n"
11524 "subps %%xmm11, %%xmm3\n"
11525 "movaps %%xmm12, %%xmm4\n"
11526 "movaps %%xmm12, %%xmm6\n"
11527 "addps %%xmm14, %%xmm4\n"
11528 "subps %%xmm14, %%xmm6\n"
11529 "movaps %%xmm13, %%xmm5\n"
11530 "movaps %%xmm13, %%xmm7\n"
11531 "addps %%xmm15, %%xmm5\n"
11532 "subps %%xmm15, %%xmm7\n"
11533 "movaps %%xmm0, %%xmm8\n"
11534 "movaps %%xmm0, %%xmm12\n"
11535 "addps %%xmm4, %%xmm8\n"
11536 "subps %%xmm4, %%xmm12\n"
11537 "movaps %%xmm1, %%xmm9\n"
11538 "movaps %%xmm1, %%xmm13\n"
11539 "addps %%xmm5, %%xmm9\n"
11540 "subps %%xmm5, %%xmm13\n"
11541 "movaps %%xmm2, %%xmm10\n"
11542 "movaps %%xmm2, %%xmm14\n"
11543 "addps %%xmm6, %%xmm10\n"
11544 "subps %%xmm6, %%xmm14\n"
11545 "movaps %%xmm3, %%xmm11\n"
11546 "movaps %%xmm3, %%xmm15\n"
11547 "addps %%xmm7, %%xmm11\n"
11548 "subps %%xmm7, %%xmm15\n"
11549 "movups %%xmm8, (%0)\n"
11550 "movups %%xmm9, (%1)\n"
11551 "movups %%xmm10, (%2)\n"
11552 "movups %%xmm11, (%3)\n"
11553 "movups %%xmm12, (%4)\n"
11554 "movups %%xmm13, (%5)\n"
11555 "movups %%xmm14, (%6)\n"
11556 "movups %%xmm15, (%7)\n"
11557 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11558 );
11559 }
11560 }
11561 for (int j = 0; j < 65536; j += 256) {
11562 for (int k = 0; k < 32; k += 4) {
11563 __asm__ volatile (
11564 "movups (%0), %%xmm0\n"
11565 "movups (%1), %%xmm1\n"
11566 "movups (%2), %%xmm2\n"
11567 "movups (%3), %%xmm3\n"
11568 "movups (%4), %%xmm4\n"
11569 "movups (%5), %%xmm5\n"
11570 "movups (%6), %%xmm6\n"
11571 "movups (%7), %%xmm7\n"
11572 "movaps %%xmm0, %%xmm8\n"
11573 "movaps %%xmm0, %%xmm9\n"
11574 "addps %%xmm1, %%xmm8\n"
11575 "subps %%xmm1, %%xmm9\n"
11576 "movaps %%xmm2, %%xmm10\n"
11577 "movaps %%xmm2, %%xmm11\n"
11578 "addps %%xmm3, %%xmm10\n"
11579 "subps %%xmm3, %%xmm11\n"
11580 "movaps %%xmm4, %%xmm12\n"
11581 "movaps %%xmm4, %%xmm13\n"
11582 "addps %%xmm5, %%xmm12\n"
11583 "subps %%xmm5, %%xmm13\n"
11584 "movaps %%xmm6, %%xmm14\n"
11585 "movaps %%xmm6, %%xmm15\n"
11586 "addps %%xmm7, %%xmm14\n"
11587 "subps %%xmm7, %%xmm15\n"
11588 "movaps %%xmm8, %%xmm0\n"
11589 "movaps %%xmm8, %%xmm2\n"
11590 "addps %%xmm10, %%xmm0\n"
11591 "subps %%xmm10, %%xmm2\n"
11592 "movaps %%xmm9, %%xmm1\n"
11593 "movaps %%xmm9, %%xmm3\n"
11594 "addps %%xmm11, %%xmm1\n"
11595 "subps %%xmm11, %%xmm3\n"
11596 "movaps %%xmm12, %%xmm4\n"
11597 "movaps %%xmm12, %%xmm6\n"
11598 "addps %%xmm14, %%xmm4\n"
11599 "subps %%xmm14, %%xmm6\n"
11600 "movaps %%xmm13, %%xmm5\n"
11601 "movaps %%xmm13, %%xmm7\n"
11602 "addps %%xmm15, %%xmm5\n"
11603 "subps %%xmm15, %%xmm7\n"
11604 "movaps %%xmm0, %%xmm8\n"
11605 "movaps %%xmm0, %%xmm12\n"
11606 "addps %%xmm4, %%xmm8\n"
11607 "subps %%xmm4, %%xmm12\n"
11608 "movaps %%xmm1, %%xmm9\n"
11609 "movaps %%xmm1, %%xmm13\n"
11610 "addps %%xmm5, %%xmm9\n"
11611 "subps %%xmm5, %%xmm13\n"
11612 "movaps %%xmm2, %%xmm10\n"
11613 "movaps %%xmm2, %%xmm14\n"
11614 "addps %%xmm6, %%xmm10\n"
11615 "subps %%xmm6, %%xmm14\n"
11616 "movaps %%xmm3, %%xmm11\n"
11617 "movaps %%xmm3, %%xmm15\n"
11618 "addps %%xmm7, %%xmm11\n"
11619 "subps %%xmm7, %%xmm15\n"
11620 "movups %%xmm8, (%0)\n"
11621 "movups %%xmm9, (%1)\n"
11622 "movups %%xmm10, (%2)\n"
11623 "movups %%xmm11, (%3)\n"
11624 "movups %%xmm12, (%4)\n"
11625 "movups %%xmm13, (%5)\n"
11626 "movups %%xmm14, (%6)\n"
11627 "movups %%xmm15, (%7)\n"
11628 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11629 );
11630 }
11631 }
11632 for (int j = 0; j < 65536; j += 2048) {
11633 for (int k = 0; k < 256; k += 4) {
11634 __asm__ volatile (
11635 "movups (%0), %%xmm0\n"
11636 "movups (%1), %%xmm1\n"
11637 "movups (%2), %%xmm2\n"
11638 "movups (%3), %%xmm3\n"
11639 "movups (%4), %%xmm4\n"
11640 "movups (%5), %%xmm5\n"
11641 "movups (%6), %%xmm6\n"
11642 "movups (%7), %%xmm7\n"
11643 "movaps %%xmm0, %%xmm8\n"
11644 "movaps %%xmm0, %%xmm9\n"
11645 "addps %%xmm1, %%xmm8\n"
11646 "subps %%xmm1, %%xmm9\n"
11647 "movaps %%xmm2, %%xmm10\n"
11648 "movaps %%xmm2, %%xmm11\n"
11649 "addps %%xmm3, %%xmm10\n"
11650 "subps %%xmm3, %%xmm11\n"
11651 "movaps %%xmm4, %%xmm12\n"
11652 "movaps %%xmm4, %%xmm13\n"
11653 "addps %%xmm5, %%xmm12\n"
11654 "subps %%xmm5, %%xmm13\n"
11655 "movaps %%xmm6, %%xmm14\n"
11656 "movaps %%xmm6, %%xmm15\n"
11657 "addps %%xmm7, %%xmm14\n"
11658 "subps %%xmm7, %%xmm15\n"
11659 "movaps %%xmm8, %%xmm0\n"
11660 "movaps %%xmm8, %%xmm2\n"
11661 "addps %%xmm10, %%xmm0\n"
11662 "subps %%xmm10, %%xmm2\n"
11663 "movaps %%xmm9, %%xmm1\n"
11664 "movaps %%xmm9, %%xmm3\n"
11665 "addps %%xmm11, %%xmm1\n"
11666 "subps %%xmm11, %%xmm3\n"
11667 "movaps %%xmm12, %%xmm4\n"
11668 "movaps %%xmm12, %%xmm6\n"
11669 "addps %%xmm14, %%xmm4\n"
11670 "subps %%xmm14, %%xmm6\n"
11671 "movaps %%xmm13, %%xmm5\n"
11672 "movaps %%xmm13, %%xmm7\n"
11673 "addps %%xmm15, %%xmm5\n"
11674 "subps %%xmm15, %%xmm7\n"
11675 "movaps %%xmm0, %%xmm8\n"
11676 "movaps %%xmm0, %%xmm12\n"
11677 "addps %%xmm4, %%xmm8\n"
11678 "subps %%xmm4, %%xmm12\n"
11679 "movaps %%xmm1, %%xmm9\n"
11680 "movaps %%xmm1, %%xmm13\n"
11681 "addps %%xmm5, %%xmm9\n"
11682 "subps %%xmm5, %%xmm13\n"
11683 "movaps %%xmm2, %%xmm10\n"
11684 "movaps %%xmm2, %%xmm14\n"
11685 "addps %%xmm6, %%xmm10\n"
11686 "subps %%xmm6, %%xmm14\n"
11687 "movaps %%xmm3, %%xmm11\n"
11688 "movaps %%xmm3, %%xmm15\n"
11689 "addps %%xmm7, %%xmm11\n"
11690 "subps %%xmm7, %%xmm15\n"
11691 "movups %%xmm8, (%0)\n"
11692 "movups %%xmm9, (%1)\n"
11693 "movups %%xmm10, (%2)\n"
11694 "movups %%xmm11, (%3)\n"
11695 "movups %%xmm12, (%4)\n"
11696 "movups %%xmm13, (%5)\n"
11697 "movups %%xmm14, (%6)\n"
11698 "movups %%xmm15, (%7)\n"
11699 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11700 );
11701 }
11702 }
11703 for (int j = 0; j < 65536; j += 16384) {
11704 for (int k = 0; k < 2048; k += 4) {
11705 __asm__ volatile (
11706 "movups (%0), %%xmm0\n"
11707 "movups (%1), %%xmm1\n"
11708 "movups (%2), %%xmm2\n"
11709 "movups (%3), %%xmm3\n"
11710 "movups (%4), %%xmm4\n"
11711 "movups (%5), %%xmm5\n"
11712 "movups (%6), %%xmm6\n"
11713 "movups (%7), %%xmm7\n"
11714 "movaps %%xmm0, %%xmm8\n"
11715 "movaps %%xmm0, %%xmm9\n"
11716 "addps %%xmm1, %%xmm8\n"
11717 "subps %%xmm1, %%xmm9\n"
11718 "movaps %%xmm2, %%xmm10\n"
11719 "movaps %%xmm2, %%xmm11\n"
11720 "addps %%xmm3, %%xmm10\n"
11721 "subps %%xmm3, %%xmm11\n"
11722 "movaps %%xmm4, %%xmm12\n"
11723 "movaps %%xmm4, %%xmm13\n"
11724 "addps %%xmm5, %%xmm12\n"
11725 "subps %%xmm5, %%xmm13\n"
11726 "movaps %%xmm6, %%xmm14\n"
11727 "movaps %%xmm6, %%xmm15\n"
11728 "addps %%xmm7, %%xmm14\n"
11729 "subps %%xmm7, %%xmm15\n"
11730 "movaps %%xmm8, %%xmm0\n"
11731 "movaps %%xmm8, %%xmm2\n"
11732 "addps %%xmm10, %%xmm0\n"
11733 "subps %%xmm10, %%xmm2\n"
11734 "movaps %%xmm9, %%xmm1\n"
11735 "movaps %%xmm9, %%xmm3\n"
11736 "addps %%xmm11, %%xmm1\n"
11737 "subps %%xmm11, %%xmm3\n"
11738 "movaps %%xmm12, %%xmm4\n"
11739 "movaps %%xmm12, %%xmm6\n"
11740 "addps %%xmm14, %%xmm4\n"
11741 "subps %%xmm14, %%xmm6\n"
11742 "movaps %%xmm13, %%xmm5\n"
11743 "movaps %%xmm13, %%xmm7\n"
11744 "addps %%xmm15, %%xmm5\n"
11745 "subps %%xmm15, %%xmm7\n"
11746 "movaps %%xmm0, %%xmm8\n"
11747 "movaps %%xmm0, %%xmm12\n"
11748 "addps %%xmm4, %%xmm8\n"
11749 "subps %%xmm4, %%xmm12\n"
11750 "movaps %%xmm1, %%xmm9\n"
11751 "movaps %%xmm1, %%xmm13\n"
11752 "addps %%xmm5, %%xmm9\n"
11753 "subps %%xmm5, %%xmm13\n"
11754 "movaps %%xmm2, %%xmm10\n"
11755 "movaps %%xmm2, %%xmm14\n"
11756 "addps %%xmm6, %%xmm10\n"
11757 "subps %%xmm6, %%xmm14\n"
11758 "movaps %%xmm3, %%xmm11\n"
11759 "movaps %%xmm3, %%xmm15\n"
11760 "addps %%xmm7, %%xmm11\n"
11761 "subps %%xmm7, %%xmm15\n"
11762 "movups %%xmm8, (%0)\n"
11763 "movups %%xmm9, (%1)\n"
11764 "movups %%xmm10, (%2)\n"
11765 "movups %%xmm11, (%3)\n"
11766 "movups %%xmm12, (%4)\n"
11767 "movups %%xmm13, (%5)\n"
11768 "movups %%xmm14, (%6)\n"
11769 "movups %%xmm15, (%7)\n"
11770 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11771 );
11772 }
11773 }
11774 for (int j = 0; j < 65536; j += 65536) {
11775 for (int k = 0; k < 16384; k += 4) {
11776 __asm__ volatile (
11777 "movups (%0), %%xmm0\n"
11778 "movups (%1), %%xmm1\n"
11779 "movups (%2), %%xmm2\n"
11780 "movups (%3), %%xmm3\n"
11781 "movaps %%xmm0, %%xmm8\n"
11782 "movaps %%xmm0, %%xmm9\n"
11783 "addps %%xmm1, %%xmm8\n"
11784 "subps %%xmm1, %%xmm9\n"
11785 "movaps %%xmm2, %%xmm10\n"
11786 "movaps %%xmm2, %%xmm11\n"
11787 "addps %%xmm3, %%xmm10\n"
11788 "subps %%xmm3, %%xmm11\n"
11789 "movaps %%xmm8, %%xmm0\n"
11790 "movaps %%xmm8, %%xmm2\n"
11791 "addps %%xmm10, %%xmm0\n"
11792 "subps %%xmm10, %%xmm2\n"
11793 "movaps %%xmm9, %%xmm1\n"
11794 "movaps %%xmm9, %%xmm3\n"
11795 "addps %%xmm11, %%xmm1\n"
11796 "subps %%xmm11, %%xmm3\n"
11797 "movups %%xmm0, (%0)\n"
11798 "movups %%xmm1, (%1)\n"
11799 "movups %%xmm2, (%2)\n"
11800 "movups %%xmm3, (%3)\n"
11801 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11802 );
11803 }
11804 }
11805 return;
11806 }
11807 if (depth == 19) {
11808 helper_float_28_recursive(buf + 0, 16);
11809 helper_float_28_recursive(buf + 65536, 16);
11810 helper_float_28_recursive(buf + 131072, 16);
11811 helper_float_28_recursive(buf + 196608, 16);
11812 helper_float_28_recursive(buf + 262144, 16);
11813 helper_float_28_recursive(buf + 327680, 16);
11814 helper_float_28_recursive(buf + 393216, 16);
11815 helper_float_28_recursive(buf + 458752, 16);
11816 for (int j = 0; j < 524288; j += 524288) {
11817 for (int k = 0; k < 65536; k += 4) {
11818 __asm__ volatile (
11819 "movups (%0), %%xmm0\n"
11820 "movups (%1), %%xmm1\n"
11821 "movups (%2), %%xmm2\n"
11822 "movups (%3), %%xmm3\n"
11823 "movups (%4), %%xmm4\n"
11824 "movups (%5), %%xmm5\n"
11825 "movups (%6), %%xmm6\n"
11826 "movups (%7), %%xmm7\n"
11827 "movaps %%xmm0, %%xmm8\n"
11828 "movaps %%xmm0, %%xmm9\n"
11829 "addps %%xmm1, %%xmm8\n"
11830 "subps %%xmm1, %%xmm9\n"
11831 "movaps %%xmm2, %%xmm10\n"
11832 "movaps %%xmm2, %%xmm11\n"
11833 "addps %%xmm3, %%xmm10\n"
11834 "subps %%xmm3, %%xmm11\n"
11835 "movaps %%xmm4, %%xmm12\n"
11836 "movaps %%xmm4, %%xmm13\n"
11837 "addps %%xmm5, %%xmm12\n"
11838 "subps %%xmm5, %%xmm13\n"
11839 "movaps %%xmm6, %%xmm14\n"
11840 "movaps %%xmm6, %%xmm15\n"
11841 "addps %%xmm7, %%xmm14\n"
11842 "subps %%xmm7, %%xmm15\n"
11843 "movaps %%xmm8, %%xmm0\n"
11844 "movaps %%xmm8, %%xmm2\n"
11845 "addps %%xmm10, %%xmm0\n"
11846 "subps %%xmm10, %%xmm2\n"
11847 "movaps %%xmm9, %%xmm1\n"
11848 "movaps %%xmm9, %%xmm3\n"
11849 "addps %%xmm11, %%xmm1\n"
11850 "subps %%xmm11, %%xmm3\n"
11851 "movaps %%xmm12, %%xmm4\n"
11852 "movaps %%xmm12, %%xmm6\n"
11853 "addps %%xmm14, %%xmm4\n"
11854 "subps %%xmm14, %%xmm6\n"
11855 "movaps %%xmm13, %%xmm5\n"
11856 "movaps %%xmm13, %%xmm7\n"
11857 "addps %%xmm15, %%xmm5\n"
11858 "subps %%xmm15, %%xmm7\n"
11859 "movaps %%xmm0, %%xmm8\n"
11860 "movaps %%xmm0, %%xmm12\n"
11861 "addps %%xmm4, %%xmm8\n"
11862 "subps %%xmm4, %%xmm12\n"
11863 "movaps %%xmm1, %%xmm9\n"
11864 "movaps %%xmm1, %%xmm13\n"
11865 "addps %%xmm5, %%xmm9\n"
11866 "subps %%xmm5, %%xmm13\n"
11867 "movaps %%xmm2, %%xmm10\n"
11868 "movaps %%xmm2, %%xmm14\n"
11869 "addps %%xmm6, %%xmm10\n"
11870 "subps %%xmm6, %%xmm14\n"
11871 "movaps %%xmm3, %%xmm11\n"
11872 "movaps %%xmm3, %%xmm15\n"
11873 "addps %%xmm7, %%xmm11\n"
11874 "subps %%xmm7, %%xmm15\n"
11875 "movups %%xmm8, (%0)\n"
11876 "movups %%xmm9, (%1)\n"
11877 "movups %%xmm10, (%2)\n"
11878 "movups %%xmm11, (%3)\n"
11879 "movups %%xmm12, (%4)\n"
11880 "movups %%xmm13, (%5)\n"
11881 "movups %%xmm14, (%6)\n"
11882 "movups %%xmm15, (%7)\n"
11883 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11884 );
11885 }
11886 }
11887 return;
11888 }
11889 if (depth == 22) {
11890 helper_float_28_recursive(buf + 0, 19);
11891 helper_float_28_recursive(buf + 524288, 19);
11892 helper_float_28_recursive(buf + 1048576, 19);
11893 helper_float_28_recursive(buf + 1572864, 19);
11894 helper_float_28_recursive(buf + 2097152, 19);
11895 helper_float_28_recursive(buf + 2621440, 19);
11896 helper_float_28_recursive(buf + 3145728, 19);
11897 helper_float_28_recursive(buf + 3670016, 19);
11898 for (int j = 0; j < 4194304; j += 4194304) {
11899 for (int k = 0; k < 524288; k += 4) {
11900 __asm__ volatile (
11901 "movups (%0), %%xmm0\n"
11902 "movups (%1), %%xmm1\n"
11903 "movups (%2), %%xmm2\n"
11904 "movups (%3), %%xmm3\n"
11905 "movups (%4), %%xmm4\n"
11906 "movups (%5), %%xmm5\n"
11907 "movups (%6), %%xmm6\n"
11908 "movups (%7), %%xmm7\n"
11909 "movaps %%xmm0, %%xmm8\n"
11910 "movaps %%xmm0, %%xmm9\n"
11911 "addps %%xmm1, %%xmm8\n"
11912 "subps %%xmm1, %%xmm9\n"
11913 "movaps %%xmm2, %%xmm10\n"
11914 "movaps %%xmm2, %%xmm11\n"
11915 "addps %%xmm3, %%xmm10\n"
11916 "subps %%xmm3, %%xmm11\n"
11917 "movaps %%xmm4, %%xmm12\n"
11918 "movaps %%xmm4, %%xmm13\n"
11919 "addps %%xmm5, %%xmm12\n"
11920 "subps %%xmm5, %%xmm13\n"
11921 "movaps %%xmm6, %%xmm14\n"
11922 "movaps %%xmm6, %%xmm15\n"
11923 "addps %%xmm7, %%xmm14\n"
11924 "subps %%xmm7, %%xmm15\n"
11925 "movaps %%xmm8, %%xmm0\n"
11926 "movaps %%xmm8, %%xmm2\n"
11927 "addps %%xmm10, %%xmm0\n"
11928 "subps %%xmm10, %%xmm2\n"
11929 "movaps %%xmm9, %%xmm1\n"
11930 "movaps %%xmm9, %%xmm3\n"
11931 "addps %%xmm11, %%xmm1\n"
11932 "subps %%xmm11, %%xmm3\n"
11933 "movaps %%xmm12, %%xmm4\n"
11934 "movaps %%xmm12, %%xmm6\n"
11935 "addps %%xmm14, %%xmm4\n"
11936 "subps %%xmm14, %%xmm6\n"
11937 "movaps %%xmm13, %%xmm5\n"
11938 "movaps %%xmm13, %%xmm7\n"
11939 "addps %%xmm15, %%xmm5\n"
11940 "subps %%xmm15, %%xmm7\n"
11941 "movaps %%xmm0, %%xmm8\n"
11942 "movaps %%xmm0, %%xmm12\n"
11943 "addps %%xmm4, %%xmm8\n"
11944 "subps %%xmm4, %%xmm12\n"
11945 "movaps %%xmm1, %%xmm9\n"
11946 "movaps %%xmm1, %%xmm13\n"
11947 "addps %%xmm5, %%xmm9\n"
11948 "subps %%xmm5, %%xmm13\n"
11949 "movaps %%xmm2, %%xmm10\n"
11950 "movaps %%xmm2, %%xmm14\n"
11951 "addps %%xmm6, %%xmm10\n"
11952 "subps %%xmm6, %%xmm14\n"
11953 "movaps %%xmm3, %%xmm11\n"
11954 "movaps %%xmm3, %%xmm15\n"
11955 "addps %%xmm7, %%xmm11\n"
11956 "subps %%xmm7, %%xmm15\n"
11957 "movups %%xmm8, (%0)\n"
11958 "movups %%xmm9, (%1)\n"
11959 "movups %%xmm10, (%2)\n"
11960 "movups %%xmm11, (%3)\n"
11961 "movups %%xmm12, (%4)\n"
11962 "movups %%xmm13, (%5)\n"
11963 "movups %%xmm14, (%6)\n"
11964 "movups %%xmm15, (%7)\n"
11965 :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
11966 );
11967 }
11968 }
11969 return;
11970 }
11971 if (depth == 25) {
11972 helper_float_28_recursive(buf + 0, 22);
11973 helper_float_28_recursive(buf + 4194304, 22);
11974 helper_float_28_recursive(buf + 8388608, 22);
11975 helper_float_28_recursive(buf + 12582912, 22);
11976 helper_float_28_recursive(buf + 16777216, 22);
11977 helper_float_28_recursive(buf + 20971520, 22);
11978 helper_float_28_recursive(buf + 25165824, 22);
11979 helper_float_28_recursive(buf + 29360128, 22);
11980 for (int j = 0; j < 33554432; j += 33554432) {
11981 for (int k = 0; k < 4194304; k += 4) {
11982 __asm__ volatile (
11983 "movups (%0), %%xmm0\n"
11984 "movups (%1), %%xmm1\n"
11985 "movups (%2), %%xmm2\n"
11986 "movups (%3), %%xmm3\n"
11987 "movups (%4), %%xmm4\n"
11988 "movups (%5), %%xmm5\n"
11989 "movups (%6), %%xmm6\n"
11990 "movups (%7), %%xmm7\n"
11991 "movaps %%xmm0, %%xmm8\n"
11992 "movaps %%xmm0, %%xmm9\n"
11993 "addps %%xmm1, %%xmm8\n"
11994 "subps %%xmm1, %%xmm9\n"
11995 "movaps %%xmm2, %%xmm10\n"
11996 "movaps %%xmm2, %%xmm11\n"
11997 "addps %%xmm3, %%xmm10\n"
11998 "subps %%xmm3, %%xmm11\n"
11999 "movaps %%xmm4, %%xmm12\n"
12000 "movaps %%xmm4, %%xmm13\n"
12001 "addps %%xmm5, %%xmm12\n"
12002 "subps %%xmm5, %%xmm13\n"
12003 "movaps %%xmm6, %%xmm14\n"
12004 "movaps %%xmm6, %%xmm15\n"
12005 "addps %%xmm7, %%xmm14\n"
12006 "subps %%xmm7, %%xmm15\n"
12007 "movaps %%xmm8, %%xmm0\n"
12008 "movaps %%xmm8, %%xmm2\n"
12009 "addps %%xmm10, %%xmm0\n"
12010 "subps %%xmm10, %%xmm2\n"
12011 "movaps %%xmm9, %%xmm1\n"
12012 "movaps %%xmm9, %%xmm3\n"
12013 "addps %%xmm11, %%xmm1\n"
12014 "subps %%xmm11, %%xmm3\n"
12015 "movaps %%xmm12, %%xmm4\n"
12016 "movaps %%xmm12, %%xmm6\n"
12017 "addps %%xmm14, %%xmm4\n"
12018 "subps %%xmm14, %%xmm6\n"
12019 "movaps %%xmm13, %%xmm5\n"
12020 "movaps %%xmm13, %%xmm7\n"
12021 "addps %%xmm15, %%xmm5\n"
12022 "subps %%xmm15, %%xmm7\n"
12023 "movaps %%xmm0, %%xmm8\n"
12024 "movaps %%xmm0, %%xmm12\n"
12025 "addps %%xmm4, %%xmm8\n"
12026 "subps %%xmm4, %%xmm12\n"
12027 "movaps %%xmm1, %%xmm9\n"
12028 "movaps %%xmm1, %%xmm13\n"
12029 "addps %%xmm5, %%xmm9\n"
12030 "subps %%xmm5, %%xmm13\n"
12031 "movaps %%xmm2, %%xmm10\n"
12032 "movaps %%xmm2, %%xmm14\n"
12033 "addps %%xmm6, %%xmm10\n"
12034 "subps %%xmm6, %%xmm14\n"
12035 "movaps %%xmm3, %%xmm11\n"
12036 "movaps %%xmm3, %%xmm15\n"
12037 "addps %%xmm7, %%xmm11\n"
12038 "subps %%xmm7, %%xmm15\n"
12039 "movups %%xmm8, (%0)\n"
12040 "movups %%xmm9, (%1)\n"
12041 "movups %%xmm10, (%2)\n"
12042 "movups %%xmm11, (%3)\n"
12043 "movups %%xmm12, (%4)\n"
12044 "movups %%xmm13, (%5)\n"
12045 "movups %%xmm14, (%6)\n"
12046 "movups %%xmm15, (%7)\n"
12047 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12048 );
12049 }
12050 }
12051 return;
12052 }
12053 if (depth == 28) {
12054 helper_float_28_recursive(buf + 0, 25);
12055 helper_float_28_recursive(buf + 33554432, 25);
12056 helper_float_28_recursive(buf + 67108864, 25);
12057 helper_float_28_recursive(buf + 100663296, 25);
12058 helper_float_28_recursive(buf + 134217728, 25);
12059 helper_float_28_recursive(buf + 167772160, 25);
12060 helper_float_28_recursive(buf + 201326592, 25);
12061 helper_float_28_recursive(buf + 234881024, 25);
12062 for (int j = 0; j < 268435456; j += 268435456) {
12063 for (int k = 0; k < 33554432; k += 4) {
12064 __asm__ volatile (
12065 "movups (%0), %%xmm0\n"
12066 "movups (%1), %%xmm1\n"
12067 "movups (%2), %%xmm2\n"
12068 "movups (%3), %%xmm3\n"
12069 "movups (%4), %%xmm4\n"
12070 "movups (%5), %%xmm5\n"
12071 "movups (%6), %%xmm6\n"
12072 "movups (%7), %%xmm7\n"
12073 "movaps %%xmm0, %%xmm8\n"
12074 "movaps %%xmm0, %%xmm9\n"
12075 "addps %%xmm1, %%xmm8\n"
12076 "subps %%xmm1, %%xmm9\n"
12077 "movaps %%xmm2, %%xmm10\n"
12078 "movaps %%xmm2, %%xmm11\n"
12079 "addps %%xmm3, %%xmm10\n"
12080 "subps %%xmm3, %%xmm11\n"
12081 "movaps %%xmm4, %%xmm12\n"
12082 "movaps %%xmm4, %%xmm13\n"
12083 "addps %%xmm5, %%xmm12\n"
12084 "subps %%xmm5, %%xmm13\n"
12085 "movaps %%xmm6, %%xmm14\n"
12086 "movaps %%xmm6, %%xmm15\n"
12087 "addps %%xmm7, %%xmm14\n"
12088 "subps %%xmm7, %%xmm15\n"
12089 "movaps %%xmm8, %%xmm0\n"
12090 "movaps %%xmm8, %%xmm2\n"
12091 "addps %%xmm10, %%xmm0\n"
12092 "subps %%xmm10, %%xmm2\n"
12093 "movaps %%xmm9, %%xmm1\n"
12094 "movaps %%xmm9, %%xmm3\n"
12095 "addps %%xmm11, %%xmm1\n"
12096 "subps %%xmm11, %%xmm3\n"
12097 "movaps %%xmm12, %%xmm4\n"
12098 "movaps %%xmm12, %%xmm6\n"
12099 "addps %%xmm14, %%xmm4\n"
12100 "subps %%xmm14, %%xmm6\n"
12101 "movaps %%xmm13, %%xmm5\n"
12102 "movaps %%xmm13, %%xmm7\n"
12103 "addps %%xmm15, %%xmm5\n"
12104 "subps %%xmm15, %%xmm7\n"
12105 "movaps %%xmm0, %%xmm8\n"
12106 "movaps %%xmm0, %%xmm12\n"
12107 "addps %%xmm4, %%xmm8\n"
12108 "subps %%xmm4, %%xmm12\n"
12109 "movaps %%xmm1, %%xmm9\n"
12110 "movaps %%xmm1, %%xmm13\n"
12111 "addps %%xmm5, %%xmm9\n"
12112 "subps %%xmm5, %%xmm13\n"
12113 "movaps %%xmm2, %%xmm10\n"
12114 "movaps %%xmm2, %%xmm14\n"
12115 "addps %%xmm6, %%xmm10\n"
12116 "subps %%xmm6, %%xmm14\n"
12117 "movaps %%xmm3, %%xmm11\n"
12118 "movaps %%xmm3, %%xmm15\n"
12119 "addps %%xmm7, %%xmm11\n"
12120 "subps %%xmm7, %%xmm15\n"
12121 "movups %%xmm8, (%0)\n"
12122 "movups %%xmm9, (%1)\n"
12123 "movups %%xmm10, (%2)\n"
12124 "movups %%xmm11, (%3)\n"
12125 "movups %%xmm12, (%4)\n"
12126 "movups %%xmm13, (%5)\n"
12127 "movups %%xmm14, (%6)\n"
12128 "movups %%xmm15, (%7)\n"
12129 :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12130 );
12131 }
12132 }
12133 return;
12134 }
12135 }
12136 void helper_float_28(float *buf);
/* Public entry point for the size-2^28 in-place transform (presumably a Fast
   Hadamard Transform, per "fht.h" — TODO confirm against the header's contract).
   buf must point to 2^28 = 268435456 floats: the depth-28 recursion below it
   touches offsets up to buf + 268435455. Simply dispatches into the recursive
   kernel at full depth. */
helper_float_28(float * buf)12137 void helper_float_28(float *buf) {
12138 	helper_float_28_recursive(buf, 28);
12139 }
12140 void helper_float_29_recursive(float *buf, int depth);
helper_float_29_recursive(float * buf,int depth)12141 void helper_float_29_recursive(float *buf, int depth) {
12142 if (depth == 12) {
12143 for (int j = 0; j < 4096; j += 32) {
12144 for (int k = 0; k < 4; k += 4) {
12145 __asm__ volatile (
12146 "movups (%0), %%xmm0\n"
12147 "movups (%1), %%xmm1\n"
12148 "movups (%2), %%xmm2\n"
12149 "movups (%3), %%xmm3\n"
12150 "movups (%4), %%xmm4\n"
12151 "movups (%5), %%xmm5\n"
12152 "movups (%6), %%xmm6\n"
12153 "movups (%7), %%xmm7\n"
12154 "movaps %%xmm0, %%xmm8\n"
12155 "shufps $160, %%xmm8, %%xmm8\n"
12156 "shufps $245, %%xmm0, %%xmm0\n"
12157 "xorps %%xmm9, %%xmm9\n"
12158 "subps %%xmm0, %%xmm9\n"
12159 "addsubps %%xmm9, %%xmm8\n"
12160 "movaps %%xmm8, %%xmm0\n"
12161 "movaps %%xmm1, %%xmm8\n"
12162 "shufps $160, %%xmm8, %%xmm8\n"
12163 "shufps $245, %%xmm1, %%xmm1\n"
12164 "xorps %%xmm9, %%xmm9\n"
12165 "subps %%xmm1, %%xmm9\n"
12166 "addsubps %%xmm9, %%xmm8\n"
12167 "movaps %%xmm8, %%xmm1\n"
12168 "movaps %%xmm2, %%xmm8\n"
12169 "shufps $160, %%xmm8, %%xmm8\n"
12170 "shufps $245, %%xmm2, %%xmm2\n"
12171 "xorps %%xmm9, %%xmm9\n"
12172 "subps %%xmm2, %%xmm9\n"
12173 "addsubps %%xmm9, %%xmm8\n"
12174 "movaps %%xmm8, %%xmm2\n"
12175 "movaps %%xmm3, %%xmm8\n"
12176 "shufps $160, %%xmm8, %%xmm8\n"
12177 "shufps $245, %%xmm3, %%xmm3\n"
12178 "xorps %%xmm9, %%xmm9\n"
12179 "subps %%xmm3, %%xmm9\n"
12180 "addsubps %%xmm9, %%xmm8\n"
12181 "movaps %%xmm8, %%xmm3\n"
12182 "movaps %%xmm4, %%xmm8\n"
12183 "shufps $160, %%xmm8, %%xmm8\n"
12184 "shufps $245, %%xmm4, %%xmm4\n"
12185 "xorps %%xmm9, %%xmm9\n"
12186 "subps %%xmm4, %%xmm9\n"
12187 "addsubps %%xmm9, %%xmm8\n"
12188 "movaps %%xmm8, %%xmm4\n"
12189 "movaps %%xmm5, %%xmm8\n"
12190 "shufps $160, %%xmm8, %%xmm8\n"
12191 "shufps $245, %%xmm5, %%xmm5\n"
12192 "xorps %%xmm9, %%xmm9\n"
12193 "subps %%xmm5, %%xmm9\n"
12194 "addsubps %%xmm9, %%xmm8\n"
12195 "movaps %%xmm8, %%xmm5\n"
12196 "movaps %%xmm6, %%xmm8\n"
12197 "shufps $160, %%xmm8, %%xmm8\n"
12198 "shufps $245, %%xmm6, %%xmm6\n"
12199 "xorps %%xmm9, %%xmm9\n"
12200 "subps %%xmm6, %%xmm9\n"
12201 "addsubps %%xmm9, %%xmm8\n"
12202 "movaps %%xmm8, %%xmm6\n"
12203 "movaps %%xmm7, %%xmm8\n"
12204 "shufps $160, %%xmm8, %%xmm8\n"
12205 "shufps $245, %%xmm7, %%xmm7\n"
12206 "xorps %%xmm9, %%xmm9\n"
12207 "subps %%xmm7, %%xmm9\n"
12208 "addsubps %%xmm9, %%xmm8\n"
12209 "movaps %%xmm8, %%xmm7\n"
12210 "movaps %%xmm0, %%xmm8\n"
12211 "shufps $68, %%xmm8, %%xmm8\n"
12212 "xorps %%xmm9, %%xmm9\n"
12213 "movaps %%xmm0, %%xmm10\n"
12214 "shufps $14, %%xmm9, %%xmm10\n"
12215 "movaps %%xmm0, %%xmm11\n"
12216 "shufps $224, %%xmm11, %%xmm9\n"
12217 "addps %%xmm8, %%xmm10\n"
12218 "subps %%xmm9, %%xmm10\n"
12219 "movaps %%xmm10, %%xmm0\n"
12220 "movaps %%xmm1, %%xmm8\n"
12221 "shufps $68, %%xmm8, %%xmm8\n"
12222 "xorps %%xmm9, %%xmm9\n"
12223 "movaps %%xmm1, %%xmm10\n"
12224 "shufps $14, %%xmm9, %%xmm10\n"
12225 "movaps %%xmm1, %%xmm11\n"
12226 "shufps $224, %%xmm11, %%xmm9\n"
12227 "addps %%xmm8, %%xmm10\n"
12228 "subps %%xmm9, %%xmm10\n"
12229 "movaps %%xmm10, %%xmm1\n"
12230 "movaps %%xmm2, %%xmm8\n"
12231 "shufps $68, %%xmm8, %%xmm8\n"
12232 "xorps %%xmm9, %%xmm9\n"
12233 "movaps %%xmm2, %%xmm10\n"
12234 "shufps $14, %%xmm9, %%xmm10\n"
12235 "movaps %%xmm2, %%xmm11\n"
12236 "shufps $224, %%xmm11, %%xmm9\n"
12237 "addps %%xmm8, %%xmm10\n"
12238 "subps %%xmm9, %%xmm10\n"
12239 "movaps %%xmm10, %%xmm2\n"
12240 "movaps %%xmm3, %%xmm8\n"
12241 "shufps $68, %%xmm8, %%xmm8\n"
12242 "xorps %%xmm9, %%xmm9\n"
12243 "movaps %%xmm3, %%xmm10\n"
12244 "shufps $14, %%xmm9, %%xmm10\n"
12245 "movaps %%xmm3, %%xmm11\n"
12246 "shufps $224, %%xmm11, %%xmm9\n"
12247 "addps %%xmm8, %%xmm10\n"
12248 "subps %%xmm9, %%xmm10\n"
12249 "movaps %%xmm10, %%xmm3\n"
12250 "movaps %%xmm4, %%xmm8\n"
12251 "shufps $68, %%xmm8, %%xmm8\n"
12252 "xorps %%xmm9, %%xmm9\n"
12253 "movaps %%xmm4, %%xmm10\n"
12254 "shufps $14, %%xmm9, %%xmm10\n"
12255 "movaps %%xmm4, %%xmm11\n"
12256 "shufps $224, %%xmm11, %%xmm9\n"
12257 "addps %%xmm8, %%xmm10\n"
12258 "subps %%xmm9, %%xmm10\n"
12259 "movaps %%xmm10, %%xmm4\n"
12260 "movaps %%xmm5, %%xmm8\n"
12261 "shufps $68, %%xmm8, %%xmm8\n"
12262 "xorps %%xmm9, %%xmm9\n"
12263 "movaps %%xmm5, %%xmm10\n"
12264 "shufps $14, %%xmm9, %%xmm10\n"
12265 "movaps %%xmm5, %%xmm11\n"
12266 "shufps $224, %%xmm11, %%xmm9\n"
12267 "addps %%xmm8, %%xmm10\n"
12268 "subps %%xmm9, %%xmm10\n"
12269 "movaps %%xmm10, %%xmm5\n"
12270 "movaps %%xmm6, %%xmm8\n"
12271 "shufps $68, %%xmm8, %%xmm8\n"
12272 "xorps %%xmm9, %%xmm9\n"
12273 "movaps %%xmm6, %%xmm10\n"
12274 "shufps $14, %%xmm9, %%xmm10\n"
12275 "movaps %%xmm6, %%xmm11\n"
12276 "shufps $224, %%xmm11, %%xmm9\n"
12277 "addps %%xmm8, %%xmm10\n"
12278 "subps %%xmm9, %%xmm10\n"
12279 "movaps %%xmm10, %%xmm6\n"
12280 "movaps %%xmm7, %%xmm8\n"
12281 "shufps $68, %%xmm8, %%xmm8\n"
12282 "xorps %%xmm9, %%xmm9\n"
12283 "movaps %%xmm7, %%xmm10\n"
12284 "shufps $14, %%xmm9, %%xmm10\n"
12285 "movaps %%xmm7, %%xmm11\n"
12286 "shufps $224, %%xmm11, %%xmm9\n"
12287 "addps %%xmm8, %%xmm10\n"
12288 "subps %%xmm9, %%xmm10\n"
12289 "movaps %%xmm10, %%xmm7\n"
12290 "movaps %%xmm0, %%xmm8\n"
12291 "movaps %%xmm0, %%xmm9\n"
12292 "addps %%xmm1, %%xmm8\n"
12293 "subps %%xmm1, %%xmm9\n"
12294 "movaps %%xmm2, %%xmm10\n"
12295 "movaps %%xmm2, %%xmm11\n"
12296 "addps %%xmm3, %%xmm10\n"
12297 "subps %%xmm3, %%xmm11\n"
12298 "movaps %%xmm4, %%xmm12\n"
12299 "movaps %%xmm4, %%xmm13\n"
12300 "addps %%xmm5, %%xmm12\n"
12301 "subps %%xmm5, %%xmm13\n"
12302 "movaps %%xmm6, %%xmm14\n"
12303 "movaps %%xmm6, %%xmm15\n"
12304 "addps %%xmm7, %%xmm14\n"
12305 "subps %%xmm7, %%xmm15\n"
12306 "movaps %%xmm8, %%xmm0\n"
12307 "movaps %%xmm8, %%xmm2\n"
12308 "addps %%xmm10, %%xmm0\n"
12309 "subps %%xmm10, %%xmm2\n"
12310 "movaps %%xmm9, %%xmm1\n"
12311 "movaps %%xmm9, %%xmm3\n"
12312 "addps %%xmm11, %%xmm1\n"
12313 "subps %%xmm11, %%xmm3\n"
12314 "movaps %%xmm12, %%xmm4\n"
12315 "movaps %%xmm12, %%xmm6\n"
12316 "addps %%xmm14, %%xmm4\n"
12317 "subps %%xmm14, %%xmm6\n"
12318 "movaps %%xmm13, %%xmm5\n"
12319 "movaps %%xmm13, %%xmm7\n"
12320 "addps %%xmm15, %%xmm5\n"
12321 "subps %%xmm15, %%xmm7\n"
12322 "movaps %%xmm0, %%xmm8\n"
12323 "movaps %%xmm0, %%xmm12\n"
12324 "addps %%xmm4, %%xmm8\n"
12325 "subps %%xmm4, %%xmm12\n"
12326 "movaps %%xmm1, %%xmm9\n"
12327 "movaps %%xmm1, %%xmm13\n"
12328 "addps %%xmm5, %%xmm9\n"
12329 "subps %%xmm5, %%xmm13\n"
12330 "movaps %%xmm2, %%xmm10\n"
12331 "movaps %%xmm2, %%xmm14\n"
12332 "addps %%xmm6, %%xmm10\n"
12333 "subps %%xmm6, %%xmm14\n"
12334 "movaps %%xmm3, %%xmm11\n"
12335 "movaps %%xmm3, %%xmm15\n"
12336 "addps %%xmm7, %%xmm11\n"
12337 "subps %%xmm7, %%xmm15\n"
12338 "movups %%xmm8, (%0)\n"
12339 "movups %%xmm9, (%1)\n"
12340 "movups %%xmm10, (%2)\n"
12341 "movups %%xmm11, (%3)\n"
12342 "movups %%xmm12, (%4)\n"
12343 "movups %%xmm13, (%5)\n"
12344 "movups %%xmm14, (%6)\n"
12345 "movups %%xmm15, (%7)\n"
12346 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12347 );
12348 }
12349 }
12350 for (int j = 0; j < 4096; j += 256) {
12351 for (int k = 0; k < 32; k += 4) {
12352 __asm__ volatile (
12353 "movups (%0), %%xmm0\n"
12354 "movups (%1), %%xmm1\n"
12355 "movups (%2), %%xmm2\n"
12356 "movups (%3), %%xmm3\n"
12357 "movups (%4), %%xmm4\n"
12358 "movups (%5), %%xmm5\n"
12359 "movups (%6), %%xmm6\n"
12360 "movups (%7), %%xmm7\n"
12361 "movaps %%xmm0, %%xmm8\n"
12362 "movaps %%xmm0, %%xmm9\n"
12363 "addps %%xmm1, %%xmm8\n"
12364 "subps %%xmm1, %%xmm9\n"
12365 "movaps %%xmm2, %%xmm10\n"
12366 "movaps %%xmm2, %%xmm11\n"
12367 "addps %%xmm3, %%xmm10\n"
12368 "subps %%xmm3, %%xmm11\n"
12369 "movaps %%xmm4, %%xmm12\n"
12370 "movaps %%xmm4, %%xmm13\n"
12371 "addps %%xmm5, %%xmm12\n"
12372 "subps %%xmm5, %%xmm13\n"
12373 "movaps %%xmm6, %%xmm14\n"
12374 "movaps %%xmm6, %%xmm15\n"
12375 "addps %%xmm7, %%xmm14\n"
12376 "subps %%xmm7, %%xmm15\n"
12377 "movaps %%xmm8, %%xmm0\n"
12378 "movaps %%xmm8, %%xmm2\n"
12379 "addps %%xmm10, %%xmm0\n"
12380 "subps %%xmm10, %%xmm2\n"
12381 "movaps %%xmm9, %%xmm1\n"
12382 "movaps %%xmm9, %%xmm3\n"
12383 "addps %%xmm11, %%xmm1\n"
12384 "subps %%xmm11, %%xmm3\n"
12385 "movaps %%xmm12, %%xmm4\n"
12386 "movaps %%xmm12, %%xmm6\n"
12387 "addps %%xmm14, %%xmm4\n"
12388 "subps %%xmm14, %%xmm6\n"
12389 "movaps %%xmm13, %%xmm5\n"
12390 "movaps %%xmm13, %%xmm7\n"
12391 "addps %%xmm15, %%xmm5\n"
12392 "subps %%xmm15, %%xmm7\n"
12393 "movaps %%xmm0, %%xmm8\n"
12394 "movaps %%xmm0, %%xmm12\n"
12395 "addps %%xmm4, %%xmm8\n"
12396 "subps %%xmm4, %%xmm12\n"
12397 "movaps %%xmm1, %%xmm9\n"
12398 "movaps %%xmm1, %%xmm13\n"
12399 "addps %%xmm5, %%xmm9\n"
12400 "subps %%xmm5, %%xmm13\n"
12401 "movaps %%xmm2, %%xmm10\n"
12402 "movaps %%xmm2, %%xmm14\n"
12403 "addps %%xmm6, %%xmm10\n"
12404 "subps %%xmm6, %%xmm14\n"
12405 "movaps %%xmm3, %%xmm11\n"
12406 "movaps %%xmm3, %%xmm15\n"
12407 "addps %%xmm7, %%xmm11\n"
12408 "subps %%xmm7, %%xmm15\n"
12409 "movups %%xmm8, (%0)\n"
12410 "movups %%xmm9, (%1)\n"
12411 "movups %%xmm10, (%2)\n"
12412 "movups %%xmm11, (%3)\n"
12413 "movups %%xmm12, (%4)\n"
12414 "movups %%xmm13, (%5)\n"
12415 "movups %%xmm14, (%6)\n"
12416 "movups %%xmm15, (%7)\n"
12417 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12418 );
12419 }
12420 }
12421 for (int j = 0; j < 4096; j += 2048) {
12422 for (int k = 0; k < 256; k += 4) {
12423 __asm__ volatile (
12424 "movups (%0), %%xmm0\n"
12425 "movups (%1), %%xmm1\n"
12426 "movups (%2), %%xmm2\n"
12427 "movups (%3), %%xmm3\n"
12428 "movups (%4), %%xmm4\n"
12429 "movups (%5), %%xmm5\n"
12430 "movups (%6), %%xmm6\n"
12431 "movups (%7), %%xmm7\n"
12432 "movaps %%xmm0, %%xmm8\n"
12433 "movaps %%xmm0, %%xmm9\n"
12434 "addps %%xmm1, %%xmm8\n"
12435 "subps %%xmm1, %%xmm9\n"
12436 "movaps %%xmm2, %%xmm10\n"
12437 "movaps %%xmm2, %%xmm11\n"
12438 "addps %%xmm3, %%xmm10\n"
12439 "subps %%xmm3, %%xmm11\n"
12440 "movaps %%xmm4, %%xmm12\n"
12441 "movaps %%xmm4, %%xmm13\n"
12442 "addps %%xmm5, %%xmm12\n"
12443 "subps %%xmm5, %%xmm13\n"
12444 "movaps %%xmm6, %%xmm14\n"
12445 "movaps %%xmm6, %%xmm15\n"
12446 "addps %%xmm7, %%xmm14\n"
12447 "subps %%xmm7, %%xmm15\n"
12448 "movaps %%xmm8, %%xmm0\n"
12449 "movaps %%xmm8, %%xmm2\n"
12450 "addps %%xmm10, %%xmm0\n"
12451 "subps %%xmm10, %%xmm2\n"
12452 "movaps %%xmm9, %%xmm1\n"
12453 "movaps %%xmm9, %%xmm3\n"
12454 "addps %%xmm11, %%xmm1\n"
12455 "subps %%xmm11, %%xmm3\n"
12456 "movaps %%xmm12, %%xmm4\n"
12457 "movaps %%xmm12, %%xmm6\n"
12458 "addps %%xmm14, %%xmm4\n"
12459 "subps %%xmm14, %%xmm6\n"
12460 "movaps %%xmm13, %%xmm5\n"
12461 "movaps %%xmm13, %%xmm7\n"
12462 "addps %%xmm15, %%xmm5\n"
12463 "subps %%xmm15, %%xmm7\n"
12464 "movaps %%xmm0, %%xmm8\n"
12465 "movaps %%xmm0, %%xmm12\n"
12466 "addps %%xmm4, %%xmm8\n"
12467 "subps %%xmm4, %%xmm12\n"
12468 "movaps %%xmm1, %%xmm9\n"
12469 "movaps %%xmm1, %%xmm13\n"
12470 "addps %%xmm5, %%xmm9\n"
12471 "subps %%xmm5, %%xmm13\n"
12472 "movaps %%xmm2, %%xmm10\n"
12473 "movaps %%xmm2, %%xmm14\n"
12474 "addps %%xmm6, %%xmm10\n"
12475 "subps %%xmm6, %%xmm14\n"
12476 "movaps %%xmm3, %%xmm11\n"
12477 "movaps %%xmm3, %%xmm15\n"
12478 "addps %%xmm7, %%xmm11\n"
12479 "subps %%xmm7, %%xmm15\n"
12480 "movups %%xmm8, (%0)\n"
12481 "movups %%xmm9, (%1)\n"
12482 "movups %%xmm10, (%2)\n"
12483 "movups %%xmm11, (%3)\n"
12484 "movups %%xmm12, (%4)\n"
12485 "movups %%xmm13, (%5)\n"
12486 "movups %%xmm14, (%6)\n"
12487 "movups %%xmm15, (%7)\n"
12488 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12489 );
12490 }
12491 }
12492 for (int j = 0; j < 4096; j += 4096) {
12493 for (int k = 0; k < 2048; k += 4) {
12494 __asm__ volatile (
12495 "movups (%0), %%xmm0\n"
12496 "movups (%1), %%xmm1\n"
12497 "movaps %%xmm0, %%xmm8\n"
12498 "movaps %%xmm0, %%xmm9\n"
12499 "addps %%xmm1, %%xmm8\n"
12500 "subps %%xmm1, %%xmm9\n"
12501 "movups %%xmm8, (%0)\n"
12502 "movups %%xmm9, (%1)\n"
12503 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12504 );
12505 }
12506 }
12507 return;
12508 }
12509 if (depth == 15) {
12510 helper_float_29_recursive(buf + 0, 12);
12511 helper_float_29_recursive(buf + 4096, 12);
12512 helper_float_29_recursive(buf + 8192, 12);
12513 helper_float_29_recursive(buf + 12288, 12);
12514 helper_float_29_recursive(buf + 16384, 12);
12515 helper_float_29_recursive(buf + 20480, 12);
12516 helper_float_29_recursive(buf + 24576, 12);
12517 helper_float_29_recursive(buf + 28672, 12);
12518 for (int j = 0; j < 32768; j += 32768) {
12519 for (int k = 0; k < 4096; k += 4) {
12520 __asm__ volatile (
12521 "movups (%0), %%xmm0\n"
12522 "movups (%1), %%xmm1\n"
12523 "movups (%2), %%xmm2\n"
12524 "movups (%3), %%xmm3\n"
12525 "movups (%4), %%xmm4\n"
12526 "movups (%5), %%xmm5\n"
12527 "movups (%6), %%xmm6\n"
12528 "movups (%7), %%xmm7\n"
12529 "movaps %%xmm0, %%xmm8\n"
12530 "movaps %%xmm0, %%xmm9\n"
12531 "addps %%xmm1, %%xmm8\n"
12532 "subps %%xmm1, %%xmm9\n"
12533 "movaps %%xmm2, %%xmm10\n"
12534 "movaps %%xmm2, %%xmm11\n"
12535 "addps %%xmm3, %%xmm10\n"
12536 "subps %%xmm3, %%xmm11\n"
12537 "movaps %%xmm4, %%xmm12\n"
12538 "movaps %%xmm4, %%xmm13\n"
12539 "addps %%xmm5, %%xmm12\n"
12540 "subps %%xmm5, %%xmm13\n"
12541 "movaps %%xmm6, %%xmm14\n"
12542 "movaps %%xmm6, %%xmm15\n"
12543 "addps %%xmm7, %%xmm14\n"
12544 "subps %%xmm7, %%xmm15\n"
12545 "movaps %%xmm8, %%xmm0\n"
12546 "movaps %%xmm8, %%xmm2\n"
12547 "addps %%xmm10, %%xmm0\n"
12548 "subps %%xmm10, %%xmm2\n"
12549 "movaps %%xmm9, %%xmm1\n"
12550 "movaps %%xmm9, %%xmm3\n"
12551 "addps %%xmm11, %%xmm1\n"
12552 "subps %%xmm11, %%xmm3\n"
12553 "movaps %%xmm12, %%xmm4\n"
12554 "movaps %%xmm12, %%xmm6\n"
12555 "addps %%xmm14, %%xmm4\n"
12556 "subps %%xmm14, %%xmm6\n"
12557 "movaps %%xmm13, %%xmm5\n"
12558 "movaps %%xmm13, %%xmm7\n"
12559 "addps %%xmm15, %%xmm5\n"
12560 "subps %%xmm15, %%xmm7\n"
12561 "movaps %%xmm0, %%xmm8\n"
12562 "movaps %%xmm0, %%xmm12\n"
12563 "addps %%xmm4, %%xmm8\n"
12564 "subps %%xmm4, %%xmm12\n"
12565 "movaps %%xmm1, %%xmm9\n"
12566 "movaps %%xmm1, %%xmm13\n"
12567 "addps %%xmm5, %%xmm9\n"
12568 "subps %%xmm5, %%xmm13\n"
12569 "movaps %%xmm2, %%xmm10\n"
12570 "movaps %%xmm2, %%xmm14\n"
12571 "addps %%xmm6, %%xmm10\n"
12572 "subps %%xmm6, %%xmm14\n"
12573 "movaps %%xmm3, %%xmm11\n"
12574 "movaps %%xmm3, %%xmm15\n"
12575 "addps %%xmm7, %%xmm11\n"
12576 "subps %%xmm7, %%xmm15\n"
12577 "movups %%xmm8, (%0)\n"
12578 "movups %%xmm9, (%1)\n"
12579 "movups %%xmm10, (%2)\n"
12580 "movups %%xmm11, (%3)\n"
12581 "movups %%xmm12, (%4)\n"
12582 "movups %%xmm13, (%5)\n"
12583 "movups %%xmm14, (%6)\n"
12584 "movups %%xmm15, (%7)\n"
12585 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12586 );
12587 }
12588 }
12589 return;
12590 }
12591 if (depth == 18) {
12592 helper_float_29_recursive(buf + 0, 15);
12593 helper_float_29_recursive(buf + 32768, 15);
12594 helper_float_29_recursive(buf + 65536, 15);
12595 helper_float_29_recursive(buf + 98304, 15);
12596 helper_float_29_recursive(buf + 131072, 15);
12597 helper_float_29_recursive(buf + 163840, 15);
12598 helper_float_29_recursive(buf + 196608, 15);
12599 helper_float_29_recursive(buf + 229376, 15);
12600 for (int j = 0; j < 262144; j += 262144) {
12601 for (int k = 0; k < 32768; k += 4) {
12602 __asm__ volatile (
12603 "movups (%0), %%xmm0\n"
12604 "movups (%1), %%xmm1\n"
12605 "movups (%2), %%xmm2\n"
12606 "movups (%3), %%xmm3\n"
12607 "movups (%4), %%xmm4\n"
12608 "movups (%5), %%xmm5\n"
12609 "movups (%6), %%xmm6\n"
12610 "movups (%7), %%xmm7\n"
12611 "movaps %%xmm0, %%xmm8\n"
12612 "movaps %%xmm0, %%xmm9\n"
12613 "addps %%xmm1, %%xmm8\n"
12614 "subps %%xmm1, %%xmm9\n"
12615 "movaps %%xmm2, %%xmm10\n"
12616 "movaps %%xmm2, %%xmm11\n"
12617 "addps %%xmm3, %%xmm10\n"
12618 "subps %%xmm3, %%xmm11\n"
12619 "movaps %%xmm4, %%xmm12\n"
12620 "movaps %%xmm4, %%xmm13\n"
12621 "addps %%xmm5, %%xmm12\n"
12622 "subps %%xmm5, %%xmm13\n"
12623 "movaps %%xmm6, %%xmm14\n"
12624 "movaps %%xmm6, %%xmm15\n"
12625 "addps %%xmm7, %%xmm14\n"
12626 "subps %%xmm7, %%xmm15\n"
12627 "movaps %%xmm8, %%xmm0\n"
12628 "movaps %%xmm8, %%xmm2\n"
12629 "addps %%xmm10, %%xmm0\n"
12630 "subps %%xmm10, %%xmm2\n"
12631 "movaps %%xmm9, %%xmm1\n"
12632 "movaps %%xmm9, %%xmm3\n"
12633 "addps %%xmm11, %%xmm1\n"
12634 "subps %%xmm11, %%xmm3\n"
12635 "movaps %%xmm12, %%xmm4\n"
12636 "movaps %%xmm12, %%xmm6\n"
12637 "addps %%xmm14, %%xmm4\n"
12638 "subps %%xmm14, %%xmm6\n"
12639 "movaps %%xmm13, %%xmm5\n"
12640 "movaps %%xmm13, %%xmm7\n"
12641 "addps %%xmm15, %%xmm5\n"
12642 "subps %%xmm15, %%xmm7\n"
12643 "movaps %%xmm0, %%xmm8\n"
12644 "movaps %%xmm0, %%xmm12\n"
12645 "addps %%xmm4, %%xmm8\n"
12646 "subps %%xmm4, %%xmm12\n"
12647 "movaps %%xmm1, %%xmm9\n"
12648 "movaps %%xmm1, %%xmm13\n"
12649 "addps %%xmm5, %%xmm9\n"
12650 "subps %%xmm5, %%xmm13\n"
12651 "movaps %%xmm2, %%xmm10\n"
12652 "movaps %%xmm2, %%xmm14\n"
12653 "addps %%xmm6, %%xmm10\n"
12654 "subps %%xmm6, %%xmm14\n"
12655 "movaps %%xmm3, %%xmm11\n"
12656 "movaps %%xmm3, %%xmm15\n"
12657 "addps %%xmm7, %%xmm11\n"
12658 "subps %%xmm7, %%xmm15\n"
12659 "movups %%xmm8, (%0)\n"
12660 "movups %%xmm9, (%1)\n"
12661 "movups %%xmm10, (%2)\n"
12662 "movups %%xmm11, (%3)\n"
12663 "movups %%xmm12, (%4)\n"
12664 "movups %%xmm13, (%5)\n"
12665 "movups %%xmm14, (%6)\n"
12666 "movups %%xmm15, (%7)\n"
12667 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12668 );
12669 }
12670 }
12671 return;
12672 }
12673 if (depth == 21) {
12674 helper_float_29_recursive(buf + 0, 18);
12675 helper_float_29_recursive(buf + 262144, 18);
12676 helper_float_29_recursive(buf + 524288, 18);
12677 helper_float_29_recursive(buf + 786432, 18);
12678 helper_float_29_recursive(buf + 1048576, 18);
12679 helper_float_29_recursive(buf + 1310720, 18);
12680 helper_float_29_recursive(buf + 1572864, 18);
12681 helper_float_29_recursive(buf + 1835008, 18);
12682 for (int j = 0; j < 2097152; j += 2097152) {
12683 for (int k = 0; k < 262144; k += 4) {
12684 __asm__ volatile (
12685 "movups (%0), %%xmm0\n"
12686 "movups (%1), %%xmm1\n"
12687 "movups (%2), %%xmm2\n"
12688 "movups (%3), %%xmm3\n"
12689 "movups (%4), %%xmm4\n"
12690 "movups (%5), %%xmm5\n"
12691 "movups (%6), %%xmm6\n"
12692 "movups (%7), %%xmm7\n"
12693 "movaps %%xmm0, %%xmm8\n"
12694 "movaps %%xmm0, %%xmm9\n"
12695 "addps %%xmm1, %%xmm8\n"
12696 "subps %%xmm1, %%xmm9\n"
12697 "movaps %%xmm2, %%xmm10\n"
12698 "movaps %%xmm2, %%xmm11\n"
12699 "addps %%xmm3, %%xmm10\n"
12700 "subps %%xmm3, %%xmm11\n"
12701 "movaps %%xmm4, %%xmm12\n"
12702 "movaps %%xmm4, %%xmm13\n"
12703 "addps %%xmm5, %%xmm12\n"
12704 "subps %%xmm5, %%xmm13\n"
12705 "movaps %%xmm6, %%xmm14\n"
12706 "movaps %%xmm6, %%xmm15\n"
12707 "addps %%xmm7, %%xmm14\n"
12708 "subps %%xmm7, %%xmm15\n"
12709 "movaps %%xmm8, %%xmm0\n"
12710 "movaps %%xmm8, %%xmm2\n"
12711 "addps %%xmm10, %%xmm0\n"
12712 "subps %%xmm10, %%xmm2\n"
12713 "movaps %%xmm9, %%xmm1\n"
12714 "movaps %%xmm9, %%xmm3\n"
12715 "addps %%xmm11, %%xmm1\n"
12716 "subps %%xmm11, %%xmm3\n"
12717 "movaps %%xmm12, %%xmm4\n"
12718 "movaps %%xmm12, %%xmm6\n"
12719 "addps %%xmm14, %%xmm4\n"
12720 "subps %%xmm14, %%xmm6\n"
12721 "movaps %%xmm13, %%xmm5\n"
12722 "movaps %%xmm13, %%xmm7\n"
12723 "addps %%xmm15, %%xmm5\n"
12724 "subps %%xmm15, %%xmm7\n"
12725 "movaps %%xmm0, %%xmm8\n"
12726 "movaps %%xmm0, %%xmm12\n"
12727 "addps %%xmm4, %%xmm8\n"
12728 "subps %%xmm4, %%xmm12\n"
12729 "movaps %%xmm1, %%xmm9\n"
12730 "movaps %%xmm1, %%xmm13\n"
12731 "addps %%xmm5, %%xmm9\n"
12732 "subps %%xmm5, %%xmm13\n"
12733 "movaps %%xmm2, %%xmm10\n"
12734 "movaps %%xmm2, %%xmm14\n"
12735 "addps %%xmm6, %%xmm10\n"
12736 "subps %%xmm6, %%xmm14\n"
12737 "movaps %%xmm3, %%xmm11\n"
12738 "movaps %%xmm3, %%xmm15\n"
12739 "addps %%xmm7, %%xmm11\n"
12740 "subps %%xmm7, %%xmm15\n"
12741 "movups %%xmm8, (%0)\n"
12742 "movups %%xmm9, (%1)\n"
12743 "movups %%xmm10, (%2)\n"
12744 "movups %%xmm11, (%3)\n"
12745 "movups %%xmm12, (%4)\n"
12746 "movups %%xmm13, (%5)\n"
12747 "movups %%xmm14, (%6)\n"
12748 "movups %%xmm15, (%7)\n"
12749 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12750 );
12751 }
12752 }
12753 return;
12754 }
12755 if (depth == 24) {
12756 helper_float_29_recursive(buf + 0, 21);
12757 helper_float_29_recursive(buf + 2097152, 21);
12758 helper_float_29_recursive(buf + 4194304, 21);
12759 helper_float_29_recursive(buf + 6291456, 21);
12760 helper_float_29_recursive(buf + 8388608, 21);
12761 helper_float_29_recursive(buf + 10485760, 21);
12762 helper_float_29_recursive(buf + 12582912, 21);
12763 helper_float_29_recursive(buf + 14680064, 21);
12764 for (int j = 0; j < 16777216; j += 16777216) {
12765 for (int k = 0; k < 2097152; k += 4) {
12766 __asm__ volatile (
12767 "movups (%0), %%xmm0\n"
12768 "movups (%1), %%xmm1\n"
12769 "movups (%2), %%xmm2\n"
12770 "movups (%3), %%xmm3\n"
12771 "movups (%4), %%xmm4\n"
12772 "movups (%5), %%xmm5\n"
12773 "movups (%6), %%xmm6\n"
12774 "movups (%7), %%xmm7\n"
12775 "movaps %%xmm0, %%xmm8\n"
12776 "movaps %%xmm0, %%xmm9\n"
12777 "addps %%xmm1, %%xmm8\n"
12778 "subps %%xmm1, %%xmm9\n"
12779 "movaps %%xmm2, %%xmm10\n"
12780 "movaps %%xmm2, %%xmm11\n"
12781 "addps %%xmm3, %%xmm10\n"
12782 "subps %%xmm3, %%xmm11\n"
12783 "movaps %%xmm4, %%xmm12\n"
12784 "movaps %%xmm4, %%xmm13\n"
12785 "addps %%xmm5, %%xmm12\n"
12786 "subps %%xmm5, %%xmm13\n"
12787 "movaps %%xmm6, %%xmm14\n"
12788 "movaps %%xmm6, %%xmm15\n"
12789 "addps %%xmm7, %%xmm14\n"
12790 "subps %%xmm7, %%xmm15\n"
12791 "movaps %%xmm8, %%xmm0\n"
12792 "movaps %%xmm8, %%xmm2\n"
12793 "addps %%xmm10, %%xmm0\n"
12794 "subps %%xmm10, %%xmm2\n"
12795 "movaps %%xmm9, %%xmm1\n"
12796 "movaps %%xmm9, %%xmm3\n"
12797 "addps %%xmm11, %%xmm1\n"
12798 "subps %%xmm11, %%xmm3\n"
12799 "movaps %%xmm12, %%xmm4\n"
12800 "movaps %%xmm12, %%xmm6\n"
12801 "addps %%xmm14, %%xmm4\n"
12802 "subps %%xmm14, %%xmm6\n"
12803 "movaps %%xmm13, %%xmm5\n"
12804 "movaps %%xmm13, %%xmm7\n"
12805 "addps %%xmm15, %%xmm5\n"
12806 "subps %%xmm15, %%xmm7\n"
12807 "movaps %%xmm0, %%xmm8\n"
12808 "movaps %%xmm0, %%xmm12\n"
12809 "addps %%xmm4, %%xmm8\n"
12810 "subps %%xmm4, %%xmm12\n"
12811 "movaps %%xmm1, %%xmm9\n"
12812 "movaps %%xmm1, %%xmm13\n"
12813 "addps %%xmm5, %%xmm9\n"
12814 "subps %%xmm5, %%xmm13\n"
12815 "movaps %%xmm2, %%xmm10\n"
12816 "movaps %%xmm2, %%xmm14\n"
12817 "addps %%xmm6, %%xmm10\n"
12818 "subps %%xmm6, %%xmm14\n"
12819 "movaps %%xmm3, %%xmm11\n"
12820 "movaps %%xmm3, %%xmm15\n"
12821 "addps %%xmm7, %%xmm11\n"
12822 "subps %%xmm7, %%xmm15\n"
12823 "movups %%xmm8, (%0)\n"
12824 "movups %%xmm9, (%1)\n"
12825 "movups %%xmm10, (%2)\n"
12826 "movups %%xmm11, (%3)\n"
12827 "movups %%xmm12, (%4)\n"
12828 "movups %%xmm13, (%5)\n"
12829 "movups %%xmm14, (%6)\n"
12830 "movups %%xmm15, (%7)\n"
12831 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12832 );
12833 }
12834 }
12835 return;
12836 }
12837 if (depth == 27) {
12838 helper_float_29_recursive(buf + 0, 24);
12839 helper_float_29_recursive(buf + 16777216, 24);
12840 helper_float_29_recursive(buf + 33554432, 24);
12841 helper_float_29_recursive(buf + 50331648, 24);
12842 helper_float_29_recursive(buf + 67108864, 24);
12843 helper_float_29_recursive(buf + 83886080, 24);
12844 helper_float_29_recursive(buf + 100663296, 24);
12845 helper_float_29_recursive(buf + 117440512, 24);
12846 for (int j = 0; j < 134217728; j += 134217728) {
12847 for (int k = 0; k < 16777216; k += 4) {
12848 __asm__ volatile (
12849 "movups (%0), %%xmm0\n"
12850 "movups (%1), %%xmm1\n"
12851 "movups (%2), %%xmm2\n"
12852 "movups (%3), %%xmm3\n"
12853 "movups (%4), %%xmm4\n"
12854 "movups (%5), %%xmm5\n"
12855 "movups (%6), %%xmm6\n"
12856 "movups (%7), %%xmm7\n"
12857 "movaps %%xmm0, %%xmm8\n"
12858 "movaps %%xmm0, %%xmm9\n"
12859 "addps %%xmm1, %%xmm8\n"
12860 "subps %%xmm1, %%xmm9\n"
12861 "movaps %%xmm2, %%xmm10\n"
12862 "movaps %%xmm2, %%xmm11\n"
12863 "addps %%xmm3, %%xmm10\n"
12864 "subps %%xmm3, %%xmm11\n"
12865 "movaps %%xmm4, %%xmm12\n"
12866 "movaps %%xmm4, %%xmm13\n"
12867 "addps %%xmm5, %%xmm12\n"
12868 "subps %%xmm5, %%xmm13\n"
12869 "movaps %%xmm6, %%xmm14\n"
12870 "movaps %%xmm6, %%xmm15\n"
12871 "addps %%xmm7, %%xmm14\n"
12872 "subps %%xmm7, %%xmm15\n"
12873 "movaps %%xmm8, %%xmm0\n"
12874 "movaps %%xmm8, %%xmm2\n"
12875 "addps %%xmm10, %%xmm0\n"
12876 "subps %%xmm10, %%xmm2\n"
12877 "movaps %%xmm9, %%xmm1\n"
12878 "movaps %%xmm9, %%xmm3\n"
12879 "addps %%xmm11, %%xmm1\n"
12880 "subps %%xmm11, %%xmm3\n"
12881 "movaps %%xmm12, %%xmm4\n"
12882 "movaps %%xmm12, %%xmm6\n"
12883 "addps %%xmm14, %%xmm4\n"
12884 "subps %%xmm14, %%xmm6\n"
12885 "movaps %%xmm13, %%xmm5\n"
12886 "movaps %%xmm13, %%xmm7\n"
12887 "addps %%xmm15, %%xmm5\n"
12888 "subps %%xmm15, %%xmm7\n"
12889 "movaps %%xmm0, %%xmm8\n"
12890 "movaps %%xmm0, %%xmm12\n"
12891 "addps %%xmm4, %%xmm8\n"
12892 "subps %%xmm4, %%xmm12\n"
12893 "movaps %%xmm1, %%xmm9\n"
12894 "movaps %%xmm1, %%xmm13\n"
12895 "addps %%xmm5, %%xmm9\n"
12896 "subps %%xmm5, %%xmm13\n"
12897 "movaps %%xmm2, %%xmm10\n"
12898 "movaps %%xmm2, %%xmm14\n"
12899 "addps %%xmm6, %%xmm10\n"
12900 "subps %%xmm6, %%xmm14\n"
12901 "movaps %%xmm3, %%xmm11\n"
12902 "movaps %%xmm3, %%xmm15\n"
12903 "addps %%xmm7, %%xmm11\n"
12904 "subps %%xmm7, %%xmm15\n"
12905 "movups %%xmm8, (%0)\n"
12906 "movups %%xmm9, (%1)\n"
12907 "movups %%xmm10, (%2)\n"
12908 "movups %%xmm11, (%3)\n"
12909 "movups %%xmm12, (%4)\n"
12910 "movups %%xmm13, (%5)\n"
12911 "movups %%xmm14, (%6)\n"
12912 "movups %%xmm15, (%7)\n"
12913 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12914 );
12915 }
12916 }
12917 return;
12918 }
12919 if (depth == 29) {
12920 helper_float_29_recursive(buf + 0, 27);
12921 helper_float_29_recursive(buf + 134217728, 27);
12922 helper_float_29_recursive(buf + 268435456, 27);
12923 helper_float_29_recursive(buf + 402653184, 27);
12924 for (int j = 0; j < 536870912; j += 536870912) {
12925 for (int k = 0; k < 134217728; k += 4) {
12926 __asm__ volatile (
12927 "movups (%0), %%xmm0\n"
12928 "movups (%1), %%xmm1\n"
12929 "movups (%2), %%xmm2\n"
12930 "movups (%3), %%xmm3\n"
12931 "movaps %%xmm0, %%xmm8\n"
12932 "movaps %%xmm0, %%xmm9\n"
12933 "addps %%xmm1, %%xmm8\n"
12934 "subps %%xmm1, %%xmm9\n"
12935 "movaps %%xmm2, %%xmm10\n"
12936 "movaps %%xmm2, %%xmm11\n"
12937 "addps %%xmm3, %%xmm10\n"
12938 "subps %%xmm3, %%xmm11\n"
12939 "movaps %%xmm8, %%xmm0\n"
12940 "movaps %%xmm8, %%xmm2\n"
12941 "addps %%xmm10, %%xmm0\n"
12942 "subps %%xmm10, %%xmm2\n"
12943 "movaps %%xmm9, %%xmm1\n"
12944 "movaps %%xmm9, %%xmm3\n"
12945 "addps %%xmm11, %%xmm1\n"
12946 "subps %%xmm11, %%xmm3\n"
12947 "movups %%xmm0, (%0)\n"
12948 "movups %%xmm1, (%1)\n"
12949 "movups %%xmm2, (%2)\n"
12950 "movups %%xmm3, (%3)\n"
12951 :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
12952 );
12953 }
12954 }
12955 return;
12956 }
12957 }
12958 void helper_float_29(float *buf);
/* Public entry point: in-place (unnormalized) Hadamard pass over a buffer of
 * 2^29 floats, delegating to the depth-dispatching recursive worker. */
void helper_float_29(float *buf) {
    const int log_n = 29; /* log2 of the transform length for this entry */
    helper_float_29_recursive(buf, log_n);
}
12962 void helper_float_30_recursive(float *buf, int depth);
/*
 * SSE kernels for helper_float_30_recursive.
 *
 * The generated original repeated three inline-asm kernels verbatim many
 * times over (the radix-8 butterfly alone appeared seven times).  They are
 * factored out here with the asm strings preserved exactly.  All kernels
 * require x86-64 with SSE3 (addsubps) and work in place on possibly
 * unaligned data (movups).
 */

/* 4-wide radix-2 butterfly:
 * for i in [0,4): {p0[i], p1[i]} <- {p0[i]+p1[i], p0[i]-p1[i]}. */
static void fht_sse_bfly2(float *p0, float *p1) {
    __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        :: "r"(p0), "r"(p1)
        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
}

/* Three stacked 4-wide butterfly levels across eight vectors: loads one
 * 4-float vector from each of p0..p7, applies the radix-2 butterfly between
 * (p0,p1),(p2,p3),... then (p0,p2),(p1,p3),... then (p0,p4),(p1,p5),...,
 * and stores the results back.  Equivalent to three Hadamard levels with
 * strides s, 2s, 4s when the pointers are spaced s floats apart. */
static void fht_sse_bfly8(float *p0, float *p1, float *p2, float *p3,
                          float *p4, float *p5, float *p6, float *p7) {
    __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(p0), "r"(p1), "r"(p2), "r"(p3), "r"(p4), "r"(p5), "r"(p6), "r"(p7)
        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
}

/* First five Hadamard levels (strides 1, 2, 4, 8, 16) on 32 consecutive
 * floats at p.  Per 4-float vector, the shufps($160/$245)+addsubps sequence
 * yields [a0+a1, a0-a1, a2+a3, a2-a3] (stride 1) and the
 * shufps($68/$14/$224) sequence yields [x0+x2, x1+x3, x0-x2, x1-x3]
 * (stride 2); the tail is the cross-vector radix-8 combine for strides
 * 4, 8, 16. */
static void fht_sse_base32(float *p) {
    __asm__ volatile (
        "movups (%0), %%xmm0\n"
        "movups (%1), %%xmm1\n"
        "movups (%2), %%xmm2\n"
        "movups (%3), %%xmm3\n"
        "movups (%4), %%xmm4\n"
        "movups (%5), %%xmm5\n"
        "movups (%6), %%xmm6\n"
        "movups (%7), %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm0, %%xmm0\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm0, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm1, %%xmm1\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm1, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm2, %%xmm2\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm2, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm3, %%xmm3\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm3, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm4, %%xmm4\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm4, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm5, %%xmm5\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm5, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm6, %%xmm6\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm6, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $160, %%xmm8, %%xmm8\n"
        "shufps $245, %%xmm7, %%xmm7\n"
        "xorps %%xmm9, %%xmm9\n"
        "subps %%xmm7, %%xmm9\n"
        "addsubps %%xmm9, %%xmm8\n"
        "movaps %%xmm8, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm0, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm0, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm0\n"
        "movaps %%xmm1, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm1, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm1, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm1\n"
        "movaps %%xmm2, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm2\n"
        "movaps %%xmm3, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm3, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm3, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm3\n"
        "movaps %%xmm4, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm4, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm4, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm4\n"
        "movaps %%xmm5, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm5, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm5, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm5\n"
        "movaps %%xmm6, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm6, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm6, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm6\n"
        "movaps %%xmm7, %%xmm8\n"
        "shufps $68, %%xmm8, %%xmm8\n"
        "xorps %%xmm9, %%xmm9\n"
        "movaps %%xmm7, %%xmm10\n"
        "shufps $14, %%xmm9, %%xmm10\n"
        "movaps %%xmm7, %%xmm11\n"
        "shufps $224, %%xmm11, %%xmm9\n"
        "addps %%xmm8, %%xmm10\n"
        "subps %%xmm9, %%xmm10\n"
        "movaps %%xmm10, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm9\n"
        "addps %%xmm1, %%xmm8\n"
        "subps %%xmm1, %%xmm9\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm11\n"
        "addps %%xmm3, %%xmm10\n"
        "subps %%xmm3, %%xmm11\n"
        "movaps %%xmm4, %%xmm12\n"
        "movaps %%xmm4, %%xmm13\n"
        "addps %%xmm5, %%xmm12\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm6, %%xmm14\n"
        "movaps %%xmm6, %%xmm15\n"
        "addps %%xmm7, %%xmm14\n"
        "subps %%xmm7, %%xmm15\n"
        "movaps %%xmm8, %%xmm0\n"
        "movaps %%xmm8, %%xmm2\n"
        "addps %%xmm10, %%xmm0\n"
        "subps %%xmm10, %%xmm2\n"
        "movaps %%xmm9, %%xmm1\n"
        "movaps %%xmm9, %%xmm3\n"
        "addps %%xmm11, %%xmm1\n"
        "subps %%xmm11, %%xmm3\n"
        "movaps %%xmm12, %%xmm4\n"
        "movaps %%xmm12, %%xmm6\n"
        "addps %%xmm14, %%xmm4\n"
        "subps %%xmm14, %%xmm6\n"
        "movaps %%xmm13, %%xmm5\n"
        "movaps %%xmm13, %%xmm7\n"
        "addps %%xmm15, %%xmm5\n"
        "subps %%xmm15, %%xmm7\n"
        "movaps %%xmm0, %%xmm8\n"
        "movaps %%xmm0, %%xmm12\n"
        "addps %%xmm4, %%xmm8\n"
        "subps %%xmm4, %%xmm12\n"
        "movaps %%xmm1, %%xmm9\n"
        "movaps %%xmm1, %%xmm13\n"
        "addps %%xmm5, %%xmm9\n"
        "subps %%xmm5, %%xmm13\n"
        "movaps %%xmm2, %%xmm10\n"
        "movaps %%xmm2, %%xmm14\n"
        "addps %%xmm6, %%xmm10\n"
        "subps %%xmm6, %%xmm14\n"
        "movaps %%xmm3, %%xmm11\n"
        "movaps %%xmm3, %%xmm15\n"
        "addps %%xmm7, %%xmm11\n"
        "subps %%xmm7, %%xmm15\n"
        "movups %%xmm8, (%0)\n"
        "movups %%xmm9, (%1)\n"
        "movups %%xmm10, (%2)\n"
        "movups %%xmm11, (%3)\n"
        "movups %%xmm12, (%4)\n"
        "movups %%xmm13, (%5)\n"
        "movups %%xmm14, (%6)\n"
        "movups %%xmm15, (%7)\n"
        :: "r"(p + 0), "r"(p + 4), "r"(p + 8), "r"(p + 12), "r"(p + 16), "r"(p + 20), "r"(p + 24), "r"(p + 28)
        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
}

/*
 * In-place, unnormalized Walsh-Hadamard transform over 2^depth floats.
 *
 * Supported depths are exactly those the original generated dispatch chain
 * handled: 12 (the fully unrolled base case) and 15, 18, 21, 24, 27, 30
 * (eight recursive sub-transforms of size 2^(depth-3) followed by one
 * radix-8 combining pass).  Any other depth is a no-op, mirroring the
 * original if-chain falling through without a match.
 *
 * buf must hold at least 2^depth floats; unaligned pointers are fine
 * (the kernels use movups throughout).
 */
void helper_float_30_recursive(float *buf, int depth) {
    if (depth == 12) {
        /* Levels 1-5 (strides 1..16) inside each 32-float chunk. */
        for (int j = 0; j < 4096; j += 32) {
            fht_sse_base32(buf + j);
        }
        /* Levels 6-8 (s = 32) and levels 9-11 (s = 256): radix-8 passes
         * over eight sub-blocks spaced s floats apart. */
        for (int s = 32; s <= 256; s *= 8) {
            for (int j = 0; j < 4096; j += 8 * s) {
                for (int k = 0; k < s; k += 4) {
                    float *p = buf + j + k;
                    fht_sse_bfly8(p, p + s, p + 2 * s, p + 3 * s,
                                  p + 4 * s, p + 5 * s, p + 6 * s, p + 7 * s);
                }
            }
        }
        /* Level 12: final radix-2 pass with stride 2048. */
        for (int k = 0; k < 2048; k += 4) {
            fht_sse_bfly2(buf + k, buf + k + 2048);
        }
        return;
    }
    /* Depths 15, 18, ..., 30: transform the eight halves-of-halves, then
     * combine them with three more butterfly levels in one radix-8 pass. */
    if (depth > 12 && depth <= 30 && depth % 3 == 0) {
        const int sub = 1 << (depth - 3); /* size of each sub-transform */
        for (int i = 0; i < 8; ++i) {
            helper_float_30_recursive(buf + i * sub, depth - 3);
        }
        for (int k = 0; k < sub; k += 4) {
            float *p = buf + k;
            fht_sse_bfly8(p, p + sub, p + 2 * sub, p + 3 * sub,
                          p + 4 * sub, p + 5 * sub, p + 6 * sub, p + 7 * sub);
        }
        return;
    }
    /* Unsupported depth: intentionally do nothing (original behavior). */
}
13824 void helper_float_30(float *buf);
/* In-place FHT of 2^30 floats: run the recursive driver at full depth 30. */
void helper_float_30(float *buf) {
  helper_float_30_recursive(buf, 30);
}
/* Public entry point: in-place Fast Hadamard Transform of buf, which must
 * hold 2^log_n floats.  Dispatches to the size-specialized kernel generated
 * for each supported power of two.
 *
 * Returns 0 on success, 1 if log_n is outside the supported range [0, 30]
 * (including negative values).  A single element (log_n == 0) is its own
 * transform, so nothing is done. */
int fht_float(float *buf, int log_n) {
  switch (log_n) {
  case 0:
    return 0;
  case 1:
    helper_float_1(buf);
    return 0;
  case 2:
    helper_float_2(buf);
    return 0;
  case 3:
    helper_float_3(buf);
    return 0;
  case 4:
    helper_float_4(buf);
    return 0;
  case 5:
    helper_float_5(buf);
    return 0;
  case 6:
    helper_float_6(buf);
    return 0;
  case 7:
    helper_float_7(buf);
    return 0;
  case 8:
    helper_float_8(buf);
    return 0;
  case 9:
    helper_float_9(buf);
    return 0;
  case 10:
    helper_float_10(buf);
    return 0;
  case 11:
    helper_float_11(buf);
    return 0;
  case 12:
    helper_float_12(buf);
    return 0;
  case 13:
    helper_float_13(buf);
    return 0;
  case 14:
    helper_float_14(buf);
    return 0;
  case 15:
    helper_float_15(buf);
    return 0;
  case 16:
    helper_float_16(buf);
    return 0;
  case 17:
    helper_float_17(buf);
    return 0;
  case 18:
    helper_float_18(buf);
    return 0;
  case 19:
    helper_float_19(buf);
    return 0;
  case 20:
    helper_float_20(buf);
    return 0;
  case 21:
    helper_float_21(buf);
    return 0;
  case 22:
    helper_float_22(buf);
    return 0;
  case 23:
    helper_float_23(buf);
    return 0;
  case 24:
    helper_float_24(buf);
    return 0;
  case 25:
    helper_float_25(buf);
    return 0;
  case 26:
    helper_float_26(buf);
    return 0;
  case 27:
    helper_float_27(buf);
    return 0;
  case 28:
    helper_float_28(buf);
    return 0;
  case 29:
    helper_float_29(buf);
    return 0;
  case 30:
    helper_float_30(buf);
    return 0;
  default:
    /* Unsupported size. */
    return 1;
  }
}
13954 static inline void helper_double_1(double *buf);
/* In-place size-2 Hadamard butterfly on doubles:
 *   buf[0] <- buf[0] + buf[1],  buf[1] <- buf[0] - buf[1].
 * haddpd puts the pair-sum in both lanes of xmm8, hsubpd the pair-difference
 * in both lanes of xmm9; "blendpd $1" then merges the sum into lane 0 of
 * xmm9, leaving the difference in lane 1.  Requires SSE3 + SSE4.1. */
static inline void helper_double_1(double *buf) {
  /* Single iteration; the loop shape is kept for generator uniformity. */
  for (int j = 0; j < 2; j += 2) {
    __asm__ volatile (
      "movupd (%0), %%xmm0\n"
      "movapd %%xmm0, %%xmm8\n"
      "haddpd %%xmm8, %%xmm8\n"
      "movapd %%xmm0, %%xmm9\n"
      "hsubpd %%xmm9, %%xmm9\n"
      "blendpd $1, %%xmm8, %%xmm9\n"
      "movapd %%xmm9, %%xmm0\n"
      "movupd %%xmm0, (%0)\n"
      :: "r"(buf + j) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
  }
}
13970 void helper_double_2_recursive(double *buf, int depth);
/* Recursive driver for the in-place FHT of 2^depth doubles; only depth == 2
 * (4 elements) is generated here.  Falls through silently for any other
 * depth — callers are expected to pass depth == 2 only. */
void helper_double_2_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Both loops run exactly once; the shape mirrors larger kernels. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load two pairs: xmm0 = buf[0..1], xmm1 = buf[2..3]. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* Stage 1: horizontal butterfly within each register
           * (lane0 = a+b via haddpd, lane1 = a-b via hsubpd,
           * merged by blendpd $1). */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Stage 2: cross-register butterfly between the two pairs. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
14003 void helper_double_2(double *buf);
/* In-place FHT of 4 doubles: run the recursive driver at full depth 2. */
void helper_double_2(double *buf) {
  helper_double_2_recursive(buf, 2);
}
14007 void helper_double_3_recursive(double *buf, int depth);
/* Recursive driver for the in-place FHT of 2^depth doubles; only depth == 3
 * (8 elements) is generated here.  All 8 values live in xmm0..xmm3 (two
 * doubles each): stage 1 is done horizontally inside each register, stages
 * 2-3 as cross-register add/sub butterflies. */
void helper_double_3_recursive(double *buf, int depth) {
  if (depth == 3) {
    /* Both loops run exactly once; the shape mirrors larger kernels. */
    for (int j = 0; j < 8; j += 8) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load buf[0..7] into xmm0..xmm3. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* Stage 1: horizontal butterflies (lane0 = a+b, lane1 = a-b). */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          /* Stage 2: butterflies between adjacent registers (0,1) and (2,3). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* Stage 3: butterflies at register distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          /* Store the finished 8-point transform back. */
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
14068 void helper_double_3(double *buf);
/* In-place FHT of 8 doubles: run the recursive driver at full depth 3. */
void helper_double_3(double *buf) {
  helper_double_3_recursive(buf, 3);
}
14072 static inline void helper_double_4(double *buf);
/* In-place FHT of 16 doubles in a single asm block.  All 16 values occupy
 * xmm0..xmm7 (two doubles per register): stage 1 is a horizontal butterfly
 * inside each register, stages 2-4 are cross-register add/sub butterflies
 * at register distances 1, 2 and 4.  Requires SSE3 + SSE4.1. */
static inline void helper_double_4(double *buf) {
  /* Both loops run exactly once; the shape mirrors larger kernels. */
  for (int j = 0; j < 16; j += 16) {
    for (int k = 0; k < 2; k += 2) {
      __asm__ volatile (
        /* Load buf[0..15] into xmm0..xmm7. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Stage 1: horizontal butterflies (lane0 = a+b via haddpd,
         * lane1 = a-b via hsubpd, merged by blendpd $1). */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Stage 2: butterflies at register distance 1 -> xmm8..xmm15. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Stage 3: butterflies at register distance 2 -> back to xmm0..xmm7. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Stage 4: butterflies at register distance 4 -> xmm8..xmm15. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store the finished 16-point transform back. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
14194 void helper_double_5_recursive(double *buf, int depth);
/* Recursive driver for the in-place FHT of 2^depth doubles; handles
 * depth == 2 (4-point base case) and depth == 5 (32 points: eight 4-point
 * sub-transforms followed by a radix-8 cross-block combine).  Falls through
 * silently for other depths. */
void helper_double_5_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* 4-point base case; both loops run once. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* Load two pairs: xmm0 = buf[0..1], xmm1 = buf[2..3]. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* Stage 1: horizontal butterflies (lane0 = a+b, lane1 = a-b). */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Stage 2: cross-register butterfly between the two pairs. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* First transform each of the eight 4-element sub-blocks in place. */
    helper_double_5_recursive(buf + 0, 2);
    helper_double_5_recursive(buf + 4, 2);
    helper_double_5_recursive(buf + 8, 2);
    helper_double_5_recursive(buf + 12, 2);
    helper_double_5_recursive(buf + 16, 2);
    helper_double_5_recursive(buf + 20, 2);
    helper_double_5_recursive(buf + 24, 2);
    helper_double_5_recursive(buf + 28, 2);
    /* Then combine them: a radix-8 butterfly (3 stages in registers)
     * across the eight sub-blocks at stride 4, two doubles per pass. */
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          /* Load one pair from each sub-block into xmm0..xmm7. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Combine stage A: butterflies at register distance 1. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Combine stage B: butterflies at register distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Combine stage C: butterflies at register distance 4. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Store back to the same eight locations. */
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
14309 void helper_double_5(double *buf);
/* In-place FHT of 32 doubles: run the recursive driver at full depth 5. */
void helper_double_5(double *buf) {
  helper_double_5_recursive(buf, 5);
}
14313 static inline void helper_double_6(double *buf);
/* In-place FHT of 64 doubles, done iteratively in two passes:
 *   1. four independent 16-point transforms (stride-16 blocks), each fully
 *      inside xmm0..xmm7 — stage 1 horizontal, stages 2-4 cross-register;
 *   2. a radix-4 combine (the final two stages) across the four blocks at
 *      stride 16, two doubles per pass.  Requires SSE3 + SSE4.1. */
static inline void helper_double_6(double *buf) {
  /* Pass 1: 16-point transform of each of the four blocks. */
  for (int j = 0; j < 64; j += 16) {
    for (int k = 0; k < 2; k += 2) {
      __asm__ volatile (
        /* Load the block's 16 doubles into xmm0..xmm7. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Stage 1: horizontal butterflies (lane0 = a+b, lane1 = a-b). */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Stage 2: butterflies at register distance 1. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Stage 3: butterflies at register distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Stage 4: butterflies at register distance 4. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store the finished 16-point transform back. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* Pass 2: radix-4 combine of the four 16-point blocks (stride 16). */
  for (int j = 0; j < 64; j += 64) {
    for (int k = 0; k < 16; k += 2) {
      __asm__ volatile (
        /* Load one pair from each block into xmm0..xmm3. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        /* Butterflies at block distance 1, then distance 2. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        /* Store back to the same four locations. */
        "movupd %%xmm0, (%0)\n"
        "movupd %%xmm1, (%1)\n"
        "movupd %%xmm2, (%2)\n"
        "movupd %%xmm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
14466 static inline void helper_double_7(double *buf);
/* In-place FHT of 128 doubles, done iteratively in two passes:
 *   1. eight independent 16-point transforms (stride-16 blocks), each fully
 *      inside xmm0..xmm7 — stage 1 horizontal, stages 2-4 cross-register;
 *   2. a radix-8 combine (the final three stages) across the eight blocks
 *      at stride 16, two doubles per pass.  Requires SSE3 + SSE4.1. */
static inline void helper_double_7(double *buf) {
  /* Pass 1: 16-point transform of each of the eight blocks. */
  for (int j = 0; j < 128; j += 16) {
    for (int k = 0; k < 2; k += 2) {
      __asm__ volatile (
        /* Load the block's 16 doubles into xmm0..xmm7. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Stage 1: horizontal butterflies (lane0 = a+b, lane1 = a-b). */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Stage 2: butterflies at register distance 1. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Stage 3: butterflies at register distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Stage 4: butterflies at register distance 4. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store the finished 16-point transform back. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
  /* Pass 2: radix-8 combine of the eight 16-point blocks (stride 16). */
  for (int j = 0; j < 128; j += 128) {
    for (int k = 0; k < 16; k += 2) {
      __asm__ volatile (
        /* Load one pair from each block into xmm0..xmm7. */
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* Combine stage A: butterflies at block distance 1. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Combine stage B: butterflies at block distance 2. */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* Combine stage C: butterflies at block distance 4. */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        /* Store back to the same eight locations. */
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
      );
    }
  }
}
14659 void helper_double_8_recursive(double *buf, int depth);
helper_double_8_recursive(double * buf,int depth)14660 void helper_double_8_recursive(double *buf, int depth) {
14661 if (depth == 2) {
14662 for (int j = 0; j < 4; j += 4) {
14663 for (int k = 0; k < 2; k += 2) {
14664 __asm__ volatile (
14665 "movupd (%0), %%xmm0\n"
14666 "movupd (%1), %%xmm1\n"
14667 "movapd %%xmm0, %%xmm8\n"
14668 "haddpd %%xmm8, %%xmm8\n"
14669 "movapd %%xmm0, %%xmm9\n"
14670 "hsubpd %%xmm9, %%xmm9\n"
14671 "blendpd $1, %%xmm8, %%xmm9\n"
14672 "movapd %%xmm9, %%xmm0\n"
14673 "movapd %%xmm1, %%xmm8\n"
14674 "haddpd %%xmm8, %%xmm8\n"
14675 "movapd %%xmm1, %%xmm9\n"
14676 "hsubpd %%xmm9, %%xmm9\n"
14677 "blendpd $1, %%xmm8, %%xmm9\n"
14678 "movapd %%xmm9, %%xmm1\n"
14679 "movapd %%xmm0, %%xmm8\n"
14680 "movapd %%xmm0, %%xmm9\n"
14681 "addpd %%xmm1, %%xmm8\n"
14682 "subpd %%xmm1, %%xmm9\n"
14683 "movupd %%xmm8, (%0)\n"
14684 "movupd %%xmm9, (%1)\n"
14685 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
14686 );
14687 }
14688 }
14689 return;
14690 }
14691 if (depth == 5) {
14692 helper_double_8_recursive(buf + 0, 2);
14693 helper_double_8_recursive(buf + 4, 2);
14694 helper_double_8_recursive(buf + 8, 2);
14695 helper_double_8_recursive(buf + 12, 2);
14696 helper_double_8_recursive(buf + 16, 2);
14697 helper_double_8_recursive(buf + 20, 2);
14698 helper_double_8_recursive(buf + 24, 2);
14699 helper_double_8_recursive(buf + 28, 2);
14700 for (int j = 0; j < 32; j += 32) {
14701 for (int k = 0; k < 4; k += 2) {
14702 __asm__ volatile (
14703 "movupd (%0), %%xmm0\n"
14704 "movupd (%1), %%xmm1\n"
14705 "movupd (%2), %%xmm2\n"
14706 "movupd (%3), %%xmm3\n"
14707 "movupd (%4), %%xmm4\n"
14708 "movupd (%5), %%xmm5\n"
14709 "movupd (%6), %%xmm6\n"
14710 "movupd (%7), %%xmm7\n"
14711 "movapd %%xmm0, %%xmm8\n"
14712 "movapd %%xmm0, %%xmm9\n"
14713 "addpd %%xmm1, %%xmm8\n"
14714 "subpd %%xmm1, %%xmm9\n"
14715 "movapd %%xmm2, %%xmm10\n"
14716 "movapd %%xmm2, %%xmm11\n"
14717 "addpd %%xmm3, %%xmm10\n"
14718 "subpd %%xmm3, %%xmm11\n"
14719 "movapd %%xmm4, %%xmm12\n"
14720 "movapd %%xmm4, %%xmm13\n"
14721 "addpd %%xmm5, %%xmm12\n"
14722 "subpd %%xmm5, %%xmm13\n"
14723 "movapd %%xmm6, %%xmm14\n"
14724 "movapd %%xmm6, %%xmm15\n"
14725 "addpd %%xmm7, %%xmm14\n"
14726 "subpd %%xmm7, %%xmm15\n"
14727 "movapd %%xmm8, %%xmm0\n"
14728 "movapd %%xmm8, %%xmm2\n"
14729 "addpd %%xmm10, %%xmm0\n"
14730 "subpd %%xmm10, %%xmm2\n"
14731 "movapd %%xmm9, %%xmm1\n"
14732 "movapd %%xmm9, %%xmm3\n"
14733 "addpd %%xmm11, %%xmm1\n"
14734 "subpd %%xmm11, %%xmm3\n"
14735 "movapd %%xmm12, %%xmm4\n"
14736 "movapd %%xmm12, %%xmm6\n"
14737 "addpd %%xmm14, %%xmm4\n"
14738 "subpd %%xmm14, %%xmm6\n"
14739 "movapd %%xmm13, %%xmm5\n"
14740 "movapd %%xmm13, %%xmm7\n"
14741 "addpd %%xmm15, %%xmm5\n"
14742 "subpd %%xmm15, %%xmm7\n"
14743 "movapd %%xmm0, %%xmm8\n"
14744 "movapd %%xmm0, %%xmm12\n"
14745 "addpd %%xmm4, %%xmm8\n"
14746 "subpd %%xmm4, %%xmm12\n"
14747 "movapd %%xmm1, %%xmm9\n"
14748 "movapd %%xmm1, %%xmm13\n"
14749 "addpd %%xmm5, %%xmm9\n"
14750 "subpd %%xmm5, %%xmm13\n"
14751 "movapd %%xmm2, %%xmm10\n"
14752 "movapd %%xmm2, %%xmm14\n"
14753 "addpd %%xmm6, %%xmm10\n"
14754 "subpd %%xmm6, %%xmm14\n"
14755 "movapd %%xmm3, %%xmm11\n"
14756 "movapd %%xmm3, %%xmm15\n"
14757 "addpd %%xmm7, %%xmm11\n"
14758 "subpd %%xmm7, %%xmm15\n"
14759 "movupd %%xmm8, (%0)\n"
14760 "movupd %%xmm9, (%1)\n"
14761 "movupd %%xmm10, (%2)\n"
14762 "movupd %%xmm11, (%3)\n"
14763 "movupd %%xmm12, (%4)\n"
14764 "movupd %%xmm13, (%5)\n"
14765 "movupd %%xmm14, (%6)\n"
14766 "movupd %%xmm15, (%7)\n"
14767 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
14768 );
14769 }
14770 }
14771 return;
14772 }
14773 if (depth == 8) {
14774 helper_double_8_recursive(buf + 0, 5);
14775 helper_double_8_recursive(buf + 32, 5);
14776 helper_double_8_recursive(buf + 64, 5);
14777 helper_double_8_recursive(buf + 96, 5);
14778 helper_double_8_recursive(buf + 128, 5);
14779 helper_double_8_recursive(buf + 160, 5);
14780 helper_double_8_recursive(buf + 192, 5);
14781 helper_double_8_recursive(buf + 224, 5);
14782 for (int j = 0; j < 256; j += 256) {
14783 for (int k = 0; k < 32; k += 2) {
14784 __asm__ volatile (
14785 "movupd (%0), %%xmm0\n"
14786 "movupd (%1), %%xmm1\n"
14787 "movupd (%2), %%xmm2\n"
14788 "movupd (%3), %%xmm3\n"
14789 "movupd (%4), %%xmm4\n"
14790 "movupd (%5), %%xmm5\n"
14791 "movupd (%6), %%xmm6\n"
14792 "movupd (%7), %%xmm7\n"
14793 "movapd %%xmm0, %%xmm8\n"
14794 "movapd %%xmm0, %%xmm9\n"
14795 "addpd %%xmm1, %%xmm8\n"
14796 "subpd %%xmm1, %%xmm9\n"
14797 "movapd %%xmm2, %%xmm10\n"
14798 "movapd %%xmm2, %%xmm11\n"
14799 "addpd %%xmm3, %%xmm10\n"
14800 "subpd %%xmm3, %%xmm11\n"
14801 "movapd %%xmm4, %%xmm12\n"
14802 "movapd %%xmm4, %%xmm13\n"
14803 "addpd %%xmm5, %%xmm12\n"
14804 "subpd %%xmm5, %%xmm13\n"
14805 "movapd %%xmm6, %%xmm14\n"
14806 "movapd %%xmm6, %%xmm15\n"
14807 "addpd %%xmm7, %%xmm14\n"
14808 "subpd %%xmm7, %%xmm15\n"
14809 "movapd %%xmm8, %%xmm0\n"
14810 "movapd %%xmm8, %%xmm2\n"
14811 "addpd %%xmm10, %%xmm0\n"
14812 "subpd %%xmm10, %%xmm2\n"
14813 "movapd %%xmm9, %%xmm1\n"
14814 "movapd %%xmm9, %%xmm3\n"
14815 "addpd %%xmm11, %%xmm1\n"
14816 "subpd %%xmm11, %%xmm3\n"
14817 "movapd %%xmm12, %%xmm4\n"
14818 "movapd %%xmm12, %%xmm6\n"
14819 "addpd %%xmm14, %%xmm4\n"
14820 "subpd %%xmm14, %%xmm6\n"
14821 "movapd %%xmm13, %%xmm5\n"
14822 "movapd %%xmm13, %%xmm7\n"
14823 "addpd %%xmm15, %%xmm5\n"
14824 "subpd %%xmm15, %%xmm7\n"
14825 "movapd %%xmm0, %%xmm8\n"
14826 "movapd %%xmm0, %%xmm12\n"
14827 "addpd %%xmm4, %%xmm8\n"
14828 "subpd %%xmm4, %%xmm12\n"
14829 "movapd %%xmm1, %%xmm9\n"
14830 "movapd %%xmm1, %%xmm13\n"
14831 "addpd %%xmm5, %%xmm9\n"
14832 "subpd %%xmm5, %%xmm13\n"
14833 "movapd %%xmm2, %%xmm10\n"
14834 "movapd %%xmm2, %%xmm14\n"
14835 "addpd %%xmm6, %%xmm10\n"
14836 "subpd %%xmm6, %%xmm14\n"
14837 "movapd %%xmm3, %%xmm11\n"
14838 "movapd %%xmm3, %%xmm15\n"
14839 "addpd %%xmm7, %%xmm11\n"
14840 "subpd %%xmm7, %%xmm15\n"
14841 "movupd %%xmm8, (%0)\n"
14842 "movupd %%xmm9, (%1)\n"
14843 "movupd %%xmm10, (%2)\n"
14844 "movupd %%xmm11, (%3)\n"
14845 "movupd %%xmm12, (%4)\n"
14846 "movupd %%xmm13, (%5)\n"
14847 "movupd %%xmm14, (%6)\n"
14848 "movupd %%xmm15, (%7)\n"
14849 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
14850 );
14851 }
14852 }
14853 return;
14854 }
14855 }
void helper_double_8(double *buf);
/* In-place, unnormalized Walsh-Hadamard transform of 256 (= 2^8) doubles.
 * Thin public entry point over the recursive kernel; no scaling is applied. */
void helper_double_8(double *buf) {
  helper_double_8_recursive(buf, 8);
}
/* Forward declaration (kept so the definition below can recurse into itself). */
void helper_double_9_recursive(double *buf, int depth);
/*
 * One stage of an in-place, unnormalized Walsh-Hadamard transform over
 * doubles, implemented with SSE2/SSE3 inline assembly
 * (movupd / haddpd / hsubpd / blendpd / addpd / subpd).
 *
 * buf   - (sub)array to transform in place; all loads/stores use movupd,
 *         so no 16-byte alignment is required.
 * depth - log2 of the transform length handled by this call. Only the
 *         stage sizes emitted for this kernel are handled:
 *           depth == 6: transforms 64 doubles directly (16-point blocks,
 *                       then a radix-4 merge at stride 16);
 *           depth == 9: eight recursive depth-6 calls on 64-double blocks,
 *                       then a radix-8 merge at stride 64 (8 * 64 = 512).
 *         Any other depth value falls through both ifs and is a silent
 *         no-op -- callers are expected to pass only 6 or 9.
 *
 * No normalization (e.g. 1/sqrt(N)) is applied anywhere in this file's
 * helpers. Each asm block lists all xmm registers plus "memory" as
 * clobbers, so the compiler cannot cache buf contents across it.
 */
void helper_double_9_recursive(double *buf, int depth) {
  if (depth == 6) {
    /* Stage 1: one full 16-point transform per 16-double chunk.
     * First the 2-point butterflies inside each 128-bit register:
     * haddpd produces the pair sum in both lanes, hsubpd the pair
     * difference, and blendpd $1 keeps the sum in the low lane and the
     * difference in the high lane. Then a 3-layer radix-8 butterfly
     * network across the eight registers (operands at stride 2).
     * The inner k loop executes exactly once; it is kept for the
     * generator's regular structure. */
    for (int j = 0; j < 64; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* In-register 2-point butterflies: xmmR := [a+b, a-b]. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Radix-8 butterfly layer 1: pair (0,1)(2,3)(4,5)(6,7). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Layer 2: combine at distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Layer 3: combine at distance 4, results land in xmm8..15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Stage 2: radix-4 merge of the four 16-point blocks (operands at
     * stride 16), completing the 64-point transform. */
    for (int j = 0; j < 64; j += 64) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 9) {
    /* Transform the eight 64-double sub-blocks, then merge them with a
     * single radix-8 butterfly pass at stride 64. */
    helper_double_9_recursive(buf + 0, 6);
    helper_double_9_recursive(buf + 64, 6);
    helper_double_9_recursive(buf + 128, 6);
    helper_double_9_recursive(buf + 192, 6);
    helper_double_9_recursive(buf + 256, 6);
    helper_double_9_recursive(buf + 320, 6);
    helper_double_9_recursive(buf + 384, 6);
    helper_double_9_recursive(buf + 448, 6);
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 64; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Radix-8 butterfly network, three layers as in stage 1 above. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_9(double *buf);
/* In-place, unnormalized Walsh-Hadamard transform of 512 (= 2^9) doubles.
 * Thin public entry point over the recursive kernel; no scaling is applied. */
void helper_double_9(double *buf) {
  helper_double_9_recursive(buf, 9);
}
/* Forward declaration (name kept for symmetry with the other kernels;
 * this variant is fully unrolled and does not actually recurse). */
void helper_double_10_recursive(double *buf, int depth);
/*
 * In-place, unnormalized Walsh-Hadamard transform of 1024 (= 2^10)
 * doubles using SSE2/SSE3 inline assembly. Despite the name, the only
 * depth handled is 10, performed as three iterative passes:
 *   stage 1: 16-point transforms per chunk (in-register 2-point
 *            butterflies via haddpd/hsubpd/blendpd + radix-8 at stride 2);
 *   stage 2: radix-8 merge at stride 16 (builds 128-point blocks);
 *   stage 3: radix-8 merge at stride 128 (16 * 8 * 8 = 1024).
 * Any depth other than 10 is a silent no-op.
 *
 * buf need not be 16-byte aligned (all loads/stores are movupd).
 * No normalization is applied. Every asm block clobbers all xmm
 * registers and "memory".
 */
void helper_double_10_recursive(double *buf, int depth) {
  if (depth == 10) {
    /* Stage 1: full 16-point transform of each 16-double chunk.
     * The inner k loop executes once; kept for generator regularity. */
    for (int j = 0; j < 1024; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* In-register 2-point butterflies: xmmR := [a+b, a-b]. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Radix-8 butterfly network across the eight registers. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Stage 2: radix-8 merge at stride 16 inside each 128-double block. */
    for (int j = 0; j < 1024; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Stage 3: final radix-8 merge at stride 128 across the whole 1024. */
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_10(double *buf);
/* In-place, unnormalized Walsh-Hadamard transform of 1024 (= 2^10) doubles.
 * Thin public entry point over the recursive kernel; no scaling is applied. */
void helper_double_10(double *buf) {
  helper_double_10_recursive(buf, 10);
}
/* Forward declaration (the function recurses into itself). */
void helper_double_11_recursive(double *buf, int depth);
/*
 * One stage of an in-place, unnormalized Walsh-Hadamard transform over
 * doubles (SSE2/SSE3 inline assembly). Recursion ladder 2 -> 5 -> 8 -> 11:
 *   depth == 2:  4-point transform (in-register 2-point butterflies via
 *                haddpd/hsubpd/blendpd, then a radix-2 across stride 2);
 *   depth == 5:  eight depth-2 blocks of 4, radix-8 merge at stride 4  (32);
 *   depth == 8:  eight depth-5 blocks of 32, radix-8 merge at stride 32 (256);
 *   depth == 11: eight depth-8 blocks of 256, radix-8 merge at stride 256
 *                (2048 doubles total).
 * Any other depth falls through every if and is a silent no-op.
 *
 * buf need not be 16-byte aligned (movupd loads/stores). No normalization
 * is applied. Each asm block clobbers all xmm registers and "memory".
 */
void helper_double_11_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* 4-point base case; both loops execute their body exactly once. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* 2-point butterflies inside each register: [a+b, a-b]. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Radix-2 across the two registers completes the 4-point FHT. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* Eight 4-point sub-transforms, then radix-8 merge at stride 4. */
    helper_double_11_recursive(buf + 0, 2);
    helper_double_11_recursive(buf + 4, 2);
    helper_double_11_recursive(buf + 8, 2);
    helper_double_11_recursive(buf + 12, 2);
    helper_double_11_recursive(buf + 16, 2);
    helper_double_11_recursive(buf + 20, 2);
    helper_double_11_recursive(buf + 24, 2);
    helper_double_11_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Three-layer radix-8 butterfly network; results in xmm8..15. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* Eight 32-point sub-transforms, then radix-8 merge at stride 32. */
    helper_double_11_recursive(buf + 0, 5);
    helper_double_11_recursive(buf + 32, 5);
    helper_double_11_recursive(buf + 64, 5);
    helper_double_11_recursive(buf + 96, 5);
    helper_double_11_recursive(buf + 128, 5);
    helper_double_11_recursive(buf + 160, 5);
    helper_double_11_recursive(buf + 192, 5);
    helper_double_11_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    /* Eight 256-point sub-transforms, then radix-8 merge at stride 256. */
    helper_double_11_recursive(buf + 0, 8);
    helper_double_11_recursive(buf + 256, 8);
    helper_double_11_recursive(buf + 512, 8);
    helper_double_11_recursive(buf + 768, 8);
    helper_double_11_recursive(buf + 1024, 8);
    helper_double_11_recursive(buf + 1280, 8);
    helper_double_11_recursive(buf + 1536, 8);
    helper_double_11_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
/*
 * Public entry point for the depth-11 double transform (presumably a
 * 2^11 = 2048-element buffer -- confirm against fht.h). Thin dispatcher
 * into the recursive worker at top depth.
 */
void helper_double_11(double *buf);
void helper_double_11(double *buf) {
    enum { TOP_DEPTH = 11 };
    helper_double_11_recursive(buf, TOP_DEPTH);
}
/*
 * Recursive stage of the 4096-point (2^12) in-place double transform.
 * The butterfly pattern -- paired a+b / a-b with no twiddle factors --
 * matches an unnormalized fast Walsh-Hadamard transform, consistent with
 * the fht.h naming; NOTE(review): confirm against fht.h.
 *
 * Only two depths are dispatched by the visible callers:
 *   depth == 12: recursively transform the four 1024-double quarters,
 *                then merge them with one radix-4 butterfly pass.
 *   depth == 10: fully transform 1024 doubles using three SIMD passes
 *                (16-point kernel, then stride-16 and stride-128
 *                radix-8 combines).
 * Any other depth falls through and is a no-op.
 *
 * buf: 2^depth doubles, modified in place; no alignment required
 * (all memory accesses use movupd).
 */
void helper_double_12_recursive(double *buf, int depth);
void helper_double_12_recursive(double *buf, int depth) {
    if (depth == 10) {
        /* Pass 1: each 16-double group. Per register, haddpd/hsubpd/blendpd
         * turns {a, b} into {a+b, a-b} (size-2 butterfly); the following
         * add/sub ladder is a radix-8 combine across the eight registers,
         * completing a 16-point transform per group. */
        for (int j = 0; j < 1024; j += 16) {
            for (int k = 0; k < 2; k += 2) {
                __asm__ volatile (
                    /* Load 16 consecutive doubles into xmm0..xmm7. */
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    /* In-register size-2 butterflies: {a,b} -> {a+b, a-b}. */
                    "movapd %%xmm0, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm0\n"
                    "movapd %%xmm1, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm2, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm2, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm2\n"
                    "movapd %%xmm3, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm3, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "movapd %%xmm4, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm4, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm4\n"
                    "movapd %%xmm5, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm5, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm5\n"
                    "movapd %%xmm6, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm6, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm6\n"
                    "movapd %%xmm7, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm7, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm7\n"
                    /* Radix-8 combine: three levels of add/sub butterflies
                     * across the eight registers. */
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    /* Store results back in place. */
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 2: radix-8 combine at stride 16 -- merges eight 16-point
         * blocks into each 128-double block. */
        for (int j = 0; j < 1024; j += 128) {
            for (int k = 0; k < 16; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 3: radix-8 combine at stride 128 -- merges eight 128-double
         * blocks into the full 1024-double result. */
        for (int j = 0; j < 1024; j += 1024) {
            for (int k = 0; k < 128; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 12) {
        /* Transform the four 1024-double quarters, then merge them with a
         * radix-4 butterfly pass at stride 1024. */
        helper_double_12_recursive(buf + 0, 10);
        helper_double_12_recursive(buf + 1024, 10);
        helper_double_12_recursive(buf + 2048, 10);
        helper_double_12_recursive(buf + 3072, 10);
        for (int j = 0; j < 4096; j += 4096) {
            for (int k = 0; k < 1024; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movupd %%xmm0, (%0)\n"
                    "movupd %%xmm1, (%1)\n"
                    "movupd %%xmm2, (%2)\n"
                    "movupd %%xmm3, (%3)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
}
/*
 * Public entry point for the depth-12 double transform (presumably a
 * 2^12 = 4096-element buffer -- confirm against fht.h). Thin dispatcher
 * into the recursive worker at top depth.
 */
void helper_double_12(double *buf);
void helper_double_12(double *buf) {
    enum { TOP_DEPTH = 12 };
    helper_double_12_recursive(buf, TOP_DEPTH);
}
/*
 * Fully unrolled 8192-point (2^13) in-place transform over doubles.
 * The butterfly pattern (paired a+b / a-b, no twiddle factors) matches
 * an unnormalized fast Walsh-Hadamard transform, consistent with the
 * fht.h naming -- NOTE(review): confirm against fht.h.
 *
 * Iterative (non-recursive) formulation in four SIMD passes:
 *   1. 16-point kernels on each consecutive 16-double group
 *      (in-register size-2 butterflies + radix-8 combine);
 *   2. radix-8 combine at stride 16  (16  -> 128-double blocks);
 *   3. radix-8 combine at stride 128 (128 -> 1024-double blocks);
 *   4. radix-8 combine at stride 1024 (1024 -> full 8192 result).
 *
 * buf must hold 8192 doubles; no alignment required (movupd throughout).
 */
static inline void helper_double_13(double *buf);
static inline void helper_double_13(double *buf) {
    /* Pass 1: per-16-double-group transform. haddpd/hsubpd/blendpd builds
     * {a+b, a-b} from {a, b} inside each register; the add/sub ladder is
     * the radix-8 combine across the eight registers. */
    for (int j = 0; j < 8192; j += 16) {
        for (int k = 0; k < 2; k += 2) {
            __asm__ volatile (
                "movupd (%0), %%xmm0\n"
                "movupd (%1), %%xmm1\n"
                "movupd (%2), %%xmm2\n"
                "movupd (%3), %%xmm3\n"
                "movupd (%4), %%xmm4\n"
                "movupd (%5), %%xmm5\n"
                "movupd (%6), %%xmm6\n"
                "movupd (%7), %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm0, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm0\n"
                "movapd %%xmm1, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm1, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm1\n"
                "movapd %%xmm2, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm2, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm2\n"
                "movapd %%xmm3, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm3, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm3\n"
                "movapd %%xmm4, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm4, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm4\n"
                "movapd %%xmm5, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm5, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm5\n"
                "movapd %%xmm6, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm6, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm6\n"
                "movapd %%xmm7, %%xmm8\n"
                "haddpd %%xmm8, %%xmm8\n"
                "movapd %%xmm7, %%xmm9\n"
                "hsubpd %%xmm9, %%xmm9\n"
                "blendpd $1, %%xmm8, %%xmm9\n"
                "movapd %%xmm9, %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm9\n"
                "addpd %%xmm1, %%xmm8\n"
                "subpd %%xmm1, %%xmm9\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm11\n"
                "addpd %%xmm3, %%xmm10\n"
                "subpd %%xmm3, %%xmm11\n"
                "movapd %%xmm4, %%xmm12\n"
                "movapd %%xmm4, %%xmm13\n"
                "addpd %%xmm5, %%xmm12\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm6, %%xmm14\n"
                "movapd %%xmm6, %%xmm15\n"
                "addpd %%xmm7, %%xmm14\n"
                "subpd %%xmm7, %%xmm15\n"
                "movapd %%xmm8, %%xmm0\n"
                "movapd %%xmm8, %%xmm2\n"
                "addpd %%xmm10, %%xmm0\n"
                "subpd %%xmm10, %%xmm2\n"
                "movapd %%xmm9, %%xmm1\n"
                "movapd %%xmm9, %%xmm3\n"
                "addpd %%xmm11, %%xmm1\n"
                "subpd %%xmm11, %%xmm3\n"
                "movapd %%xmm12, %%xmm4\n"
                "movapd %%xmm12, %%xmm6\n"
                "addpd %%xmm14, %%xmm4\n"
                "subpd %%xmm14, %%xmm6\n"
                "movapd %%xmm13, %%xmm5\n"
                "movapd %%xmm13, %%xmm7\n"
                "addpd %%xmm15, %%xmm5\n"
                "subpd %%xmm15, %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm12\n"
                "addpd %%xmm4, %%xmm8\n"
                "subpd %%xmm4, %%xmm12\n"
                "movapd %%xmm1, %%xmm9\n"
                "movapd %%xmm1, %%xmm13\n"
                "addpd %%xmm5, %%xmm9\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm14\n"
                "addpd %%xmm6, %%xmm10\n"
                "subpd %%xmm6, %%xmm14\n"
                "movapd %%xmm3, %%xmm11\n"
                "movapd %%xmm3, %%xmm15\n"
                "addpd %%xmm7, %%xmm11\n"
                "subpd %%xmm7, %%xmm15\n"
                "movupd %%xmm8, (%0)\n"
                "movupd %%xmm9, (%1)\n"
                "movupd %%xmm10, (%2)\n"
                "movupd %%xmm11, (%3)\n"
                "movupd %%xmm12, (%4)\n"
                "movupd %%xmm13, (%5)\n"
                "movupd %%xmm14, (%6)\n"
                "movupd %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
    /* Pass 2: radix-8 combine at stride 16 (16 -> 128-double blocks). */
    for (int j = 0; j < 8192; j += 128) {
        for (int k = 0; k < 16; k += 2) {
            __asm__ volatile (
                "movupd (%0), %%xmm0\n"
                "movupd (%1), %%xmm1\n"
                "movupd (%2), %%xmm2\n"
                "movupd (%3), %%xmm3\n"
                "movupd (%4), %%xmm4\n"
                "movupd (%5), %%xmm5\n"
                "movupd (%6), %%xmm6\n"
                "movupd (%7), %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm9\n"
                "addpd %%xmm1, %%xmm8\n"
                "subpd %%xmm1, %%xmm9\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm11\n"
                "addpd %%xmm3, %%xmm10\n"
                "subpd %%xmm3, %%xmm11\n"
                "movapd %%xmm4, %%xmm12\n"
                "movapd %%xmm4, %%xmm13\n"
                "addpd %%xmm5, %%xmm12\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm6, %%xmm14\n"
                "movapd %%xmm6, %%xmm15\n"
                "addpd %%xmm7, %%xmm14\n"
                "subpd %%xmm7, %%xmm15\n"
                "movapd %%xmm8, %%xmm0\n"
                "movapd %%xmm8, %%xmm2\n"
                "addpd %%xmm10, %%xmm0\n"
                "subpd %%xmm10, %%xmm2\n"
                "movapd %%xmm9, %%xmm1\n"
                "movapd %%xmm9, %%xmm3\n"
                "addpd %%xmm11, %%xmm1\n"
                "subpd %%xmm11, %%xmm3\n"
                "movapd %%xmm12, %%xmm4\n"
                "movapd %%xmm12, %%xmm6\n"
                "addpd %%xmm14, %%xmm4\n"
                "subpd %%xmm14, %%xmm6\n"
                "movapd %%xmm13, %%xmm5\n"
                "movapd %%xmm13, %%xmm7\n"
                "addpd %%xmm15, %%xmm5\n"
                "subpd %%xmm15, %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm12\n"
                "addpd %%xmm4, %%xmm8\n"
                "subpd %%xmm4, %%xmm12\n"
                "movapd %%xmm1, %%xmm9\n"
                "movapd %%xmm1, %%xmm13\n"
                "addpd %%xmm5, %%xmm9\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm14\n"
                "addpd %%xmm6, %%xmm10\n"
                "subpd %%xmm6, %%xmm14\n"
                "movapd %%xmm3, %%xmm11\n"
                "movapd %%xmm3, %%xmm15\n"
                "addpd %%xmm7, %%xmm11\n"
                "subpd %%xmm7, %%xmm15\n"
                "movupd %%xmm8, (%0)\n"
                "movupd %%xmm9, (%1)\n"
                "movupd %%xmm10, (%2)\n"
                "movupd %%xmm11, (%3)\n"
                "movupd %%xmm12, (%4)\n"
                "movupd %%xmm13, (%5)\n"
                "movupd %%xmm14, (%6)\n"
                "movupd %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
    /* Pass 3: radix-8 combine at stride 128 (128 -> 1024-double blocks). */
    for (int j = 0; j < 8192; j += 1024) {
        for (int k = 0; k < 128; k += 2) {
            __asm__ volatile (
                "movupd (%0), %%xmm0\n"
                "movupd (%1), %%xmm1\n"
                "movupd (%2), %%xmm2\n"
                "movupd (%3), %%xmm3\n"
                "movupd (%4), %%xmm4\n"
                "movupd (%5), %%xmm5\n"
                "movupd (%6), %%xmm6\n"
                "movupd (%7), %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm9\n"
                "addpd %%xmm1, %%xmm8\n"
                "subpd %%xmm1, %%xmm9\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm11\n"
                "addpd %%xmm3, %%xmm10\n"
                "subpd %%xmm3, %%xmm11\n"
                "movapd %%xmm4, %%xmm12\n"
                "movapd %%xmm4, %%xmm13\n"
                "addpd %%xmm5, %%xmm12\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm6, %%xmm14\n"
                "movapd %%xmm6, %%xmm15\n"
                "addpd %%xmm7, %%xmm14\n"
                "subpd %%xmm7, %%xmm15\n"
                "movapd %%xmm8, %%xmm0\n"
                "movapd %%xmm8, %%xmm2\n"
                "addpd %%xmm10, %%xmm0\n"
                "subpd %%xmm10, %%xmm2\n"
                "movapd %%xmm9, %%xmm1\n"
                "movapd %%xmm9, %%xmm3\n"
                "addpd %%xmm11, %%xmm1\n"
                "subpd %%xmm11, %%xmm3\n"
                "movapd %%xmm12, %%xmm4\n"
                "movapd %%xmm12, %%xmm6\n"
                "addpd %%xmm14, %%xmm4\n"
                "subpd %%xmm14, %%xmm6\n"
                "movapd %%xmm13, %%xmm5\n"
                "movapd %%xmm13, %%xmm7\n"
                "addpd %%xmm15, %%xmm5\n"
                "subpd %%xmm15, %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm12\n"
                "addpd %%xmm4, %%xmm8\n"
                "subpd %%xmm4, %%xmm12\n"
                "movapd %%xmm1, %%xmm9\n"
                "movapd %%xmm1, %%xmm13\n"
                "addpd %%xmm5, %%xmm9\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm14\n"
                "addpd %%xmm6, %%xmm10\n"
                "subpd %%xmm6, %%xmm14\n"
                "movapd %%xmm3, %%xmm11\n"
                "movapd %%xmm3, %%xmm15\n"
                "addpd %%xmm7, %%xmm11\n"
                "subpd %%xmm7, %%xmm15\n"
                "movupd %%xmm8, (%0)\n"
                "movupd %%xmm9, (%1)\n"
                "movupd %%xmm10, (%2)\n"
                "movupd %%xmm11, (%3)\n"
                "movupd %%xmm12, (%4)\n"
                "movupd %%xmm13, (%5)\n"
                "movupd %%xmm14, (%6)\n"
                "movupd %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
    /* Pass 4: radix-8 combine at stride 1024 (1024 -> full 8192 result). */
    for (int j = 0; j < 8192; j += 8192) {
        for (int k = 0; k < 1024; k += 2) {
            __asm__ volatile (
                "movupd (%0), %%xmm0\n"
                "movupd (%1), %%xmm1\n"
                "movupd (%2), %%xmm2\n"
                "movupd (%3), %%xmm3\n"
                "movupd (%4), %%xmm4\n"
                "movupd (%5), %%xmm5\n"
                "movupd (%6), %%xmm6\n"
                "movupd (%7), %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm9\n"
                "addpd %%xmm1, %%xmm8\n"
                "subpd %%xmm1, %%xmm9\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm11\n"
                "addpd %%xmm3, %%xmm10\n"
                "subpd %%xmm3, %%xmm11\n"
                "movapd %%xmm4, %%xmm12\n"
                "movapd %%xmm4, %%xmm13\n"
                "addpd %%xmm5, %%xmm12\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm6, %%xmm14\n"
                "movapd %%xmm6, %%xmm15\n"
                "addpd %%xmm7, %%xmm14\n"
                "subpd %%xmm7, %%xmm15\n"
                "movapd %%xmm8, %%xmm0\n"
                "movapd %%xmm8, %%xmm2\n"
                "addpd %%xmm10, %%xmm0\n"
                "subpd %%xmm10, %%xmm2\n"
                "movapd %%xmm9, %%xmm1\n"
                "movapd %%xmm9, %%xmm3\n"
                "addpd %%xmm11, %%xmm1\n"
                "subpd %%xmm11, %%xmm3\n"
                "movapd %%xmm12, %%xmm4\n"
                "movapd %%xmm12, %%xmm6\n"
                "addpd %%xmm14, %%xmm4\n"
                "subpd %%xmm14, %%xmm6\n"
                "movapd %%xmm13, %%xmm5\n"
                "movapd %%xmm13, %%xmm7\n"
                "addpd %%xmm15, %%xmm5\n"
                "subpd %%xmm15, %%xmm7\n"
                "movapd %%xmm0, %%xmm8\n"
                "movapd %%xmm0, %%xmm12\n"
                "addpd %%xmm4, %%xmm8\n"
                "subpd %%xmm4, %%xmm12\n"
                "movapd %%xmm1, %%xmm9\n"
                "movapd %%xmm1, %%xmm13\n"
                "addpd %%xmm5, %%xmm9\n"
                "subpd %%xmm5, %%xmm13\n"
                "movapd %%xmm2, %%xmm10\n"
                "movapd %%xmm2, %%xmm14\n"
                "addpd %%xmm6, %%xmm10\n"
                "subpd %%xmm6, %%xmm14\n"
                "movapd %%xmm3, %%xmm11\n"
                "movapd %%xmm3, %%xmm15\n"
                "addpd %%xmm7, %%xmm11\n"
                "subpd %%xmm7, %%xmm15\n"
                "movupd %%xmm8, (%0)\n"
                "movupd %%xmm9, (%1)\n"
                "movupd %%xmm10, (%2)\n"
                "movupd %%xmm11, (%3)\n"
                "movupd %%xmm12, (%4)\n"
                "movupd %%xmm13, (%5)\n"
                "movupd %%xmm14, (%6)\n"
                "movupd %%xmm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
            );
        }
    }
}
16300 void helper_double_14_recursive(double *buf, int depth);
helper_double_14_recursive(double * buf,int depth)16301 void helper_double_14_recursive(double *buf, int depth) {
16302 if (depth == 9) {
16303 for (int j = 0; j < 512; j += 16) {
16304 for (int k = 0; k < 2; k += 2) {
16305 __asm__ volatile (
16306 "movupd (%0), %%xmm0\n"
16307 "movupd (%1), %%xmm1\n"
16308 "movupd (%2), %%xmm2\n"
16309 "movupd (%3), %%xmm3\n"
16310 "movupd (%4), %%xmm4\n"
16311 "movupd (%5), %%xmm5\n"
16312 "movupd (%6), %%xmm6\n"
16313 "movupd (%7), %%xmm7\n"
16314 "movapd %%xmm0, %%xmm8\n"
16315 "haddpd %%xmm8, %%xmm8\n"
16316 "movapd %%xmm0, %%xmm9\n"
16317 "hsubpd %%xmm9, %%xmm9\n"
16318 "blendpd $1, %%xmm8, %%xmm9\n"
16319 "movapd %%xmm9, %%xmm0\n"
16320 "movapd %%xmm1, %%xmm8\n"
16321 "haddpd %%xmm8, %%xmm8\n"
16322 "movapd %%xmm1, %%xmm9\n"
16323 "hsubpd %%xmm9, %%xmm9\n"
16324 "blendpd $1, %%xmm8, %%xmm9\n"
16325 "movapd %%xmm9, %%xmm1\n"
16326 "movapd %%xmm2, %%xmm8\n"
16327 "haddpd %%xmm8, %%xmm8\n"
16328 "movapd %%xmm2, %%xmm9\n"
16329 "hsubpd %%xmm9, %%xmm9\n"
16330 "blendpd $1, %%xmm8, %%xmm9\n"
16331 "movapd %%xmm9, %%xmm2\n"
16332 "movapd %%xmm3, %%xmm8\n"
16333 "haddpd %%xmm8, %%xmm8\n"
16334 "movapd %%xmm3, %%xmm9\n"
16335 "hsubpd %%xmm9, %%xmm9\n"
16336 "blendpd $1, %%xmm8, %%xmm9\n"
16337 "movapd %%xmm9, %%xmm3\n"
16338 "movapd %%xmm4, %%xmm8\n"
16339 "haddpd %%xmm8, %%xmm8\n"
16340 "movapd %%xmm4, %%xmm9\n"
16341 "hsubpd %%xmm9, %%xmm9\n"
16342 "blendpd $1, %%xmm8, %%xmm9\n"
16343 "movapd %%xmm9, %%xmm4\n"
16344 "movapd %%xmm5, %%xmm8\n"
16345 "haddpd %%xmm8, %%xmm8\n"
16346 "movapd %%xmm5, %%xmm9\n"
16347 "hsubpd %%xmm9, %%xmm9\n"
16348 "blendpd $1, %%xmm8, %%xmm9\n"
16349 "movapd %%xmm9, %%xmm5\n"
16350 "movapd %%xmm6, %%xmm8\n"
16351 "haddpd %%xmm8, %%xmm8\n"
16352 "movapd %%xmm6, %%xmm9\n"
16353 "hsubpd %%xmm9, %%xmm9\n"
16354 "blendpd $1, %%xmm8, %%xmm9\n"
16355 "movapd %%xmm9, %%xmm6\n"
16356 "movapd %%xmm7, %%xmm8\n"
16357 "haddpd %%xmm8, %%xmm8\n"
16358 "movapd %%xmm7, %%xmm9\n"
16359 "hsubpd %%xmm9, %%xmm9\n"
16360 "blendpd $1, %%xmm8, %%xmm9\n"
16361 "movapd %%xmm9, %%xmm7\n"
16362 "movapd %%xmm0, %%xmm8\n"
16363 "movapd %%xmm0, %%xmm9\n"
16364 "addpd %%xmm1, %%xmm8\n"
16365 "subpd %%xmm1, %%xmm9\n"
16366 "movapd %%xmm2, %%xmm10\n"
16367 "movapd %%xmm2, %%xmm11\n"
16368 "addpd %%xmm3, %%xmm10\n"
16369 "subpd %%xmm3, %%xmm11\n"
16370 "movapd %%xmm4, %%xmm12\n"
16371 "movapd %%xmm4, %%xmm13\n"
16372 "addpd %%xmm5, %%xmm12\n"
16373 "subpd %%xmm5, %%xmm13\n"
16374 "movapd %%xmm6, %%xmm14\n"
16375 "movapd %%xmm6, %%xmm15\n"
16376 "addpd %%xmm7, %%xmm14\n"
16377 "subpd %%xmm7, %%xmm15\n"
16378 "movapd %%xmm8, %%xmm0\n"
16379 "movapd %%xmm8, %%xmm2\n"
16380 "addpd %%xmm10, %%xmm0\n"
16381 "subpd %%xmm10, %%xmm2\n"
16382 "movapd %%xmm9, %%xmm1\n"
16383 "movapd %%xmm9, %%xmm3\n"
16384 "addpd %%xmm11, %%xmm1\n"
16385 "subpd %%xmm11, %%xmm3\n"
16386 "movapd %%xmm12, %%xmm4\n"
16387 "movapd %%xmm12, %%xmm6\n"
16388 "addpd %%xmm14, %%xmm4\n"
16389 "subpd %%xmm14, %%xmm6\n"
16390 "movapd %%xmm13, %%xmm5\n"
16391 "movapd %%xmm13, %%xmm7\n"
16392 "addpd %%xmm15, %%xmm5\n"
16393 "subpd %%xmm15, %%xmm7\n"
16394 "movapd %%xmm0, %%xmm8\n"
16395 "movapd %%xmm0, %%xmm12\n"
16396 "addpd %%xmm4, %%xmm8\n"
16397 "subpd %%xmm4, %%xmm12\n"
16398 "movapd %%xmm1, %%xmm9\n"
16399 "movapd %%xmm1, %%xmm13\n"
16400 "addpd %%xmm5, %%xmm9\n"
16401 "subpd %%xmm5, %%xmm13\n"
16402 "movapd %%xmm2, %%xmm10\n"
16403 "movapd %%xmm2, %%xmm14\n"
16404 "addpd %%xmm6, %%xmm10\n"
16405 "subpd %%xmm6, %%xmm14\n"
16406 "movapd %%xmm3, %%xmm11\n"
16407 "movapd %%xmm3, %%xmm15\n"
16408 "addpd %%xmm7, %%xmm11\n"
16409 "subpd %%xmm7, %%xmm15\n"
16410 "movupd %%xmm8, (%0)\n"
16411 "movupd %%xmm9, (%1)\n"
16412 "movupd %%xmm10, (%2)\n"
16413 "movupd %%xmm11, (%3)\n"
16414 "movupd %%xmm12, (%4)\n"
16415 "movupd %%xmm13, (%5)\n"
16416 "movupd %%xmm14, (%6)\n"
16417 "movupd %%xmm15, (%7)\n"
16418 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16419 );
16420 }
16421 }
16422 for (int j = 0; j < 512; j += 128) {
16423 for (int k = 0; k < 16; k += 2) {
16424 __asm__ volatile (
16425 "movupd (%0), %%xmm0\n"
16426 "movupd (%1), %%xmm1\n"
16427 "movupd (%2), %%xmm2\n"
16428 "movupd (%3), %%xmm3\n"
16429 "movupd (%4), %%xmm4\n"
16430 "movupd (%5), %%xmm5\n"
16431 "movupd (%6), %%xmm6\n"
16432 "movupd (%7), %%xmm7\n"
16433 "movapd %%xmm0, %%xmm8\n"
16434 "movapd %%xmm0, %%xmm9\n"
16435 "addpd %%xmm1, %%xmm8\n"
16436 "subpd %%xmm1, %%xmm9\n"
16437 "movapd %%xmm2, %%xmm10\n"
16438 "movapd %%xmm2, %%xmm11\n"
16439 "addpd %%xmm3, %%xmm10\n"
16440 "subpd %%xmm3, %%xmm11\n"
16441 "movapd %%xmm4, %%xmm12\n"
16442 "movapd %%xmm4, %%xmm13\n"
16443 "addpd %%xmm5, %%xmm12\n"
16444 "subpd %%xmm5, %%xmm13\n"
16445 "movapd %%xmm6, %%xmm14\n"
16446 "movapd %%xmm6, %%xmm15\n"
16447 "addpd %%xmm7, %%xmm14\n"
16448 "subpd %%xmm7, %%xmm15\n"
16449 "movapd %%xmm8, %%xmm0\n"
16450 "movapd %%xmm8, %%xmm2\n"
16451 "addpd %%xmm10, %%xmm0\n"
16452 "subpd %%xmm10, %%xmm2\n"
16453 "movapd %%xmm9, %%xmm1\n"
16454 "movapd %%xmm9, %%xmm3\n"
16455 "addpd %%xmm11, %%xmm1\n"
16456 "subpd %%xmm11, %%xmm3\n"
16457 "movapd %%xmm12, %%xmm4\n"
16458 "movapd %%xmm12, %%xmm6\n"
16459 "addpd %%xmm14, %%xmm4\n"
16460 "subpd %%xmm14, %%xmm6\n"
16461 "movapd %%xmm13, %%xmm5\n"
16462 "movapd %%xmm13, %%xmm7\n"
16463 "addpd %%xmm15, %%xmm5\n"
16464 "subpd %%xmm15, %%xmm7\n"
16465 "movapd %%xmm0, %%xmm8\n"
16466 "movapd %%xmm0, %%xmm12\n"
16467 "addpd %%xmm4, %%xmm8\n"
16468 "subpd %%xmm4, %%xmm12\n"
16469 "movapd %%xmm1, %%xmm9\n"
16470 "movapd %%xmm1, %%xmm13\n"
16471 "addpd %%xmm5, %%xmm9\n"
16472 "subpd %%xmm5, %%xmm13\n"
16473 "movapd %%xmm2, %%xmm10\n"
16474 "movapd %%xmm2, %%xmm14\n"
16475 "addpd %%xmm6, %%xmm10\n"
16476 "subpd %%xmm6, %%xmm14\n"
16477 "movapd %%xmm3, %%xmm11\n"
16478 "movapd %%xmm3, %%xmm15\n"
16479 "addpd %%xmm7, %%xmm11\n"
16480 "subpd %%xmm7, %%xmm15\n"
16481 "movupd %%xmm8, (%0)\n"
16482 "movupd %%xmm9, (%1)\n"
16483 "movupd %%xmm10, (%2)\n"
16484 "movupd %%xmm11, (%3)\n"
16485 "movupd %%xmm12, (%4)\n"
16486 "movupd %%xmm13, (%5)\n"
16487 "movupd %%xmm14, (%6)\n"
16488 "movupd %%xmm15, (%7)\n"
16489 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16490 );
16491 }
16492 }
16493 for (int j = 0; j < 512; j += 512) {
16494 for (int k = 0; k < 128; k += 2) {
16495 __asm__ volatile (
16496 "movupd (%0), %%xmm0\n"
16497 "movupd (%1), %%xmm1\n"
16498 "movupd (%2), %%xmm2\n"
16499 "movupd (%3), %%xmm3\n"
16500 "movapd %%xmm0, %%xmm8\n"
16501 "movapd %%xmm0, %%xmm9\n"
16502 "addpd %%xmm1, %%xmm8\n"
16503 "subpd %%xmm1, %%xmm9\n"
16504 "movapd %%xmm2, %%xmm10\n"
16505 "movapd %%xmm2, %%xmm11\n"
16506 "addpd %%xmm3, %%xmm10\n"
16507 "subpd %%xmm3, %%xmm11\n"
16508 "movapd %%xmm8, %%xmm0\n"
16509 "movapd %%xmm8, %%xmm2\n"
16510 "addpd %%xmm10, %%xmm0\n"
16511 "subpd %%xmm10, %%xmm2\n"
16512 "movapd %%xmm9, %%xmm1\n"
16513 "movapd %%xmm9, %%xmm3\n"
16514 "addpd %%xmm11, %%xmm1\n"
16515 "subpd %%xmm11, %%xmm3\n"
16516 "movupd %%xmm0, (%0)\n"
16517 "movupd %%xmm1, (%1)\n"
16518 "movupd %%xmm2, (%2)\n"
16519 "movupd %%xmm3, (%3)\n"
16520 :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16521 );
16522 }
16523 }
16524 return;
16525 }
16526 if (depth == 12) {
16527 helper_double_14_recursive(buf + 0, 9);
16528 helper_double_14_recursive(buf + 512, 9);
16529 helper_double_14_recursive(buf + 1024, 9);
16530 helper_double_14_recursive(buf + 1536, 9);
16531 helper_double_14_recursive(buf + 2048, 9);
16532 helper_double_14_recursive(buf + 2560, 9);
16533 helper_double_14_recursive(buf + 3072, 9);
16534 helper_double_14_recursive(buf + 3584, 9);
16535 for (int j = 0; j < 4096; j += 4096) {
16536 for (int k = 0; k < 512; k += 2) {
16537 __asm__ volatile (
16538 "movupd (%0), %%xmm0\n"
16539 "movupd (%1), %%xmm1\n"
16540 "movupd (%2), %%xmm2\n"
16541 "movupd (%3), %%xmm3\n"
16542 "movupd (%4), %%xmm4\n"
16543 "movupd (%5), %%xmm5\n"
16544 "movupd (%6), %%xmm6\n"
16545 "movupd (%7), %%xmm7\n"
16546 "movapd %%xmm0, %%xmm8\n"
16547 "movapd %%xmm0, %%xmm9\n"
16548 "addpd %%xmm1, %%xmm8\n"
16549 "subpd %%xmm1, %%xmm9\n"
16550 "movapd %%xmm2, %%xmm10\n"
16551 "movapd %%xmm2, %%xmm11\n"
16552 "addpd %%xmm3, %%xmm10\n"
16553 "subpd %%xmm3, %%xmm11\n"
16554 "movapd %%xmm4, %%xmm12\n"
16555 "movapd %%xmm4, %%xmm13\n"
16556 "addpd %%xmm5, %%xmm12\n"
16557 "subpd %%xmm5, %%xmm13\n"
16558 "movapd %%xmm6, %%xmm14\n"
16559 "movapd %%xmm6, %%xmm15\n"
16560 "addpd %%xmm7, %%xmm14\n"
16561 "subpd %%xmm7, %%xmm15\n"
16562 "movapd %%xmm8, %%xmm0\n"
16563 "movapd %%xmm8, %%xmm2\n"
16564 "addpd %%xmm10, %%xmm0\n"
16565 "subpd %%xmm10, %%xmm2\n"
16566 "movapd %%xmm9, %%xmm1\n"
16567 "movapd %%xmm9, %%xmm3\n"
16568 "addpd %%xmm11, %%xmm1\n"
16569 "subpd %%xmm11, %%xmm3\n"
16570 "movapd %%xmm12, %%xmm4\n"
16571 "movapd %%xmm12, %%xmm6\n"
16572 "addpd %%xmm14, %%xmm4\n"
16573 "subpd %%xmm14, %%xmm6\n"
16574 "movapd %%xmm13, %%xmm5\n"
16575 "movapd %%xmm13, %%xmm7\n"
16576 "addpd %%xmm15, %%xmm5\n"
16577 "subpd %%xmm15, %%xmm7\n"
16578 "movapd %%xmm0, %%xmm8\n"
16579 "movapd %%xmm0, %%xmm12\n"
16580 "addpd %%xmm4, %%xmm8\n"
16581 "subpd %%xmm4, %%xmm12\n"
16582 "movapd %%xmm1, %%xmm9\n"
16583 "movapd %%xmm1, %%xmm13\n"
16584 "addpd %%xmm5, %%xmm9\n"
16585 "subpd %%xmm5, %%xmm13\n"
16586 "movapd %%xmm2, %%xmm10\n"
16587 "movapd %%xmm2, %%xmm14\n"
16588 "addpd %%xmm6, %%xmm10\n"
16589 "subpd %%xmm6, %%xmm14\n"
16590 "movapd %%xmm3, %%xmm11\n"
16591 "movapd %%xmm3, %%xmm15\n"
16592 "addpd %%xmm7, %%xmm11\n"
16593 "subpd %%xmm7, %%xmm15\n"
16594 "movupd %%xmm8, (%0)\n"
16595 "movupd %%xmm9, (%1)\n"
16596 "movupd %%xmm10, (%2)\n"
16597 "movupd %%xmm11, (%3)\n"
16598 "movupd %%xmm12, (%4)\n"
16599 "movupd %%xmm13, (%5)\n"
16600 "movupd %%xmm14, (%6)\n"
16601 "movupd %%xmm15, (%7)\n"
16602 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16603 );
16604 }
16605 }
16606 return;
16607 }
16608 if (depth == 14) {
16609 helper_double_14_recursive(buf + 0, 12);
16610 helper_double_14_recursive(buf + 4096, 12);
16611 helper_double_14_recursive(buf + 8192, 12);
16612 helper_double_14_recursive(buf + 12288, 12);
16613 for (int j = 0; j < 16384; j += 16384) {
16614 for (int k = 0; k < 4096; k += 2) {
16615 __asm__ volatile (
16616 "movupd (%0), %%xmm0\n"
16617 "movupd (%1), %%xmm1\n"
16618 "movupd (%2), %%xmm2\n"
16619 "movupd (%3), %%xmm3\n"
16620 "movapd %%xmm0, %%xmm8\n"
16621 "movapd %%xmm0, %%xmm9\n"
16622 "addpd %%xmm1, %%xmm8\n"
16623 "subpd %%xmm1, %%xmm9\n"
16624 "movapd %%xmm2, %%xmm10\n"
16625 "movapd %%xmm2, %%xmm11\n"
16626 "addpd %%xmm3, %%xmm10\n"
16627 "subpd %%xmm3, %%xmm11\n"
16628 "movapd %%xmm8, %%xmm0\n"
16629 "movapd %%xmm8, %%xmm2\n"
16630 "addpd %%xmm10, %%xmm0\n"
16631 "subpd %%xmm10, %%xmm2\n"
16632 "movapd %%xmm9, %%xmm1\n"
16633 "movapd %%xmm9, %%xmm3\n"
16634 "addpd %%xmm11, %%xmm1\n"
16635 "subpd %%xmm11, %%xmm3\n"
16636 "movupd %%xmm0, (%0)\n"
16637 "movupd %%xmm1, (%1)\n"
16638 "movupd %%xmm2, (%2)\n"
16639 "movupd %%xmm3, (%3)\n"
16640 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
16641 );
16642 }
16643 }
16644 return;
16645 }
16646 }
void helper_double_14(double *buf);
/*
 * In-place transform of 2^14 = 16384 doubles starting at buf.
 * Thin entry point: delegates to the recursive kernel at full depth.
 * buf must hold at least 16384 doubles; contents are overwritten.
 */
void helper_double_14(double *buf) {
    const int full_depth = 14; /* log2 of the transform length */
    helper_double_14_recursive(buf, full_depth);
}
void helper_double_15_recursive(double *buf, int depth);
/*
 * One stage of the size-2^15 in-place transform (entry point:
 * helper_double_15 below).  `depth` selects which stage to run and is
 * only ever one of {10, 13, 15} on the call paths visible here; any
 * other value falls through all branches and the call is a no-op.
 *
 * All arithmetic is pairwise butterflies (u+v, u-v) on 2-double SSE
 * vectors.  The asm blocks use movupd loads/stores (no alignment
 * requirement on buf) and clobber xmm0-xmm15 plus memory.
 *
 * NOTE(review): machine-generated code — the degenerate inner loops
 * (e.g. `for (k = 0; k < 2; k += 2)`) run exactly once and exist only
 * because of the generator's templating.
 */
void helper_double_15_recursive(double *buf, int depth) {
  if (depth == 10) {
    /* Base case: transform each 1024-double block in three fused passes. */
    /* Pass 1: strides 1,2,4,8 — a full 16-point transform per 16 doubles.
       The haddpd/hsubpd/blendpd triple turns [a0,a1] into [a0+a1,a0-a1]
       (stride-1 butterfly inside one register); the add/sub network that
       follows does strides 2, 4 and 8 across the eight registers. */
    for (int j = 0; j < 1024; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* In-register stride-1 butterfly on each of xmm0..xmm7. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* Radix-8 butterfly network across the eight vectors. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: strides 16, 32, 64 — radix-8 combine within each 128-double block. */
    for (int j = 0; j < 1024; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: strides 128, 256, 512 — radix-8 combine across the whole 1024 block. */
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 13) {
    /* Recurse on eight 1024-double sub-blocks, then combine them with a
       radix-8 butterfly (strides 1024, 2048, 4096). */
    helper_double_15_recursive(buf + 0, 10);
    helper_double_15_recursive(buf + 1024, 10);
    helper_double_15_recursive(buf + 2048, 10);
    helper_double_15_recursive(buf + 3072, 10);
    helper_double_15_recursive(buf + 4096, 10);
    helper_double_15_recursive(buf + 5120, 10);
    helper_double_15_recursive(buf + 6144, 10);
    helper_double_15_recursive(buf + 7168, 10);
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* Top level: recurse on four 8192-double quarters, then combine them
       with a radix-4 butterfly (strides 8192, 16384). */
    helper_double_15_recursive(buf + 0, 13);
    helper_double_15_recursive(buf + 8192, 13);
    helper_double_15_recursive(buf + 16384, 13);
    helper_double_15_recursive(buf + 24576, 13);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 8192; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_15(double *buf);
/*
 * In-place transform of 2^15 = 32768 doubles starting at buf.
 * Thin entry point: delegates to the recursive kernel at full depth.
 * buf must hold at least 32768 doubles; contents are overwritten.
 */
void helper_double_15(double *buf) {
    const int full_depth = 15; /* log2 of the transform length */
    helper_double_15_recursive(buf, full_depth);
}
void helper_double_16_recursive(double *buf, int depth);
/*
 * One stage of the size-2^16 in-place transform (entry point:
 * helper_double_16 below).  `depth` is only ever one of
 * {2, 5, 8, 11, 14, 16} on the call paths visible here; any other value
 * falls through all branches and the call is a no-op.
 *
 * Same building blocks as the other helpers in this file: pairwise
 * butterflies (u+v, u-v) on 2-double SSE vectors, unaligned movupd
 * loads/stores, xmm0-xmm15 plus memory clobbered by every asm block.
 *
 * NOTE(review): machine-generated code — single-iteration loops such as
 * `for (j = 0; j < 4; j += 4)` are artifacts of the generator's template.
 */
void helper_double_16_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: full 4-point transform on buf[0..3].
       haddpd/hsubpd/blendpd turns [a0,a1] into [a0+a1,a0-a1] (stride-1
       butterfly inside one register); the add/sub pair then does stride 2. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* Recurse on eight 4-double blocks, then radix-8 combine (strides 4, 8, 16). */
    helper_double_16_recursive(buf + 0, 2);
    helper_double_16_recursive(buf + 4, 2);
    helper_double_16_recursive(buf + 8, 2);
    helper_double_16_recursive(buf + 12, 2);
    helper_double_16_recursive(buf + 16, 2);
    helper_double_16_recursive(buf + 20, 2);
    helper_double_16_recursive(buf + 24, 2);
    helper_double_16_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* Recurse on eight 32-double blocks, then radix-8 combine (strides 32, 64, 128). */
    helper_double_16_recursive(buf + 0, 5);
    helper_double_16_recursive(buf + 32, 5);
    helper_double_16_recursive(buf + 64, 5);
    helper_double_16_recursive(buf + 96, 5);
    helper_double_16_recursive(buf + 128, 5);
    helper_double_16_recursive(buf + 160, 5);
    helper_double_16_recursive(buf + 192, 5);
    helper_double_16_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    /* Recurse on eight 256-double blocks, then radix-8 combine (strides 256, 512, 1024). */
    helper_double_16_recursive(buf + 0, 8);
    helper_double_16_recursive(buf + 256, 8);
    helper_double_16_recursive(buf + 512, 8);
    helper_double_16_recursive(buf + 768, 8);
    helper_double_16_recursive(buf + 1024, 8);
    helper_double_16_recursive(buf + 1280, 8);
    helper_double_16_recursive(buf + 1536, 8);
    helper_double_16_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Recurse on eight 2048-double blocks, then radix-8 combine (strides 2048, 4096, 8192). */
    helper_double_16_recursive(buf + 0, 11);
    helper_double_16_recursive(buf + 2048, 11);
    helper_double_16_recursive(buf + 4096, 11);
    helper_double_16_recursive(buf + 6144, 11);
    helper_double_16_recursive(buf + 8192, 11);
    helper_double_16_recursive(buf + 10240, 11);
    helper_double_16_recursive(buf + 12288, 11);
    helper_double_16_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* Top level: recurse on four 16384-double quarters, then combine them
       with a radix-4 butterfly (strides 16384, 32768). */
    helper_double_16_recursive(buf + 0, 14);
    helper_double_16_recursive(buf + 16384, 14);
    helper_double_16_recursive(buf + 32768, 14);
    helper_double_16_recursive(buf + 49152, 14);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_16(double *buf);
/*
 * In-place transform of 2^16 = 65536 doubles starting at buf.
 * Thin entry point: delegates to the recursive kernel at full depth.
 * buf must hold at least 65536 doubles; contents are overwritten.
 */
void helper_double_16(double *buf) {
    const int full_depth = 16; /* log2 of the transform length */
    helper_double_16_recursive(buf, full_depth);
}
void helper_double_17_recursive(double *buf, int depth);
/*
 * Recursive butterfly driver for the 2^17-element double-precision
 * transform (Hadamard-style add/sub butterflies; see fht.h).
 *
 * `depth` is log2 of the sub-block length handled by this call; the
 * generator only ever invokes this function with depth 17, which
 * recurses through 15 down to the base case 12.  Any other depth
 * falls through and does nothing.
 *
 * Each asm statement processes a pair of doubles per pointer operand
 * (one 128-bit xmm lane), reading and writing `buf` in place.  The
 * operand lists pass only input pointers; all stores go through those
 * pointers, which is why every xmm register plus "memory" is declared
 * clobbered.  NOTE(review): requires SSE3/SSE4.1 (haddpd/blendpd) —
 * presumably guaranteed by the build flags; confirm in fht.h/build.
 */
void helper_double_17_recursive(double *buf, int depth) {
  if (depth == 12) {
    /* Base case: a full 4096-point transform done as four in-place
     * passes of increasing stride.  Pass 1 fuses the stride-1
     * butterfly (haddpd/hsubpd/blendpd within each xmm pair) with a
     * radix-8 combine across strides 2..8. */
    for (int j = 0; j < 4096; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* In-register stride-1 butterfly on each loaded pair:
         * result = (lo+hi, lo-hi). */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Three levels of pairwise add/sub: radix-8 combine of the
         * eight vectors (strides 2, 4, 8 in element terms). */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: radix-8 combine of eight vectors spaced 16 apart
     * (covers element strides 16, 32, 64). */
    for (int j = 0; j < 4096; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: radix-8 combine, vectors spaced 128 apart
     * (strides 128, 256, 512). */
    for (int j = 0; j < 4096; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4: final radix-4 combine across the four 1024-element
     * quarters (strides 1024, 2048). */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movupd %%xmm0, (%0)\n"
        "movupd %%xmm1, (%1)\n"
        "movupd %%xmm2, (%2)\n"
        "movupd %%xmm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* 2^15 block = eight depth-12 sub-transforms followed by a
     * radix-8 combine of the eight 4096-element sub-blocks. */
    helper_double_17_recursive(buf + 0, 12);
    helper_double_17_recursive(buf + 4096, 12);
    helper_double_17_recursive(buf + 8192, 12);
    helper_double_17_recursive(buf + 12288, 12);
    helper_double_17_recursive(buf + 16384, 12);
    helper_double_17_recursive(buf + 20480, 12);
    helper_double_17_recursive(buf + 24576, 12);
    helper_double_17_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    /* Top level: four depth-15 sub-transforms, then a radix-4 combine
     * of the four 32768-element quarters. */
    helper_double_17_recursive(buf + 0, 15);
    helper_double_17_recursive(buf + 32768, 15);
    helper_double_17_recursive(buf + 65536, 15);
    helper_double_17_recursive(buf + 98304, 15);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 32768; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movupd %%xmm0, (%0)\n"
        "movupd %%xmm1, (%1)\n"
        "movupd %%xmm2, (%2)\n"
        "movupd %%xmm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_17(double *buf);
/* Entry point for the 2^17-element double-precision transform:
 * kicks off the recursive butterfly driver at the top depth. */
void helper_double_17(double *buf) {
  const int top_depth = 17; /* log2 of the transform length */
  helper_double_17_recursive(buf, top_depth);
}
void helper_double_18_recursive(double *buf, int depth);
/*
 * Recursive butterfly driver for the 2^18-element double-precision
 * transform (Hadamard-style add/sub butterflies; see fht.h).
 *
 * `depth` is log2 of the sub-block length handled by this call; the
 * generator only ever invokes this function with depth 18, which
 * recurses through 15 down to the base case 12.  Any other depth
 * falls through and does nothing.
 *
 * Each asm statement processes a pair of doubles per pointer operand
 * (one 128-bit xmm lane), reading and writing `buf` in place.  The
 * operand lists pass only input pointers; all stores go through those
 * pointers, which is why every xmm register plus "memory" is declared
 * clobbered.  NOTE(review): requires SSE3/SSE4.1 (haddpd/blendpd) —
 * presumably guaranteed by the build flags; confirm in fht.h/build.
 */
void helper_double_18_recursive(double *buf, int depth) {
  if (depth == 12) {
    /* Base case: a full 4096-point transform done as four in-place
     * passes of increasing stride.  Pass 1 fuses the stride-1
     * butterfly (haddpd/hsubpd/blendpd within each xmm pair) with a
     * radix-8 combine across strides 2..8. */
    for (int j = 0; j < 4096; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* In-register stride-1 butterfly on each loaded pair:
         * result = (lo+hi, lo-hi). */
        "movapd %%xmm0, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm0\n"
        "movapd %%xmm1, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm1, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm2, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm2, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm2\n"
        "movapd %%xmm3, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm3, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm3\n"
        "movapd %%xmm4, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm4, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm4\n"
        "movapd %%xmm5, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm5, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm5\n"
        "movapd %%xmm6, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm6, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm6\n"
        "movapd %%xmm7, %%xmm8\n"
        "haddpd %%xmm8, %%xmm8\n"
        "movapd %%xmm7, %%xmm9\n"
        "hsubpd %%xmm9, %%xmm9\n"
        "blendpd $1, %%xmm8, %%xmm9\n"
        "movapd %%xmm9, %%xmm7\n"
        /* Three levels of pairwise add/sub: radix-8 combine of the
         * eight vectors (strides 2, 4, 8 in element terms). */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2: radix-8 combine of eight vectors spaced 16 apart
     * (covers element strides 16, 32, 64). */
    for (int j = 0; j < 4096; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3: radix-8 combine, vectors spaced 128 apart
     * (strides 128, 256, 512). */
    for (int j = 0; j < 4096; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4: final radix-4 combine across the four 1024-element
     * quarters (strides 1024, 2048). */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movupd %%xmm0, (%0)\n"
        "movupd %%xmm1, (%1)\n"
        "movupd %%xmm2, (%2)\n"
        "movupd %%xmm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* 2^15 block = eight depth-12 sub-transforms followed by a
     * radix-8 combine of the eight 4096-element sub-blocks. */
    helper_double_18_recursive(buf + 0, 12);
    helper_double_18_recursive(buf + 4096, 12);
    helper_double_18_recursive(buf + 8192, 12);
    helper_double_18_recursive(buf + 12288, 12);
    helper_double_18_recursive(buf + 16384, 12);
    helper_double_18_recursive(buf + 20480, 12);
    helper_double_18_recursive(buf + 24576, 12);
    helper_double_18_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    /* Top level: eight depth-15 sub-transforms, then a radix-8
     * combine of the eight 32768-element sub-blocks. */
    helper_double_18_recursive(buf + 0, 15);
    helper_double_18_recursive(buf + 32768, 15);
    helper_double_18_recursive(buf + 65536, 15);
    helper_double_18_recursive(buf + 98304, 15);
    helper_double_18_recursive(buf + 131072, 15);
    helper_double_18_recursive(buf + 163840, 15);
    helper_double_18_recursive(buf + 196608, 15);
    helper_double_18_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 2) {
        __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_18(double *buf);
/* Entry point for the 2^18-element double-precision transform:
 * kicks off the recursive butterfly driver at the top depth. */
void helper_double_18(double *buf) {
  const int top_depth = 18; /* log2 of the transform length */
  helper_double_18_recursive(buf, top_depth);
}
void helper_double_19_recursive(double *buf, int depth);

/*
 * 8-way in-place butterfly over eight pairs of doubles.
 *
 * Each pN points at two contiguous doubles.  All eight pairs are loaded,
 * pushed through three add/sub stages (an unnormalized 8-point
 * Walsh-Hadamard combine applied lane-wise to both SSE lanes), and stored
 * back through the same pointers.  The asm body is extracted verbatim from
 * the formerly repeated per-depth blocks so the combine step exists once.
 */
static inline void helper_double_19_butterfly8(double *p0, double *p1,
                                               double *p2, double *p3,
                                               double *p4, double *p5,
                                               double *p6, double *p7) {
    __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movupd (%4), %%xmm4\n"
        "movupd (%5), %%xmm5\n"
        "movupd (%6), %%xmm6\n"
        "movupd (%7), %%xmm7\n"
        /* stage 1: pairwise sums/differences of neighbors */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm4, %%xmm12\n"
        "movapd %%xmm4, %%xmm13\n"
        "addpd %%xmm5, %%xmm12\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm6, %%xmm14\n"
        "movapd %%xmm6, %%xmm15\n"
        "addpd %%xmm7, %%xmm14\n"
        "subpd %%xmm7, %%xmm15\n"
        /* stage 2: combine at distance 2 */
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movapd %%xmm12, %%xmm4\n"
        "movapd %%xmm12, %%xmm6\n"
        "addpd %%xmm14, %%xmm4\n"
        "subpd %%xmm14, %%xmm6\n"
        "movapd %%xmm13, %%xmm5\n"
        "movapd %%xmm13, %%xmm7\n"
        "addpd %%xmm15, %%xmm5\n"
        "subpd %%xmm15, %%xmm7\n"
        /* stage 3: combine at distance 4 */
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm12\n"
        "addpd %%xmm4, %%xmm8\n"
        "subpd %%xmm4, %%xmm12\n"
        "movapd %%xmm1, %%xmm9\n"
        "movapd %%xmm1, %%xmm13\n"
        "addpd %%xmm5, %%xmm9\n"
        "subpd %%xmm5, %%xmm13\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm14\n"
        "addpd %%xmm6, %%xmm10\n"
        "subpd %%xmm6, %%xmm14\n"
        "movapd %%xmm3, %%xmm11\n"
        "movapd %%xmm3, %%xmm15\n"
        "addpd %%xmm7, %%xmm11\n"
        "subpd %%xmm7, %%xmm15\n"
        "movupd %%xmm8, (%0)\n"
        "movupd %%xmm9, (%1)\n"
        "movupd %%xmm10, (%2)\n"
        "movupd %%xmm11, (%3)\n"
        "movupd %%xmm12, (%4)\n"
        "movupd %%xmm13, (%5)\n"
        "movupd %%xmm14, (%6)\n"
        "movupd %%xmm15, (%7)\n"
        :: "r"(p0), "r"(p1), "r"(p2), "r"(p3), "r"(p4), "r"(p5), "r"(p6), "r"(p7) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
}

/*
 * 4-way in-place butterfly over four pairs of doubles: two add/sub stages
 * (an unnormalized 4-point Walsh-Hadamard combine applied lane-wise).
 * Used only by the depth-19 top level, which merges four sub-blocks.
 */
static inline void helper_double_19_butterfly4(double *p0, double *p1,
                                               double *p2, double *p3) {
    __asm__ volatile (
        "movupd (%0), %%xmm0\n"
        "movupd (%1), %%xmm1\n"
        "movupd (%2), %%xmm2\n"
        "movupd (%3), %%xmm3\n"
        "movapd %%xmm0, %%xmm8\n"
        "movapd %%xmm0, %%xmm9\n"
        "addpd %%xmm1, %%xmm8\n"
        "subpd %%xmm1, %%xmm9\n"
        "movapd %%xmm2, %%xmm10\n"
        "movapd %%xmm2, %%xmm11\n"
        "addpd %%xmm3, %%xmm10\n"
        "subpd %%xmm3, %%xmm11\n"
        "movapd %%xmm8, %%xmm0\n"
        "movapd %%xmm8, %%xmm2\n"
        "addpd %%xmm10, %%xmm0\n"
        "subpd %%xmm10, %%xmm2\n"
        "movapd %%xmm9, %%xmm1\n"
        "movapd %%xmm9, %%xmm3\n"
        "addpd %%xmm11, %%xmm1\n"
        "subpd %%xmm11, %%xmm3\n"
        "movupd %%xmm0, (%0)\n"
        "movupd %%xmm1, (%1)\n"
        "movupd %%xmm2, (%2)\n"
        "movupd %%xmm3, (%3)\n"
        :: "r"(p0), "r"(p1), "r"(p2), "r"(p3) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
    );
}

/*
 * In-place unnormalized Walsh-Hadamard transform of 2^depth doubles at buf.
 *
 * The recursion is unrolled radix-8: each intermediate depth first
 * transforms eight contiguous sub-blocks at depth-3, then merges them with
 * the 8-way butterfly.  The top level (depth == 19) merges four depth-17
 * sub-blocks with the 4-way butterfly.  Only the depths on the call chain
 * from helper_double_19 (19 -> 17 -> 14 -> 11 -> 8 -> 5 -> 2) are handled;
 * any other depth falls through as a no-op, matching the original code.
 */
void helper_double_19_recursive(double *buf, int depth) {
    if (depth == 2) {
        /* Base case: 4-point transform of buf[0..3].  For each loaded pair
         * (a, b), haddpd/hsubpd + blendpd produce (a+b, a-b); a final
         * add/sub merges the two transformed pairs. */
        __asm__ volatile (
            "movupd (%0), %%xmm0\n"
            "movupd (%1), %%xmm1\n"
            "movapd %%xmm0, %%xmm8\n"
            "haddpd %%xmm8, %%xmm8\n"
            "movapd %%xmm0, %%xmm9\n"
            "hsubpd %%xmm9, %%xmm9\n"
            "blendpd $1, %%xmm8, %%xmm9\n"
            "movapd %%xmm9, %%xmm0\n"
            "movapd %%xmm1, %%xmm8\n"
            "haddpd %%xmm8, %%xmm8\n"
            "movapd %%xmm1, %%xmm9\n"
            "hsubpd %%xmm9, %%xmm9\n"
            "blendpd $1, %%xmm8, %%xmm9\n"
            "movapd %%xmm9, %%xmm1\n"
            "movapd %%xmm0, %%xmm8\n"
            "movapd %%xmm0, %%xmm9\n"
            "addpd %%xmm1, %%xmm8\n"
            "subpd %%xmm1, %%xmm9\n"
            "movupd %%xmm8, (%0)\n"
            "movupd %%xmm9, (%1)\n"
            :: "r"(buf), "r"(buf + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
        return;
    }
    if (depth == 5 || depth == 8 || depth == 11 || depth == 14 || depth == 17) {
        /* Radix-8 level: eight sub-blocks of 2^(depth-3) doubles each. */
        const int sub = 1 << (depth - 3);
        for (int i = 0; i < 8 * sub; i += sub)
            helper_double_19_recursive(buf + i, depth - 3);
        for (int k = 0; k < sub; k += 2)
            helper_double_19_butterfly8(buf + k,
                                        buf + k + sub,
                                        buf + k + 2 * sub,
                                        buf + k + 3 * sub,
                                        buf + k + 4 * sub,
                                        buf + k + 5 * sub,
                                        buf + k + 6 * sub,
                                        buf + k + 7 * sub);
        return;
    }
    if (depth == 19) {
        /* Top level: radix-4 merge of four depth-17 sub-blocks. */
        const int sub = 1 << 17;  /* 131072 doubles per sub-block */
        for (int i = 0; i < 4 * sub; i += sub)
            helper_double_19_recursive(buf + i, 17);
        for (int k = 0; k < sub; k += 2)
            helper_double_19_butterfly4(buf + k,
                                        buf + k + sub,
                                        buf + k + 2 * sub,
                                        buf + k + 3 * sub);
        return;
    }
}
void helper_double_19(double *buf);
/* Entry point: in-place transform of the 2^19 doubles starting at buf,
 * delegating to the recursive driver at its top depth. */
void helper_double_19(double *buf) {
    const int full_depth = 19;
    helper_double_19_recursive(buf, full_depth);
}
18818 void helper_double_20_recursive(double *buf, int depth);
helper_double_20_recursive(double * buf,int depth)18819 void helper_double_20_recursive(double *buf, int depth) {
18820 if (depth == 12) {
18821 for (int j = 0; j < 4096; j += 16) {
18822 for (int k = 0; k < 2; k += 2) {
18823 __asm__ volatile (
18824 "movupd (%0), %%xmm0\n"
18825 "movupd (%1), %%xmm1\n"
18826 "movupd (%2), %%xmm2\n"
18827 "movupd (%3), %%xmm3\n"
18828 "movupd (%4), %%xmm4\n"
18829 "movupd (%5), %%xmm5\n"
18830 "movupd (%6), %%xmm6\n"
18831 "movupd (%7), %%xmm7\n"
18832 "movapd %%xmm0, %%xmm8\n"
18833 "haddpd %%xmm8, %%xmm8\n"
18834 "movapd %%xmm0, %%xmm9\n"
18835 "hsubpd %%xmm9, %%xmm9\n"
18836 "blendpd $1, %%xmm8, %%xmm9\n"
18837 "movapd %%xmm9, %%xmm0\n"
18838 "movapd %%xmm1, %%xmm8\n"
18839 "haddpd %%xmm8, %%xmm8\n"
18840 "movapd %%xmm1, %%xmm9\n"
18841 "hsubpd %%xmm9, %%xmm9\n"
18842 "blendpd $1, %%xmm8, %%xmm9\n"
18843 "movapd %%xmm9, %%xmm1\n"
18844 "movapd %%xmm2, %%xmm8\n"
18845 "haddpd %%xmm8, %%xmm8\n"
18846 "movapd %%xmm2, %%xmm9\n"
18847 "hsubpd %%xmm9, %%xmm9\n"
18848 "blendpd $1, %%xmm8, %%xmm9\n"
18849 "movapd %%xmm9, %%xmm2\n"
18850 "movapd %%xmm3, %%xmm8\n"
18851 "haddpd %%xmm8, %%xmm8\n"
18852 "movapd %%xmm3, %%xmm9\n"
18853 "hsubpd %%xmm9, %%xmm9\n"
18854 "blendpd $1, %%xmm8, %%xmm9\n"
18855 "movapd %%xmm9, %%xmm3\n"
18856 "movapd %%xmm4, %%xmm8\n"
18857 "haddpd %%xmm8, %%xmm8\n"
18858 "movapd %%xmm4, %%xmm9\n"
18859 "hsubpd %%xmm9, %%xmm9\n"
18860 "blendpd $1, %%xmm8, %%xmm9\n"
18861 "movapd %%xmm9, %%xmm4\n"
18862 "movapd %%xmm5, %%xmm8\n"
18863 "haddpd %%xmm8, %%xmm8\n"
18864 "movapd %%xmm5, %%xmm9\n"
18865 "hsubpd %%xmm9, %%xmm9\n"
18866 "blendpd $1, %%xmm8, %%xmm9\n"
18867 "movapd %%xmm9, %%xmm5\n"
18868 "movapd %%xmm6, %%xmm8\n"
18869 "haddpd %%xmm8, %%xmm8\n"
18870 "movapd %%xmm6, %%xmm9\n"
18871 "hsubpd %%xmm9, %%xmm9\n"
18872 "blendpd $1, %%xmm8, %%xmm9\n"
18873 "movapd %%xmm9, %%xmm6\n"
18874 "movapd %%xmm7, %%xmm8\n"
18875 "haddpd %%xmm8, %%xmm8\n"
18876 "movapd %%xmm7, %%xmm9\n"
18877 "hsubpd %%xmm9, %%xmm9\n"
18878 "blendpd $1, %%xmm8, %%xmm9\n"
18879 "movapd %%xmm9, %%xmm7\n"
18880 "movapd %%xmm0, %%xmm8\n"
18881 "movapd %%xmm0, %%xmm9\n"
18882 "addpd %%xmm1, %%xmm8\n"
18883 "subpd %%xmm1, %%xmm9\n"
18884 "movapd %%xmm2, %%xmm10\n"
18885 "movapd %%xmm2, %%xmm11\n"
18886 "addpd %%xmm3, %%xmm10\n"
18887 "subpd %%xmm3, %%xmm11\n"
18888 "movapd %%xmm4, %%xmm12\n"
18889 "movapd %%xmm4, %%xmm13\n"
18890 "addpd %%xmm5, %%xmm12\n"
18891 "subpd %%xmm5, %%xmm13\n"
18892 "movapd %%xmm6, %%xmm14\n"
18893 "movapd %%xmm6, %%xmm15\n"
18894 "addpd %%xmm7, %%xmm14\n"
18895 "subpd %%xmm7, %%xmm15\n"
18896 "movapd %%xmm8, %%xmm0\n"
18897 "movapd %%xmm8, %%xmm2\n"
18898 "addpd %%xmm10, %%xmm0\n"
18899 "subpd %%xmm10, %%xmm2\n"
18900 "movapd %%xmm9, %%xmm1\n"
18901 "movapd %%xmm9, %%xmm3\n"
18902 "addpd %%xmm11, %%xmm1\n"
18903 "subpd %%xmm11, %%xmm3\n"
18904 "movapd %%xmm12, %%xmm4\n"
18905 "movapd %%xmm12, %%xmm6\n"
18906 "addpd %%xmm14, %%xmm4\n"
18907 "subpd %%xmm14, %%xmm6\n"
18908 "movapd %%xmm13, %%xmm5\n"
18909 "movapd %%xmm13, %%xmm7\n"
18910 "addpd %%xmm15, %%xmm5\n"
18911 "subpd %%xmm15, %%xmm7\n"
18912 "movapd %%xmm0, %%xmm8\n"
18913 "movapd %%xmm0, %%xmm12\n"
18914 "addpd %%xmm4, %%xmm8\n"
18915 "subpd %%xmm4, %%xmm12\n"
18916 "movapd %%xmm1, %%xmm9\n"
18917 "movapd %%xmm1, %%xmm13\n"
18918 "addpd %%xmm5, %%xmm9\n"
18919 "subpd %%xmm5, %%xmm13\n"
18920 "movapd %%xmm2, %%xmm10\n"
18921 "movapd %%xmm2, %%xmm14\n"
18922 "addpd %%xmm6, %%xmm10\n"
18923 "subpd %%xmm6, %%xmm14\n"
18924 "movapd %%xmm3, %%xmm11\n"
18925 "movapd %%xmm3, %%xmm15\n"
18926 "addpd %%xmm7, %%xmm11\n"
18927 "subpd %%xmm7, %%xmm15\n"
18928 "movupd %%xmm8, (%0)\n"
18929 "movupd %%xmm9, (%1)\n"
18930 "movupd %%xmm10, (%2)\n"
18931 "movupd %%xmm11, (%3)\n"
18932 "movupd %%xmm12, (%4)\n"
18933 "movupd %%xmm13, (%5)\n"
18934 "movupd %%xmm14, (%6)\n"
18935 "movupd %%xmm15, (%7)\n"
18936 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
18937 );
18938 }
18939 }
18940 for (int j = 0; j < 4096; j += 128) {
18941 for (int k = 0; k < 16; k += 2) {
18942 __asm__ volatile (
18943 "movupd (%0), %%xmm0\n"
18944 "movupd (%1), %%xmm1\n"
18945 "movupd (%2), %%xmm2\n"
18946 "movupd (%3), %%xmm3\n"
18947 "movupd (%4), %%xmm4\n"
18948 "movupd (%5), %%xmm5\n"
18949 "movupd (%6), %%xmm6\n"
18950 "movupd (%7), %%xmm7\n"
18951 "movapd %%xmm0, %%xmm8\n"
18952 "movapd %%xmm0, %%xmm9\n"
18953 "addpd %%xmm1, %%xmm8\n"
18954 "subpd %%xmm1, %%xmm9\n"
18955 "movapd %%xmm2, %%xmm10\n"
18956 "movapd %%xmm2, %%xmm11\n"
18957 "addpd %%xmm3, %%xmm10\n"
18958 "subpd %%xmm3, %%xmm11\n"
18959 "movapd %%xmm4, %%xmm12\n"
18960 "movapd %%xmm4, %%xmm13\n"
18961 "addpd %%xmm5, %%xmm12\n"
18962 "subpd %%xmm5, %%xmm13\n"
18963 "movapd %%xmm6, %%xmm14\n"
18964 "movapd %%xmm6, %%xmm15\n"
18965 "addpd %%xmm7, %%xmm14\n"
18966 "subpd %%xmm7, %%xmm15\n"
18967 "movapd %%xmm8, %%xmm0\n"
18968 "movapd %%xmm8, %%xmm2\n"
18969 "addpd %%xmm10, %%xmm0\n"
18970 "subpd %%xmm10, %%xmm2\n"
18971 "movapd %%xmm9, %%xmm1\n"
18972 "movapd %%xmm9, %%xmm3\n"
18973 "addpd %%xmm11, %%xmm1\n"
18974 "subpd %%xmm11, %%xmm3\n"
18975 "movapd %%xmm12, %%xmm4\n"
18976 "movapd %%xmm12, %%xmm6\n"
18977 "addpd %%xmm14, %%xmm4\n"
18978 "subpd %%xmm14, %%xmm6\n"
18979 "movapd %%xmm13, %%xmm5\n"
18980 "movapd %%xmm13, %%xmm7\n"
18981 "addpd %%xmm15, %%xmm5\n"
18982 "subpd %%xmm15, %%xmm7\n"
18983 "movapd %%xmm0, %%xmm8\n"
18984 "movapd %%xmm0, %%xmm12\n"
18985 "addpd %%xmm4, %%xmm8\n"
18986 "subpd %%xmm4, %%xmm12\n"
18987 "movapd %%xmm1, %%xmm9\n"
18988 "movapd %%xmm1, %%xmm13\n"
18989 "addpd %%xmm5, %%xmm9\n"
18990 "subpd %%xmm5, %%xmm13\n"
18991 "movapd %%xmm2, %%xmm10\n"
18992 "movapd %%xmm2, %%xmm14\n"
18993 "addpd %%xmm6, %%xmm10\n"
18994 "subpd %%xmm6, %%xmm14\n"
18995 "movapd %%xmm3, %%xmm11\n"
18996 "movapd %%xmm3, %%xmm15\n"
18997 "addpd %%xmm7, %%xmm11\n"
18998 "subpd %%xmm7, %%xmm15\n"
18999 "movupd %%xmm8, (%0)\n"
19000 "movupd %%xmm9, (%1)\n"
19001 "movupd %%xmm10, (%2)\n"
19002 "movupd %%xmm11, (%3)\n"
19003 "movupd %%xmm12, (%4)\n"
19004 "movupd %%xmm13, (%5)\n"
19005 "movupd %%xmm14, (%6)\n"
19006 "movupd %%xmm15, (%7)\n"
19007 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19008 );
19009 }
19010 }
19011 for (int j = 0; j < 4096; j += 1024) {
19012 for (int k = 0; k < 128; k += 2) {
19013 __asm__ volatile (
19014 "movupd (%0), %%xmm0\n"
19015 "movupd (%1), %%xmm1\n"
19016 "movupd (%2), %%xmm2\n"
19017 "movupd (%3), %%xmm3\n"
19018 "movupd (%4), %%xmm4\n"
19019 "movupd (%5), %%xmm5\n"
19020 "movupd (%6), %%xmm6\n"
19021 "movupd (%7), %%xmm7\n"
19022 "movapd %%xmm0, %%xmm8\n"
19023 "movapd %%xmm0, %%xmm9\n"
19024 "addpd %%xmm1, %%xmm8\n"
19025 "subpd %%xmm1, %%xmm9\n"
19026 "movapd %%xmm2, %%xmm10\n"
19027 "movapd %%xmm2, %%xmm11\n"
19028 "addpd %%xmm3, %%xmm10\n"
19029 "subpd %%xmm3, %%xmm11\n"
19030 "movapd %%xmm4, %%xmm12\n"
19031 "movapd %%xmm4, %%xmm13\n"
19032 "addpd %%xmm5, %%xmm12\n"
19033 "subpd %%xmm5, %%xmm13\n"
19034 "movapd %%xmm6, %%xmm14\n"
19035 "movapd %%xmm6, %%xmm15\n"
19036 "addpd %%xmm7, %%xmm14\n"
19037 "subpd %%xmm7, %%xmm15\n"
19038 "movapd %%xmm8, %%xmm0\n"
19039 "movapd %%xmm8, %%xmm2\n"
19040 "addpd %%xmm10, %%xmm0\n"
19041 "subpd %%xmm10, %%xmm2\n"
19042 "movapd %%xmm9, %%xmm1\n"
19043 "movapd %%xmm9, %%xmm3\n"
19044 "addpd %%xmm11, %%xmm1\n"
19045 "subpd %%xmm11, %%xmm3\n"
19046 "movapd %%xmm12, %%xmm4\n"
19047 "movapd %%xmm12, %%xmm6\n"
19048 "addpd %%xmm14, %%xmm4\n"
19049 "subpd %%xmm14, %%xmm6\n"
19050 "movapd %%xmm13, %%xmm5\n"
19051 "movapd %%xmm13, %%xmm7\n"
19052 "addpd %%xmm15, %%xmm5\n"
19053 "subpd %%xmm15, %%xmm7\n"
19054 "movapd %%xmm0, %%xmm8\n"
19055 "movapd %%xmm0, %%xmm12\n"
19056 "addpd %%xmm4, %%xmm8\n"
19057 "subpd %%xmm4, %%xmm12\n"
19058 "movapd %%xmm1, %%xmm9\n"
19059 "movapd %%xmm1, %%xmm13\n"
19060 "addpd %%xmm5, %%xmm9\n"
19061 "subpd %%xmm5, %%xmm13\n"
19062 "movapd %%xmm2, %%xmm10\n"
19063 "movapd %%xmm2, %%xmm14\n"
19064 "addpd %%xmm6, %%xmm10\n"
19065 "subpd %%xmm6, %%xmm14\n"
19066 "movapd %%xmm3, %%xmm11\n"
19067 "movapd %%xmm3, %%xmm15\n"
19068 "addpd %%xmm7, %%xmm11\n"
19069 "subpd %%xmm7, %%xmm15\n"
19070 "movupd %%xmm8, (%0)\n"
19071 "movupd %%xmm9, (%1)\n"
19072 "movupd %%xmm10, (%2)\n"
19073 "movupd %%xmm11, (%3)\n"
19074 "movupd %%xmm12, (%4)\n"
19075 "movupd %%xmm13, (%5)\n"
19076 "movupd %%xmm14, (%6)\n"
19077 "movupd %%xmm15, (%7)\n"
19078 :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19079 );
19080 }
19081 }
19082 for (int j = 0; j < 4096; j += 4096) {
19083 for (int k = 0; k < 1024; k += 2) {
19084 __asm__ volatile (
19085 "movupd (%0), %%xmm0\n"
19086 "movupd (%1), %%xmm1\n"
19087 "movupd (%2), %%xmm2\n"
19088 "movupd (%3), %%xmm3\n"
19089 "movapd %%xmm0, %%xmm8\n"
19090 "movapd %%xmm0, %%xmm9\n"
19091 "addpd %%xmm1, %%xmm8\n"
19092 "subpd %%xmm1, %%xmm9\n"
19093 "movapd %%xmm2, %%xmm10\n"
19094 "movapd %%xmm2, %%xmm11\n"
19095 "addpd %%xmm3, %%xmm10\n"
19096 "subpd %%xmm3, %%xmm11\n"
19097 "movapd %%xmm8, %%xmm0\n"
19098 "movapd %%xmm8, %%xmm2\n"
19099 "addpd %%xmm10, %%xmm0\n"
19100 "subpd %%xmm10, %%xmm2\n"
19101 "movapd %%xmm9, %%xmm1\n"
19102 "movapd %%xmm9, %%xmm3\n"
19103 "addpd %%xmm11, %%xmm1\n"
19104 "subpd %%xmm11, %%xmm3\n"
19105 "movupd %%xmm0, (%0)\n"
19106 "movupd %%xmm1, (%1)\n"
19107 "movupd %%xmm2, (%2)\n"
19108 "movupd %%xmm3, (%3)\n"
19109 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19110 );
19111 }
19112 }
19113 return;
19114 }
19115 if (depth == 15) {
19116 helper_double_20_recursive(buf + 0, 12);
19117 helper_double_20_recursive(buf + 4096, 12);
19118 helper_double_20_recursive(buf + 8192, 12);
19119 helper_double_20_recursive(buf + 12288, 12);
19120 helper_double_20_recursive(buf + 16384, 12);
19121 helper_double_20_recursive(buf + 20480, 12);
19122 helper_double_20_recursive(buf + 24576, 12);
19123 helper_double_20_recursive(buf + 28672, 12);
19124 for (int j = 0; j < 32768; j += 32768) {
19125 for (int k = 0; k < 4096; k += 2) {
19126 __asm__ volatile (
19127 "movupd (%0), %%xmm0\n"
19128 "movupd (%1), %%xmm1\n"
19129 "movupd (%2), %%xmm2\n"
19130 "movupd (%3), %%xmm3\n"
19131 "movupd (%4), %%xmm4\n"
19132 "movupd (%5), %%xmm5\n"
19133 "movupd (%6), %%xmm6\n"
19134 "movupd (%7), %%xmm7\n"
19135 "movapd %%xmm0, %%xmm8\n"
19136 "movapd %%xmm0, %%xmm9\n"
19137 "addpd %%xmm1, %%xmm8\n"
19138 "subpd %%xmm1, %%xmm9\n"
19139 "movapd %%xmm2, %%xmm10\n"
19140 "movapd %%xmm2, %%xmm11\n"
19141 "addpd %%xmm3, %%xmm10\n"
19142 "subpd %%xmm3, %%xmm11\n"
19143 "movapd %%xmm4, %%xmm12\n"
19144 "movapd %%xmm4, %%xmm13\n"
19145 "addpd %%xmm5, %%xmm12\n"
19146 "subpd %%xmm5, %%xmm13\n"
19147 "movapd %%xmm6, %%xmm14\n"
19148 "movapd %%xmm6, %%xmm15\n"
19149 "addpd %%xmm7, %%xmm14\n"
19150 "subpd %%xmm7, %%xmm15\n"
19151 "movapd %%xmm8, %%xmm0\n"
19152 "movapd %%xmm8, %%xmm2\n"
19153 "addpd %%xmm10, %%xmm0\n"
19154 "subpd %%xmm10, %%xmm2\n"
19155 "movapd %%xmm9, %%xmm1\n"
19156 "movapd %%xmm9, %%xmm3\n"
19157 "addpd %%xmm11, %%xmm1\n"
19158 "subpd %%xmm11, %%xmm3\n"
19159 "movapd %%xmm12, %%xmm4\n"
19160 "movapd %%xmm12, %%xmm6\n"
19161 "addpd %%xmm14, %%xmm4\n"
19162 "subpd %%xmm14, %%xmm6\n"
19163 "movapd %%xmm13, %%xmm5\n"
19164 "movapd %%xmm13, %%xmm7\n"
19165 "addpd %%xmm15, %%xmm5\n"
19166 "subpd %%xmm15, %%xmm7\n"
19167 "movapd %%xmm0, %%xmm8\n"
19168 "movapd %%xmm0, %%xmm12\n"
19169 "addpd %%xmm4, %%xmm8\n"
19170 "subpd %%xmm4, %%xmm12\n"
19171 "movapd %%xmm1, %%xmm9\n"
19172 "movapd %%xmm1, %%xmm13\n"
19173 "addpd %%xmm5, %%xmm9\n"
19174 "subpd %%xmm5, %%xmm13\n"
19175 "movapd %%xmm2, %%xmm10\n"
19176 "movapd %%xmm2, %%xmm14\n"
19177 "addpd %%xmm6, %%xmm10\n"
19178 "subpd %%xmm6, %%xmm14\n"
19179 "movapd %%xmm3, %%xmm11\n"
19180 "movapd %%xmm3, %%xmm15\n"
19181 "addpd %%xmm7, %%xmm11\n"
19182 "subpd %%xmm7, %%xmm15\n"
19183 "movupd %%xmm8, (%0)\n"
19184 "movupd %%xmm9, (%1)\n"
19185 "movupd %%xmm10, (%2)\n"
19186 "movupd %%xmm11, (%3)\n"
19187 "movupd %%xmm12, (%4)\n"
19188 "movupd %%xmm13, (%5)\n"
19189 "movupd %%xmm14, (%6)\n"
19190 "movupd %%xmm15, (%7)\n"
19191 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19192 );
19193 }
19194 }
19195 return;
19196 }
19197 if (depth == 18) {
19198 helper_double_20_recursive(buf + 0, 15);
19199 helper_double_20_recursive(buf + 32768, 15);
19200 helper_double_20_recursive(buf + 65536, 15);
19201 helper_double_20_recursive(buf + 98304, 15);
19202 helper_double_20_recursive(buf + 131072, 15);
19203 helper_double_20_recursive(buf + 163840, 15);
19204 helper_double_20_recursive(buf + 196608, 15);
19205 helper_double_20_recursive(buf + 229376, 15);
19206 for (int j = 0; j < 262144; j += 262144) {
19207 for (int k = 0; k < 32768; k += 2) {
19208 __asm__ volatile (
19209 "movupd (%0), %%xmm0\n"
19210 "movupd (%1), %%xmm1\n"
19211 "movupd (%2), %%xmm2\n"
19212 "movupd (%3), %%xmm3\n"
19213 "movupd (%4), %%xmm4\n"
19214 "movupd (%5), %%xmm5\n"
19215 "movupd (%6), %%xmm6\n"
19216 "movupd (%7), %%xmm7\n"
19217 "movapd %%xmm0, %%xmm8\n"
19218 "movapd %%xmm0, %%xmm9\n"
19219 "addpd %%xmm1, %%xmm8\n"
19220 "subpd %%xmm1, %%xmm9\n"
19221 "movapd %%xmm2, %%xmm10\n"
19222 "movapd %%xmm2, %%xmm11\n"
19223 "addpd %%xmm3, %%xmm10\n"
19224 "subpd %%xmm3, %%xmm11\n"
19225 "movapd %%xmm4, %%xmm12\n"
19226 "movapd %%xmm4, %%xmm13\n"
19227 "addpd %%xmm5, %%xmm12\n"
19228 "subpd %%xmm5, %%xmm13\n"
19229 "movapd %%xmm6, %%xmm14\n"
19230 "movapd %%xmm6, %%xmm15\n"
19231 "addpd %%xmm7, %%xmm14\n"
19232 "subpd %%xmm7, %%xmm15\n"
19233 "movapd %%xmm8, %%xmm0\n"
19234 "movapd %%xmm8, %%xmm2\n"
19235 "addpd %%xmm10, %%xmm0\n"
19236 "subpd %%xmm10, %%xmm2\n"
19237 "movapd %%xmm9, %%xmm1\n"
19238 "movapd %%xmm9, %%xmm3\n"
19239 "addpd %%xmm11, %%xmm1\n"
19240 "subpd %%xmm11, %%xmm3\n"
19241 "movapd %%xmm12, %%xmm4\n"
19242 "movapd %%xmm12, %%xmm6\n"
19243 "addpd %%xmm14, %%xmm4\n"
19244 "subpd %%xmm14, %%xmm6\n"
19245 "movapd %%xmm13, %%xmm5\n"
19246 "movapd %%xmm13, %%xmm7\n"
19247 "addpd %%xmm15, %%xmm5\n"
19248 "subpd %%xmm15, %%xmm7\n"
19249 "movapd %%xmm0, %%xmm8\n"
19250 "movapd %%xmm0, %%xmm12\n"
19251 "addpd %%xmm4, %%xmm8\n"
19252 "subpd %%xmm4, %%xmm12\n"
19253 "movapd %%xmm1, %%xmm9\n"
19254 "movapd %%xmm1, %%xmm13\n"
19255 "addpd %%xmm5, %%xmm9\n"
19256 "subpd %%xmm5, %%xmm13\n"
19257 "movapd %%xmm2, %%xmm10\n"
19258 "movapd %%xmm2, %%xmm14\n"
19259 "addpd %%xmm6, %%xmm10\n"
19260 "subpd %%xmm6, %%xmm14\n"
19261 "movapd %%xmm3, %%xmm11\n"
19262 "movapd %%xmm3, %%xmm15\n"
19263 "addpd %%xmm7, %%xmm11\n"
19264 "subpd %%xmm7, %%xmm15\n"
19265 "movupd %%xmm8, (%0)\n"
19266 "movupd %%xmm9, (%1)\n"
19267 "movupd %%xmm10, (%2)\n"
19268 "movupd %%xmm11, (%3)\n"
19269 "movupd %%xmm12, (%4)\n"
19270 "movupd %%xmm13, (%5)\n"
19271 "movupd %%xmm14, (%6)\n"
19272 "movupd %%xmm15, (%7)\n"
19273 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19274 );
19275 }
19276 }
19277 return;
19278 }
19279 if (depth == 20) {
19280 helper_double_20_recursive(buf + 0, 18);
19281 helper_double_20_recursive(buf + 262144, 18);
19282 helper_double_20_recursive(buf + 524288, 18);
19283 helper_double_20_recursive(buf + 786432, 18);
19284 for (int j = 0; j < 1048576; j += 1048576) {
19285 for (int k = 0; k < 262144; k += 2) {
19286 __asm__ volatile (
19287 "movupd (%0), %%xmm0\n"
19288 "movupd (%1), %%xmm1\n"
19289 "movupd (%2), %%xmm2\n"
19290 "movupd (%3), %%xmm3\n"
19291 "movapd %%xmm0, %%xmm8\n"
19292 "movapd %%xmm0, %%xmm9\n"
19293 "addpd %%xmm1, %%xmm8\n"
19294 "subpd %%xmm1, %%xmm9\n"
19295 "movapd %%xmm2, %%xmm10\n"
19296 "movapd %%xmm2, %%xmm11\n"
19297 "addpd %%xmm3, %%xmm10\n"
19298 "subpd %%xmm3, %%xmm11\n"
19299 "movapd %%xmm8, %%xmm0\n"
19300 "movapd %%xmm8, %%xmm2\n"
19301 "addpd %%xmm10, %%xmm0\n"
19302 "subpd %%xmm10, %%xmm2\n"
19303 "movapd %%xmm9, %%xmm1\n"
19304 "movapd %%xmm9, %%xmm3\n"
19305 "addpd %%xmm11, %%xmm1\n"
19306 "subpd %%xmm11, %%xmm3\n"
19307 "movupd %%xmm0, (%0)\n"
19308 "movupd %%xmm1, (%1)\n"
19309 "movupd %%xmm2, (%2)\n"
19310 "movupd %%xmm3, (%3)\n"
19311 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19312 );
19313 }
19314 }
19315 return;
19316 }
19317 }
void helper_double_20(double *buf);
/**
 * In-place, unnormalized Walsh-Hadamard transform over 2^20 doubles.
 *
 * Thin public entry point: delegates all work to the recursive kernel,
 * starting it at the full transform depth.  buf must hold 1048576 doubles;
 * no particular alignment is required (the kernel uses unaligned loads).
 */
void helper_double_20(double *buf) {
    const int top_depth = 20; /* log2 of the transform length */
    helper_double_20_recursive(buf, top_depth);
}
/* Depth-controlled recursive kernel for the size-2^21 unnormalized
 * Walsh-Hadamard transform (public entry point: helper_double_21 below). */
void helper_double_21_recursive(double *buf, int depth);
/*
 * helper_double_21_recursive - transform 2^depth doubles in place, starting
 * at buf, using radix-2 butterflies a' = a + b, b' = a - b (no normalization).
 *
 * Only the depths emitted by the generator are handled: 13 (base case),
 * 16, 19, and 21 (each recurses into the previous one, then merges the
 * sub-blocks).  Any other depth value falls through all the branches and is
 * a silent no-op.
 *
 * Every asm block processes two doubles per pointer (one XMM register) per
 * iteration, loads with movupd / stores with movupd (so buf needs no special
 * alignment), and declares xmm0-xmm15 plus "memory" as clobbers so the
 * compiler cannot cache buf contents across the asm.
 *
 * NOTE(review): machine-generated code; comments only, no code changes.
 */
void helper_double_21_recursive(double *buf, int depth) {
  if (depth == 13) {
    /* Base case: full 13-level transform of 8192 doubles, done in four
     * passes over the buffer.
     *
     * Pass 1 (levels 1-4): each iteration handles one contiguous block of
     * 16 doubles held in xmm0..xmm7.  Level 1 is the within-register pair
     * butterfly: haddpd puts x0+x1 in both lanes, hsubpd puts x0-x1 in both
     * lanes, and blendpd $1 merges them into {x0+x1, x0-x1}.  Levels 2-4
     * are the add/sub network across the eight registers (strides 2, 4, 8
     * doubles), with results landing in xmm8..xmm15 for the stores. */
    for (int j = 0; j < 8192; j += 16) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          /* load 16 doubles: pointers %0..%7 are 2 doubles apart */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* level 1: in-register butterfly {a+b, a-b} for each of xmm0..xmm7 */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm2, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm2, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm2\n"
          "movapd %%xmm3, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm3, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm3\n"
          "movapd %%xmm4, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm4, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm4\n"
          "movapd %%xmm5, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm5, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm5\n"
          "movapd %%xmm6, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm6, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm6\n"
          "movapd %%xmm7, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm7, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm7\n"
          /* level 2: pairwise add/sub between neighbors (stride 1 register) */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* level 3: stride 2 registers */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* level 4: stride 4 registers; results end up in xmm8..xmm15 */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          /* store the transformed 16 doubles back in place */
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 2 (levels 5-7): merge eight 16-double sub-blocks into 128-double
     * blocks.  Same 3-level add/sub network as above, but the eight pointers
     * are now 16 doubles apart and k sweeps the sub-block interior. */
    for (int j = 0; j < 8192; j += 128) {
      for (int k = 0; k < 16; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* 3-level butterfly across the eight loaded vectors */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 3 (levels 8-10): merge eight 128-double sub-blocks into
     * 1024-double blocks (pointers 128 doubles apart). */
    for (int j = 0; j < 8192; j += 1024) {
      for (int k = 0; k < 128; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* 3-level butterfly across the eight loaded vectors */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    /* Pass 4 (levels 11-13): merge eight 1024-double sub-blocks into the
     * final 8192-double block (pointers 1024 doubles apart). */
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 1024; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* 3-level butterfly across the eight loaded vectors */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* Transform eight 8192-double sub-blocks, then merge them with three
     * butterfly levels (pointers 8192 doubles apart). */
    helper_double_21_recursive(buf + 0, 13);
    helper_double_21_recursive(buf + 8192, 13);
    helper_double_21_recursive(buf + 16384, 13);
    helper_double_21_recursive(buf + 24576, 13);
    helper_double_21_recursive(buf + 32768, 13);
    helper_double_21_recursive(buf + 40960, 13);
    helper_double_21_recursive(buf + 49152, 13);
    helper_double_21_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* 3-level butterfly across the eight loaded vectors */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 19) {
    /* Transform eight 65536-double sub-blocks, then merge them with three
     * butterfly levels (pointers 65536 doubles apart). */
    helper_double_21_recursive(buf + 0, 16);
    helper_double_21_recursive(buf + 65536, 16);
    helper_double_21_recursive(buf + 131072, 16);
    helper_double_21_recursive(buf + 196608, 16);
    helper_double_21_recursive(buf + 262144, 16);
    helper_double_21_recursive(buf + 327680, 16);
    helper_double_21_recursive(buf + 393216, 16);
    helper_double_21_recursive(buf + 458752, 16);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 65536; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* 3-level butterfly across the eight loaded vectors */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    /* Top level: transform four 524288-double sub-blocks, then merge them
     * with the remaining two butterfly levels (only four streams, so just
     * xmm0..xmm3 plus four temporaries are needed). */
    helper_double_21_recursive(buf + 0, 19);
    helper_double_21_recursive(buf + 524288, 19);
    helper_double_21_recursive(buf + 1048576, 19);
    helper_double_21_recursive(buf + 1572864, 19);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 524288; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          /* level 20: stride 524288 */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          /* level 21: stride 1048576 */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movupd %%xmm0, (%0)\n"
          "movupd %%xmm1, (%1)\n"
          "movupd %%xmm2, (%2)\n"
          "movupd %%xmm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_21(double *buf);
/**
 * In-place, unnormalized Walsh-Hadamard transform over 2^21 doubles.
 *
 * Thin public entry point: delegates all work to the recursive kernel,
 * starting it at the full transform depth.  buf must hold 2097152 doubles;
 * no particular alignment is required (the kernel uses unaligned loads).
 */
void helper_double_21(double *buf) {
    const int top_depth = 21; /* log2 of the transform length */
    helper_double_21_recursive(buf, top_depth);
}
19866 void helper_double_22_recursive(double *buf, int depth);
helper_double_22_recursive(double * buf,int depth)19867 void helper_double_22_recursive(double *buf, int depth) {
19868 if (depth == 2) {
19869 for (int j = 0; j < 4; j += 4) {
19870 for (int k = 0; k < 2; k += 2) {
19871 __asm__ volatile (
19872 "movupd (%0), %%xmm0\n"
19873 "movupd (%1), %%xmm1\n"
19874 "movapd %%xmm0, %%xmm8\n"
19875 "haddpd %%xmm8, %%xmm8\n"
19876 "movapd %%xmm0, %%xmm9\n"
19877 "hsubpd %%xmm9, %%xmm9\n"
19878 "blendpd $1, %%xmm8, %%xmm9\n"
19879 "movapd %%xmm9, %%xmm0\n"
19880 "movapd %%xmm1, %%xmm8\n"
19881 "haddpd %%xmm8, %%xmm8\n"
19882 "movapd %%xmm1, %%xmm9\n"
19883 "hsubpd %%xmm9, %%xmm9\n"
19884 "blendpd $1, %%xmm8, %%xmm9\n"
19885 "movapd %%xmm9, %%xmm1\n"
19886 "movapd %%xmm0, %%xmm8\n"
19887 "movapd %%xmm0, %%xmm9\n"
19888 "addpd %%xmm1, %%xmm8\n"
19889 "subpd %%xmm1, %%xmm9\n"
19890 "movupd %%xmm8, (%0)\n"
19891 "movupd %%xmm9, (%1)\n"
19892 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19893 );
19894 }
19895 }
19896 return;
19897 }
19898 if (depth == 5) {
19899 helper_double_22_recursive(buf + 0, 2);
19900 helper_double_22_recursive(buf + 4, 2);
19901 helper_double_22_recursive(buf + 8, 2);
19902 helper_double_22_recursive(buf + 12, 2);
19903 helper_double_22_recursive(buf + 16, 2);
19904 helper_double_22_recursive(buf + 20, 2);
19905 helper_double_22_recursive(buf + 24, 2);
19906 helper_double_22_recursive(buf + 28, 2);
19907 for (int j = 0; j < 32; j += 32) {
19908 for (int k = 0; k < 4; k += 2) {
19909 __asm__ volatile (
19910 "movupd (%0), %%xmm0\n"
19911 "movupd (%1), %%xmm1\n"
19912 "movupd (%2), %%xmm2\n"
19913 "movupd (%3), %%xmm3\n"
19914 "movupd (%4), %%xmm4\n"
19915 "movupd (%5), %%xmm5\n"
19916 "movupd (%6), %%xmm6\n"
19917 "movupd (%7), %%xmm7\n"
19918 "movapd %%xmm0, %%xmm8\n"
19919 "movapd %%xmm0, %%xmm9\n"
19920 "addpd %%xmm1, %%xmm8\n"
19921 "subpd %%xmm1, %%xmm9\n"
19922 "movapd %%xmm2, %%xmm10\n"
19923 "movapd %%xmm2, %%xmm11\n"
19924 "addpd %%xmm3, %%xmm10\n"
19925 "subpd %%xmm3, %%xmm11\n"
19926 "movapd %%xmm4, %%xmm12\n"
19927 "movapd %%xmm4, %%xmm13\n"
19928 "addpd %%xmm5, %%xmm12\n"
19929 "subpd %%xmm5, %%xmm13\n"
19930 "movapd %%xmm6, %%xmm14\n"
19931 "movapd %%xmm6, %%xmm15\n"
19932 "addpd %%xmm7, %%xmm14\n"
19933 "subpd %%xmm7, %%xmm15\n"
19934 "movapd %%xmm8, %%xmm0\n"
19935 "movapd %%xmm8, %%xmm2\n"
19936 "addpd %%xmm10, %%xmm0\n"
19937 "subpd %%xmm10, %%xmm2\n"
19938 "movapd %%xmm9, %%xmm1\n"
19939 "movapd %%xmm9, %%xmm3\n"
19940 "addpd %%xmm11, %%xmm1\n"
19941 "subpd %%xmm11, %%xmm3\n"
19942 "movapd %%xmm12, %%xmm4\n"
19943 "movapd %%xmm12, %%xmm6\n"
19944 "addpd %%xmm14, %%xmm4\n"
19945 "subpd %%xmm14, %%xmm6\n"
19946 "movapd %%xmm13, %%xmm5\n"
19947 "movapd %%xmm13, %%xmm7\n"
19948 "addpd %%xmm15, %%xmm5\n"
19949 "subpd %%xmm15, %%xmm7\n"
19950 "movapd %%xmm0, %%xmm8\n"
19951 "movapd %%xmm0, %%xmm12\n"
19952 "addpd %%xmm4, %%xmm8\n"
19953 "subpd %%xmm4, %%xmm12\n"
19954 "movapd %%xmm1, %%xmm9\n"
19955 "movapd %%xmm1, %%xmm13\n"
19956 "addpd %%xmm5, %%xmm9\n"
19957 "subpd %%xmm5, %%xmm13\n"
19958 "movapd %%xmm2, %%xmm10\n"
19959 "movapd %%xmm2, %%xmm14\n"
19960 "addpd %%xmm6, %%xmm10\n"
19961 "subpd %%xmm6, %%xmm14\n"
19962 "movapd %%xmm3, %%xmm11\n"
19963 "movapd %%xmm3, %%xmm15\n"
19964 "addpd %%xmm7, %%xmm11\n"
19965 "subpd %%xmm7, %%xmm15\n"
19966 "movupd %%xmm8, (%0)\n"
19967 "movupd %%xmm9, (%1)\n"
19968 "movupd %%xmm10, (%2)\n"
19969 "movupd %%xmm11, (%3)\n"
19970 "movupd %%xmm12, (%4)\n"
19971 "movupd %%xmm13, (%5)\n"
19972 "movupd %%xmm14, (%6)\n"
19973 "movupd %%xmm15, (%7)\n"
19974 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
19975 );
19976 }
19977 }
19978 return;
19979 }
19980 if (depth == 8) {
19981 helper_double_22_recursive(buf + 0, 5);
19982 helper_double_22_recursive(buf + 32, 5);
19983 helper_double_22_recursive(buf + 64, 5);
19984 helper_double_22_recursive(buf + 96, 5);
19985 helper_double_22_recursive(buf + 128, 5);
19986 helper_double_22_recursive(buf + 160, 5);
19987 helper_double_22_recursive(buf + 192, 5);
19988 helper_double_22_recursive(buf + 224, 5);
19989 for (int j = 0; j < 256; j += 256) {
19990 for (int k = 0; k < 32; k += 2) {
19991 __asm__ volatile (
19992 "movupd (%0), %%xmm0\n"
19993 "movupd (%1), %%xmm1\n"
19994 "movupd (%2), %%xmm2\n"
19995 "movupd (%3), %%xmm3\n"
19996 "movupd (%4), %%xmm4\n"
19997 "movupd (%5), %%xmm5\n"
19998 "movupd (%6), %%xmm6\n"
19999 "movupd (%7), %%xmm7\n"
20000 "movapd %%xmm0, %%xmm8\n"
20001 "movapd %%xmm0, %%xmm9\n"
20002 "addpd %%xmm1, %%xmm8\n"
20003 "subpd %%xmm1, %%xmm9\n"
20004 "movapd %%xmm2, %%xmm10\n"
20005 "movapd %%xmm2, %%xmm11\n"
20006 "addpd %%xmm3, %%xmm10\n"
20007 "subpd %%xmm3, %%xmm11\n"
20008 "movapd %%xmm4, %%xmm12\n"
20009 "movapd %%xmm4, %%xmm13\n"
20010 "addpd %%xmm5, %%xmm12\n"
20011 "subpd %%xmm5, %%xmm13\n"
20012 "movapd %%xmm6, %%xmm14\n"
20013 "movapd %%xmm6, %%xmm15\n"
20014 "addpd %%xmm7, %%xmm14\n"
20015 "subpd %%xmm7, %%xmm15\n"
20016 "movapd %%xmm8, %%xmm0\n"
20017 "movapd %%xmm8, %%xmm2\n"
20018 "addpd %%xmm10, %%xmm0\n"
20019 "subpd %%xmm10, %%xmm2\n"
20020 "movapd %%xmm9, %%xmm1\n"
20021 "movapd %%xmm9, %%xmm3\n"
20022 "addpd %%xmm11, %%xmm1\n"
20023 "subpd %%xmm11, %%xmm3\n"
20024 "movapd %%xmm12, %%xmm4\n"
20025 "movapd %%xmm12, %%xmm6\n"
20026 "addpd %%xmm14, %%xmm4\n"
20027 "subpd %%xmm14, %%xmm6\n"
20028 "movapd %%xmm13, %%xmm5\n"
20029 "movapd %%xmm13, %%xmm7\n"
20030 "addpd %%xmm15, %%xmm5\n"
20031 "subpd %%xmm15, %%xmm7\n"
20032 "movapd %%xmm0, %%xmm8\n"
20033 "movapd %%xmm0, %%xmm12\n"
20034 "addpd %%xmm4, %%xmm8\n"
20035 "subpd %%xmm4, %%xmm12\n"
20036 "movapd %%xmm1, %%xmm9\n"
20037 "movapd %%xmm1, %%xmm13\n"
20038 "addpd %%xmm5, %%xmm9\n"
20039 "subpd %%xmm5, %%xmm13\n"
20040 "movapd %%xmm2, %%xmm10\n"
20041 "movapd %%xmm2, %%xmm14\n"
20042 "addpd %%xmm6, %%xmm10\n"
20043 "subpd %%xmm6, %%xmm14\n"
20044 "movapd %%xmm3, %%xmm11\n"
20045 "movapd %%xmm3, %%xmm15\n"
20046 "addpd %%xmm7, %%xmm11\n"
20047 "subpd %%xmm7, %%xmm15\n"
20048 "movupd %%xmm8, (%0)\n"
20049 "movupd %%xmm9, (%1)\n"
20050 "movupd %%xmm10, (%2)\n"
20051 "movupd %%xmm11, (%3)\n"
20052 "movupd %%xmm12, (%4)\n"
20053 "movupd %%xmm13, (%5)\n"
20054 "movupd %%xmm14, (%6)\n"
20055 "movupd %%xmm15, (%7)\n"
20056 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
20057 );
20058 }
20059 }
20060 return;
20061 }
20062 if (depth == 11) {
20063 helper_double_22_recursive(buf + 0, 8);
20064 helper_double_22_recursive(buf + 256, 8);
20065 helper_double_22_recursive(buf + 512, 8);
20066 helper_double_22_recursive(buf + 768, 8);
20067 helper_double_22_recursive(buf + 1024, 8);
20068 helper_double_22_recursive(buf + 1280, 8);
20069 helper_double_22_recursive(buf + 1536, 8);
20070 helper_double_22_recursive(buf + 1792, 8);
20071 for (int j = 0; j < 2048; j += 2048) {
20072 for (int k = 0; k < 256; k += 2) {
20073 __asm__ volatile (
20074 "movupd (%0), %%xmm0\n"
20075 "movupd (%1), %%xmm1\n"
20076 "movupd (%2), %%xmm2\n"
20077 "movupd (%3), %%xmm3\n"
20078 "movupd (%4), %%xmm4\n"
20079 "movupd (%5), %%xmm5\n"
20080 "movupd (%6), %%xmm6\n"
20081 "movupd (%7), %%xmm7\n"
20082 "movapd %%xmm0, %%xmm8\n"
20083 "movapd %%xmm0, %%xmm9\n"
20084 "addpd %%xmm1, %%xmm8\n"
20085 "subpd %%xmm1, %%xmm9\n"
20086 "movapd %%xmm2, %%xmm10\n"
20087 "movapd %%xmm2, %%xmm11\n"
20088 "addpd %%xmm3, %%xmm10\n"
20089 "subpd %%xmm3, %%xmm11\n"
20090 "movapd %%xmm4, %%xmm12\n"
20091 "movapd %%xmm4, %%xmm13\n"
20092 "addpd %%xmm5, %%xmm12\n"
20093 "subpd %%xmm5, %%xmm13\n"
20094 "movapd %%xmm6, %%xmm14\n"
20095 "movapd %%xmm6, %%xmm15\n"
20096 "addpd %%xmm7, %%xmm14\n"
20097 "subpd %%xmm7, %%xmm15\n"
20098 "movapd %%xmm8, %%xmm0\n"
20099 "movapd %%xmm8, %%xmm2\n"
20100 "addpd %%xmm10, %%xmm0\n"
20101 "subpd %%xmm10, %%xmm2\n"
20102 "movapd %%xmm9, %%xmm1\n"
20103 "movapd %%xmm9, %%xmm3\n"
20104 "addpd %%xmm11, %%xmm1\n"
20105 "subpd %%xmm11, %%xmm3\n"
20106 "movapd %%xmm12, %%xmm4\n"
20107 "movapd %%xmm12, %%xmm6\n"
20108 "addpd %%xmm14, %%xmm4\n"
20109 "subpd %%xmm14, %%xmm6\n"
20110 "movapd %%xmm13, %%xmm5\n"
20111 "movapd %%xmm13, %%xmm7\n"
20112 "addpd %%xmm15, %%xmm5\n"
20113 "subpd %%xmm15, %%xmm7\n"
20114 "movapd %%xmm0, %%xmm8\n"
20115 "movapd %%xmm0, %%xmm12\n"
20116 "addpd %%xmm4, %%xmm8\n"
20117 "subpd %%xmm4, %%xmm12\n"
20118 "movapd %%xmm1, %%xmm9\n"
20119 "movapd %%xmm1, %%xmm13\n"
20120 "addpd %%xmm5, %%xmm9\n"
20121 "subpd %%xmm5, %%xmm13\n"
20122 "movapd %%xmm2, %%xmm10\n"
20123 "movapd %%xmm2, %%xmm14\n"
20124 "addpd %%xmm6, %%xmm10\n"
20125 "subpd %%xmm6, %%xmm14\n"
20126 "movapd %%xmm3, %%xmm11\n"
20127 "movapd %%xmm3, %%xmm15\n"
20128 "addpd %%xmm7, %%xmm11\n"
20129 "subpd %%xmm7, %%xmm15\n"
20130 "movupd %%xmm8, (%0)\n"
20131 "movupd %%xmm9, (%1)\n"
20132 "movupd %%xmm10, (%2)\n"
20133 "movupd %%xmm11, (%3)\n"
20134 "movupd %%xmm12, (%4)\n"
20135 "movupd %%xmm13, (%5)\n"
20136 "movupd %%xmm14, (%6)\n"
20137 "movupd %%xmm15, (%7)\n"
20138 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
20139 );
20140 }
20141 }
20142 return;
20143 }
20144 if (depth == 14) {
20145 helper_double_22_recursive(buf + 0, 11);
20146 helper_double_22_recursive(buf + 2048, 11);
20147 helper_double_22_recursive(buf + 4096, 11);
20148 helper_double_22_recursive(buf + 6144, 11);
20149 helper_double_22_recursive(buf + 8192, 11);
20150 helper_double_22_recursive(buf + 10240, 11);
20151 helper_double_22_recursive(buf + 12288, 11);
20152 helper_double_22_recursive(buf + 14336, 11);
20153 for (int j = 0; j < 16384; j += 16384) {
20154 for (int k = 0; k < 2048; k += 2) {
20155 __asm__ volatile (
20156 "movupd (%0), %%xmm0\n"
20157 "movupd (%1), %%xmm1\n"
20158 "movupd (%2), %%xmm2\n"
20159 "movupd (%3), %%xmm3\n"
20160 "movupd (%4), %%xmm4\n"
20161 "movupd (%5), %%xmm5\n"
20162 "movupd (%6), %%xmm6\n"
20163 "movupd (%7), %%xmm7\n"
20164 "movapd %%xmm0, %%xmm8\n"
20165 "movapd %%xmm0, %%xmm9\n"
20166 "addpd %%xmm1, %%xmm8\n"
20167 "subpd %%xmm1, %%xmm9\n"
20168 "movapd %%xmm2, %%xmm10\n"
20169 "movapd %%xmm2, %%xmm11\n"
20170 "addpd %%xmm3, %%xmm10\n"
20171 "subpd %%xmm3, %%xmm11\n"
20172 "movapd %%xmm4, %%xmm12\n"
20173 "movapd %%xmm4, %%xmm13\n"
20174 "addpd %%xmm5, %%xmm12\n"
20175 "subpd %%xmm5, %%xmm13\n"
20176 "movapd %%xmm6, %%xmm14\n"
20177 "movapd %%xmm6, %%xmm15\n"
20178 "addpd %%xmm7, %%xmm14\n"
20179 "subpd %%xmm7, %%xmm15\n"
20180 "movapd %%xmm8, %%xmm0\n"
20181 "movapd %%xmm8, %%xmm2\n"
20182 "addpd %%xmm10, %%xmm0\n"
20183 "subpd %%xmm10, %%xmm2\n"
20184 "movapd %%xmm9, %%xmm1\n"
20185 "movapd %%xmm9, %%xmm3\n"
20186 "addpd %%xmm11, %%xmm1\n"
20187 "subpd %%xmm11, %%xmm3\n"
20188 "movapd %%xmm12, %%xmm4\n"
20189 "movapd %%xmm12, %%xmm6\n"
20190 "addpd %%xmm14, %%xmm4\n"
20191 "subpd %%xmm14, %%xmm6\n"
20192 "movapd %%xmm13, %%xmm5\n"
20193 "movapd %%xmm13, %%xmm7\n"
20194 "addpd %%xmm15, %%xmm5\n"
20195 "subpd %%xmm15, %%xmm7\n"
20196 "movapd %%xmm0, %%xmm8\n"
20197 "movapd %%xmm0, %%xmm12\n"
20198 "addpd %%xmm4, %%xmm8\n"
20199 "subpd %%xmm4, %%xmm12\n"
20200 "movapd %%xmm1, %%xmm9\n"
20201 "movapd %%xmm1, %%xmm13\n"
20202 "addpd %%xmm5, %%xmm9\n"
20203 "subpd %%xmm5, %%xmm13\n"
20204 "movapd %%xmm2, %%xmm10\n"
20205 "movapd %%xmm2, %%xmm14\n"
20206 "addpd %%xmm6, %%xmm10\n"
20207 "subpd %%xmm6, %%xmm14\n"
20208 "movapd %%xmm3, %%xmm11\n"
20209 "movapd %%xmm3, %%xmm15\n"
20210 "addpd %%xmm7, %%xmm11\n"
20211 "subpd %%xmm7, %%xmm15\n"
20212 "movupd %%xmm8, (%0)\n"
20213 "movupd %%xmm9, (%1)\n"
20214 "movupd %%xmm10, (%2)\n"
20215 "movupd %%xmm11, (%3)\n"
20216 "movupd %%xmm12, (%4)\n"
20217 "movupd %%xmm13, (%5)\n"
20218 "movupd %%xmm14, (%6)\n"
20219 "movupd %%xmm15, (%7)\n"
20220 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
20221 );
20222 }
20223 }
20224 return;
20225 }
20226 if (depth == 17) {
20227 helper_double_22_recursive(buf + 0, 14);
20228 helper_double_22_recursive(buf + 16384, 14);
20229 helper_double_22_recursive(buf + 32768, 14);
20230 helper_double_22_recursive(buf + 49152, 14);
20231 helper_double_22_recursive(buf + 65536, 14);
20232 helper_double_22_recursive(buf + 81920, 14);
20233 helper_double_22_recursive(buf + 98304, 14);
20234 helper_double_22_recursive(buf + 114688, 14);
20235 for (int j = 0; j < 131072; j += 131072) {
20236 for (int k = 0; k < 16384; k += 2) {
20237 __asm__ volatile (
20238 "movupd (%0), %%xmm0\n"
20239 "movupd (%1), %%xmm1\n"
20240 "movupd (%2), %%xmm2\n"
20241 "movupd (%3), %%xmm3\n"
20242 "movupd (%4), %%xmm4\n"
20243 "movupd (%5), %%xmm5\n"
20244 "movupd (%6), %%xmm6\n"
20245 "movupd (%7), %%xmm7\n"
20246 "movapd %%xmm0, %%xmm8\n"
20247 "movapd %%xmm0, %%xmm9\n"
20248 "addpd %%xmm1, %%xmm8\n"
20249 "subpd %%xmm1, %%xmm9\n"
20250 "movapd %%xmm2, %%xmm10\n"
20251 "movapd %%xmm2, %%xmm11\n"
20252 "addpd %%xmm3, %%xmm10\n"
20253 "subpd %%xmm3, %%xmm11\n"
20254 "movapd %%xmm4, %%xmm12\n"
20255 "movapd %%xmm4, %%xmm13\n"
20256 "addpd %%xmm5, %%xmm12\n"
20257 "subpd %%xmm5, %%xmm13\n"
20258 "movapd %%xmm6, %%xmm14\n"
20259 "movapd %%xmm6, %%xmm15\n"
20260 "addpd %%xmm7, %%xmm14\n"
20261 "subpd %%xmm7, %%xmm15\n"
20262 "movapd %%xmm8, %%xmm0\n"
20263 "movapd %%xmm8, %%xmm2\n"
20264 "addpd %%xmm10, %%xmm0\n"
20265 "subpd %%xmm10, %%xmm2\n"
20266 "movapd %%xmm9, %%xmm1\n"
20267 "movapd %%xmm9, %%xmm3\n"
20268 "addpd %%xmm11, %%xmm1\n"
20269 "subpd %%xmm11, %%xmm3\n"
20270 "movapd %%xmm12, %%xmm4\n"
20271 "movapd %%xmm12, %%xmm6\n"
20272 "addpd %%xmm14, %%xmm4\n"
20273 "subpd %%xmm14, %%xmm6\n"
20274 "movapd %%xmm13, %%xmm5\n"
20275 "movapd %%xmm13, %%xmm7\n"
20276 "addpd %%xmm15, %%xmm5\n"
20277 "subpd %%xmm15, %%xmm7\n"
20278 "movapd %%xmm0, %%xmm8\n"
20279 "movapd %%xmm0, %%xmm12\n"
20280 "addpd %%xmm4, %%xmm8\n"
20281 "subpd %%xmm4, %%xmm12\n"
20282 "movapd %%xmm1, %%xmm9\n"
20283 "movapd %%xmm1, %%xmm13\n"
20284 "addpd %%xmm5, %%xmm9\n"
20285 "subpd %%xmm5, %%xmm13\n"
20286 "movapd %%xmm2, %%xmm10\n"
20287 "movapd %%xmm2, %%xmm14\n"
20288 "addpd %%xmm6, %%xmm10\n"
20289 "subpd %%xmm6, %%xmm14\n"
20290 "movapd %%xmm3, %%xmm11\n"
20291 "movapd %%xmm3, %%xmm15\n"
20292 "addpd %%xmm7, %%xmm11\n"
20293 "subpd %%xmm7, %%xmm15\n"
20294 "movupd %%xmm8, (%0)\n"
20295 "movupd %%xmm9, (%1)\n"
20296 "movupd %%xmm10, (%2)\n"
20297 "movupd %%xmm11, (%3)\n"
20298 "movupd %%xmm12, (%4)\n"
20299 "movupd %%xmm13, (%5)\n"
20300 "movupd %%xmm14, (%6)\n"
20301 "movupd %%xmm15, (%7)\n"
20302 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
20303 );
20304 }
20305 }
20306 return;
20307 }
20308 if (depth == 20) {
20309 helper_double_22_recursive(buf + 0, 17);
20310 helper_double_22_recursive(buf + 131072, 17);
20311 helper_double_22_recursive(buf + 262144, 17);
20312 helper_double_22_recursive(buf + 393216, 17);
20313 helper_double_22_recursive(buf + 524288, 17);
20314 helper_double_22_recursive(buf + 655360, 17);
20315 helper_double_22_recursive(buf + 786432, 17);
20316 helper_double_22_recursive(buf + 917504, 17);
20317 for (int j = 0; j < 1048576; j += 1048576) {
20318 for (int k = 0; k < 131072; k += 2) {
20319 __asm__ volatile (
20320 "movupd (%0), %%xmm0\n"
20321 "movupd (%1), %%xmm1\n"
20322 "movupd (%2), %%xmm2\n"
20323 "movupd (%3), %%xmm3\n"
20324 "movupd (%4), %%xmm4\n"
20325 "movupd (%5), %%xmm5\n"
20326 "movupd (%6), %%xmm6\n"
20327 "movupd (%7), %%xmm7\n"
20328 "movapd %%xmm0, %%xmm8\n"
20329 "movapd %%xmm0, %%xmm9\n"
20330 "addpd %%xmm1, %%xmm8\n"
20331 "subpd %%xmm1, %%xmm9\n"
20332 "movapd %%xmm2, %%xmm10\n"
20333 "movapd %%xmm2, %%xmm11\n"
20334 "addpd %%xmm3, %%xmm10\n"
20335 "subpd %%xmm3, %%xmm11\n"
20336 "movapd %%xmm4, %%xmm12\n"
20337 "movapd %%xmm4, %%xmm13\n"
20338 "addpd %%xmm5, %%xmm12\n"
20339 "subpd %%xmm5, %%xmm13\n"
20340 "movapd %%xmm6, %%xmm14\n"
20341 "movapd %%xmm6, %%xmm15\n"
20342 "addpd %%xmm7, %%xmm14\n"
20343 "subpd %%xmm7, %%xmm15\n"
20344 "movapd %%xmm8, %%xmm0\n"
20345 "movapd %%xmm8, %%xmm2\n"
20346 "addpd %%xmm10, %%xmm0\n"
20347 "subpd %%xmm10, %%xmm2\n"
20348 "movapd %%xmm9, %%xmm1\n"
20349 "movapd %%xmm9, %%xmm3\n"
20350 "addpd %%xmm11, %%xmm1\n"
20351 "subpd %%xmm11, %%xmm3\n"
20352 "movapd %%xmm12, %%xmm4\n"
20353 "movapd %%xmm12, %%xmm6\n"
20354 "addpd %%xmm14, %%xmm4\n"
20355 "subpd %%xmm14, %%xmm6\n"
20356 "movapd %%xmm13, %%xmm5\n"
20357 "movapd %%xmm13, %%xmm7\n"
20358 "addpd %%xmm15, %%xmm5\n"
20359 "subpd %%xmm15, %%xmm7\n"
20360 "movapd %%xmm0, %%xmm8\n"
20361 "movapd %%xmm0, %%xmm12\n"
20362 "addpd %%xmm4, %%xmm8\n"
20363 "subpd %%xmm4, %%xmm12\n"
20364 "movapd %%xmm1, %%xmm9\n"
20365 "movapd %%xmm1, %%xmm13\n"
20366 "addpd %%xmm5, %%xmm9\n"
20367 "subpd %%xmm5, %%xmm13\n"
20368 "movapd %%xmm2, %%xmm10\n"
20369 "movapd %%xmm2, %%xmm14\n"
20370 "addpd %%xmm6, %%xmm10\n"
20371 "subpd %%xmm6, %%xmm14\n"
20372 "movapd %%xmm3, %%xmm11\n"
20373 "movapd %%xmm3, %%xmm15\n"
20374 "addpd %%xmm7, %%xmm11\n"
20375 "subpd %%xmm7, %%xmm15\n"
20376 "movupd %%xmm8, (%0)\n"
20377 "movupd %%xmm9, (%1)\n"
20378 "movupd %%xmm10, (%2)\n"
20379 "movupd %%xmm11, (%3)\n"
20380 "movupd %%xmm12, (%4)\n"
20381 "movupd %%xmm13, (%5)\n"
20382 "movupd %%xmm14, (%6)\n"
20383 "movupd %%xmm15, (%7)\n"
20384 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
20385 );
20386 }
20387 }
20388 return;
20389 }
20390 if (depth == 22) {
20391 helper_double_22_recursive(buf + 0, 20);
20392 helper_double_22_recursive(buf + 1048576, 20);
20393 helper_double_22_recursive(buf + 2097152, 20);
20394 helper_double_22_recursive(buf + 3145728, 20);
20395 for (int j = 0; j < 4194304; j += 4194304) {
20396 for (int k = 0; k < 1048576; k += 2) {
20397 __asm__ volatile (
20398 "movupd (%0), %%xmm0\n"
20399 "movupd (%1), %%xmm1\n"
20400 "movupd (%2), %%xmm2\n"
20401 "movupd (%3), %%xmm3\n"
20402 "movapd %%xmm0, %%xmm8\n"
20403 "movapd %%xmm0, %%xmm9\n"
20404 "addpd %%xmm1, %%xmm8\n"
20405 "subpd %%xmm1, %%xmm9\n"
20406 "movapd %%xmm2, %%xmm10\n"
20407 "movapd %%xmm2, %%xmm11\n"
20408 "addpd %%xmm3, %%xmm10\n"
20409 "subpd %%xmm3, %%xmm11\n"
20410 "movapd %%xmm8, %%xmm0\n"
20411 "movapd %%xmm8, %%xmm2\n"
20412 "addpd %%xmm10, %%xmm0\n"
20413 "subpd %%xmm10, %%xmm2\n"
20414 "movapd %%xmm9, %%xmm1\n"
20415 "movapd %%xmm9, %%xmm3\n"
20416 "addpd %%xmm11, %%xmm1\n"
20417 "subpd %%xmm11, %%xmm3\n"
20418 "movupd %%xmm0, (%0)\n"
20419 "movupd %%xmm1, (%1)\n"
20420 "movupd %%xmm2, (%2)\n"
20421 "movupd %%xmm3, (%3)\n"
20422 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
20423 );
20424 }
20425 }
20426 return;
20427 }
20428 }
void helper_double_22(double *buf);
/*
 * Public entry point: apply the in-place, unnormalized fast
 * Walsh-Hadamard transform of size 2^22 (4194304 doubles) to buf.
 * buf must hold at least 2^22 doubles; all work is delegated to the
 * recursive kernel.
 */
void helper_double_22(double *buf) {
    const int log2_len = 22;  /* log2 of the transform length */
    helper_double_22_recursive(buf, log2_len);
}
void helper_double_23_recursive(double *buf, int depth);
/*
 * Recursive kernel of the in-place, unnormalized fast Walsh-Hadamard
 * transform over 2^depth doubles, specialized for a top-level size of
 * 2^23.  The recursion proceeds in radix-8 steps, so only depths
 * 2, 5, 8, 11, 14, 17, 20 and 23 are reachable; any other depth falls
 * through and does nothing.
 *
 * Each non-base case first transforms the eight contiguous sub-blocks
 * of size 2^(depth-3), then combines them with a radix-8 butterfly:
 * three back-to-back radix-2 stages fused into one inline-asm block
 * that processes two doubles (one xmm register) per stream per
 * iteration.  Pointers are passed as input "r" operands; the stores
 * through them are covered by the "memory" clobber.
 */
void helper_double_23_recursive(double *buf, int depth) {
  if (depth == 2) {
    /* Base case: size-4 transform.  The single-iteration j/k loops are
       generator artifacts; the asm handles all 4 doubles in two xmm
       registers. */
    for (int j = 0; j < 4; j += 4) {
      for (int k = 0; k < 2; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          /* In-register size-2 transform of xmm0: haddpd forms the pair
             sum, hsubpd the pair difference, blendpd $1 merges sum into
             lane 0 and difference into lane 1. */
          "movapd %%xmm0, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm0\n"
          /* Same size-2 transform of xmm1. */
          "movapd %%xmm1, %%xmm8\n"
          "haddpd %%xmm8, %%xmm8\n"
          "movapd %%xmm1, %%xmm9\n"
          "hsubpd %%xmm9, %%xmm9\n"
          "blendpd $1, %%xmm8, %%xmm9\n"
          "movapd %%xmm9, %%xmm1\n"
          /* Final radix-2 stage across the two halves. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 5) {
    /* Eight size-4 sub-transforms, then a radix-8 combine at stride 4. */
    helper_double_23_recursive(buf + 0, 2);
    helper_double_23_recursive(buf + 4, 2);
    helper_double_23_recursive(buf + 8, 2);
    helper_double_23_recursive(buf + 12, 2);
    helper_double_23_recursive(buf + 16, 2);
    helper_double_23_recursive(buf + 20, 2);
    helper_double_23_recursive(buf + 24, 2);
    helper_double_23_recursive(buf + 28, 2);
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 4; k += 2) {
        __asm__ volatile (
          /* Load one 2-double vector from each of the 8 streams. */
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          /* Radix-2 stage 1: pairs (0,1) (2,3) (4,5) (6,7). */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Radix-2 stage 2: pairs at distance 2. */
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          /* Radix-2 stage 3: pairs at distance 4. */
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          /* Store results back to the same 8 stream positions. */
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* Eight size-32 sub-transforms, then the same radix-8 butterfly as
       the depth == 5 case at stride 32. */
    helper_double_23_recursive(buf + 0, 5);
    helper_double_23_recursive(buf + 32, 5);
    helper_double_23_recursive(buf + 64, 5);
    helper_double_23_recursive(buf + 96, 5);
    helper_double_23_recursive(buf + 128, 5);
    helper_double_23_recursive(buf + 160, 5);
    helper_double_23_recursive(buf + 192, 5);
    helper_double_23_recursive(buf + 224, 5);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 32; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 11) {
    /* Eight size-256 sub-transforms, then the radix-8 butterfly at
       stride 256. */
    helper_double_23_recursive(buf + 0, 8);
    helper_double_23_recursive(buf + 256, 8);
    helper_double_23_recursive(buf + 512, 8);
    helper_double_23_recursive(buf + 768, 8);
    helper_double_23_recursive(buf + 1024, 8);
    helper_double_23_recursive(buf + 1280, 8);
    helper_double_23_recursive(buf + 1536, 8);
    helper_double_23_recursive(buf + 1792, 8);
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Eight size-2048 sub-transforms, then the radix-8 butterfly at
       stride 2048. */
    helper_double_23_recursive(buf + 0, 11);
    helper_double_23_recursive(buf + 2048, 11);
    helper_double_23_recursive(buf + 4096, 11);
    helper_double_23_recursive(buf + 6144, 11);
    helper_double_23_recursive(buf + 8192, 11);
    helper_double_23_recursive(buf + 10240, 11);
    helper_double_23_recursive(buf + 12288, 11);
    helper_double_23_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    /* Eight size-16384 sub-transforms, then the radix-8 butterfly at
       stride 16384. */
    helper_double_23_recursive(buf + 0, 14);
    helper_double_23_recursive(buf + 16384, 14);
    helper_double_23_recursive(buf + 32768, 14);
    helper_double_23_recursive(buf + 49152, 14);
    helper_double_23_recursive(buf + 65536, 14);
    helper_double_23_recursive(buf + 81920, 14);
    helper_double_23_recursive(buf + 98304, 14);
    helper_double_23_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    /* Eight size-131072 sub-transforms, then the radix-8 butterfly at
       stride 131072. */
    helper_double_23_recursive(buf + 0, 17);
    helper_double_23_recursive(buf + 131072, 17);
    helper_double_23_recursive(buf + 262144, 17);
    helper_double_23_recursive(buf + 393216, 17);
    helper_double_23_recursive(buf + 524288, 17);
    helper_double_23_recursive(buf + 655360, 17);
    helper_double_23_recursive(buf + 786432, 17);
    helper_double_23_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    /* Top level: eight size-2^20 sub-transforms, then the radix-8
       butterfly at stride 1048576 covering all 2^23 elements. */
    helper_double_23_recursive(buf + 0, 20);
    helper_double_23_recursive(buf + 1048576, 20);
    helper_double_23_recursive(buf + 2097152, 20);
    helper_double_23_recursive(buf + 3145728, 20);
    helper_double_23_recursive(buf + 4194304, 20);
    helper_double_23_recursive(buf + 5242880, 20);
    helper_double_23_recursive(buf + 6291456, 20);
    helper_double_23_recursive(buf + 7340032, 20);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 1048576; k += 2) {
        __asm__ volatile (
          "movupd (%0), %%xmm0\n"
          "movupd (%1), %%xmm1\n"
          "movupd (%2), %%xmm2\n"
          "movupd (%3), %%xmm3\n"
          "movupd (%4), %%xmm4\n"
          "movupd (%5), %%xmm5\n"
          "movupd (%6), %%xmm6\n"
          "movupd (%7), %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm9\n"
          "addpd %%xmm1, %%xmm8\n"
          "subpd %%xmm1, %%xmm9\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm11\n"
          "addpd %%xmm3, %%xmm10\n"
          "subpd %%xmm3, %%xmm11\n"
          "movapd %%xmm4, %%xmm12\n"
          "movapd %%xmm4, %%xmm13\n"
          "addpd %%xmm5, %%xmm12\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm6, %%xmm14\n"
          "movapd %%xmm6, %%xmm15\n"
          "addpd %%xmm7, %%xmm14\n"
          "subpd %%xmm7, %%xmm15\n"
          "movapd %%xmm8, %%xmm0\n"
          "movapd %%xmm8, %%xmm2\n"
          "addpd %%xmm10, %%xmm0\n"
          "subpd %%xmm10, %%xmm2\n"
          "movapd %%xmm9, %%xmm1\n"
          "movapd %%xmm9, %%xmm3\n"
          "addpd %%xmm11, %%xmm1\n"
          "subpd %%xmm11, %%xmm3\n"
          "movapd %%xmm12, %%xmm4\n"
          "movapd %%xmm12, %%xmm6\n"
          "addpd %%xmm14, %%xmm4\n"
          "subpd %%xmm14, %%xmm6\n"
          "movapd %%xmm13, %%xmm5\n"
          "movapd %%xmm13, %%xmm7\n"
          "addpd %%xmm15, %%xmm5\n"
          "subpd %%xmm15, %%xmm7\n"
          "movapd %%xmm0, %%xmm8\n"
          "movapd %%xmm0, %%xmm12\n"
          "addpd %%xmm4, %%xmm8\n"
          "subpd %%xmm4, %%xmm12\n"
          "movapd %%xmm1, %%xmm9\n"
          "movapd %%xmm1, %%xmm13\n"
          "addpd %%xmm5, %%xmm9\n"
          "subpd %%xmm5, %%xmm13\n"
          "movapd %%xmm2, %%xmm10\n"
          "movapd %%xmm2, %%xmm14\n"
          "addpd %%xmm6, %%xmm10\n"
          "subpd %%xmm6, %%xmm14\n"
          "movapd %%xmm3, %%xmm11\n"
          "movapd %%xmm3, %%xmm15\n"
          "addpd %%xmm7, %%xmm11\n"
          "subpd %%xmm7, %%xmm15\n"
          "movupd %%xmm8, (%0)\n"
          "movupd %%xmm9, (%1)\n"
          "movupd %%xmm10, (%2)\n"
          "movupd %%xmm11, (%3)\n"
          "movupd %%xmm12, (%4)\n"
          "movupd %%xmm13, (%5)\n"
          "movupd %%xmm14, (%6)\n"
          "movupd %%xmm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_23(double *buf);
/*
 * Public entry point: in-place transform of a 2^23-element double buffer.
 * Simply hands the whole buffer to the recursive driver at full depth.
 */
void helper_double_23(double *buf) {
    enum { LOG2_N = 23 }; /* buffer holds 2^23 doubles */
    helper_double_23_recursive(buf, LOG2_N);
}
void helper_double_24_recursive(double *buf, int depth);
/*
 * Recursive driver for an in-place 2^24-point transform over doubles,
 * built from add/sub "butterfly" passes implemented in SSE inline asm.
 *
 * `depth` is the log2 size of the sub-block rooted at `buf` that still
 * needs processing.  The function either handles a 2^13 block directly
 * (depth == 13) or splits the block into eight (four at the very top)
 * equal sub-blocks, recurses on each, and then merges them with one
 * radix-8 (radix-4) add/sub combine pass.
 *
 * NOTE(review): machine-generated code.  Only depths 13/16/19/22/24 are
 * handled; any other depth value falls through and does nothing.
 * All asm blocks read and write `buf` through pointer operands and
 * declare xmm0-xmm15 plus "memory" as clobbered.
 */
void helper_double_24_recursive(double *buf, int depth) {
    if (depth == 13) {
        /* Base case: each 8192-double block is transformed in four passes.
           Pass 1 (stride-2 neighborhoods of 16 elements): the
           haddpd/hsubpd/blendpd prologue performs the adjacent-pair
           (sum, difference) butterfly inside each 2-element vector, then
           the add/sub cascade combines the 8 vectors at stride 2. */
        for (int j = 0; j < 8192; j += 16) {
            for (int k = 0; k < 2; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    /* Pairwise butterfly within each 2-wide vector:
                       lane0 = a+b (haddpd), lane1 = a-b (hsubpd gives b-a... the
                       blend keeps haddpd's low lane and hsubpd's high lane). */
                    "movapd %%xmm0, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm0\n"
                    "movapd %%xmm1, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm2, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm2, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm2\n"
                    "movapd %%xmm3, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm3, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "movapd %%xmm4, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm4, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm4\n"
                    "movapd %%xmm5, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm5, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm5\n"
                    "movapd %%xmm6, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm6, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm6\n"
                    "movapd %%xmm7, %%xmm8\n"
                    "haddpd %%xmm8, %%xmm8\n"
                    "movapd %%xmm7, %%xmm9\n"
                    "hsubpd %%xmm9, %%xmm9\n"
                    "blendpd $1, %%xmm8, %%xmm9\n"
                    "movapd %%xmm9, %%xmm7\n"
                    /* Radix-8 combine: three rounds of (sum, difference)
                       over register pairs; results end in xmm8-xmm15. */
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 2: radix-8 add/sub combine at stride 16 within each
           128-element sub-block. */
        for (int j = 0; j < 8192; j += 128) {
            for (int k = 0; k < 16; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 3: same combine at stride 128 within each 1024-element
           sub-block. */
        for (int j = 0; j < 8192; j += 1024) {
            for (int k = 0; k < 128; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        /* Pass 4: final combine at stride 1024 across the whole 8192-element
           block. */
        for (int j = 0; j < 8192; j += 8192) {
            for (int k = 0; k < 1024; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 16) {
        /* Split the 2^16 block into eight 2^13 sub-blocks, transform each,
           then merge with one radix-8 combine pass at stride 8192. */
        helper_double_24_recursive(buf + 0, 13);
        helper_double_24_recursive(buf + 8192, 13);
        helper_double_24_recursive(buf + 16384, 13);
        helper_double_24_recursive(buf + 24576, 13);
        helper_double_24_recursive(buf + 32768, 13);
        helper_double_24_recursive(buf + 40960, 13);
        helper_double_24_recursive(buf + 49152, 13);
        helper_double_24_recursive(buf + 57344, 13);
        for (int j = 0; j < 65536; j += 65536) {
            for (int k = 0; k < 8192; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 19) {
        /* Eight 2^16 sub-blocks, then a radix-8 combine at stride 65536. */
        helper_double_24_recursive(buf + 0, 16);
        helper_double_24_recursive(buf + 65536, 16);
        helper_double_24_recursive(buf + 131072, 16);
        helper_double_24_recursive(buf + 196608, 16);
        helper_double_24_recursive(buf + 262144, 16);
        helper_double_24_recursive(buf + 327680, 16);
        helper_double_24_recursive(buf + 393216, 16);
        helper_double_24_recursive(buf + 458752, 16);
        for (int j = 0; j < 524288; j += 524288) {
            for (int k = 0; k < 65536; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 22) {
        /* Eight 2^19 sub-blocks, then a radix-8 combine at stride 524288. */
        helper_double_24_recursive(buf + 0, 19);
        helper_double_24_recursive(buf + 524288, 19);
        helper_double_24_recursive(buf + 1048576, 19);
        helper_double_24_recursive(buf + 1572864, 19);
        helper_double_24_recursive(buf + 2097152, 19);
        helper_double_24_recursive(buf + 2621440, 19);
        helper_double_24_recursive(buf + 3145728, 19);
        helper_double_24_recursive(buf + 3670016, 19);
        for (int j = 0; j < 4194304; j += 4194304) {
            for (int k = 0; k < 524288; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movupd (%4), %%xmm4\n"
                    "movupd (%5), %%xmm5\n"
                    "movupd (%6), %%xmm6\n"
                    "movupd (%7), %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm4, %%xmm12\n"
                    "movapd %%xmm4, %%xmm13\n"
                    "addpd %%xmm5, %%xmm12\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm6, %%xmm14\n"
                    "movapd %%xmm6, %%xmm15\n"
                    "addpd %%xmm7, %%xmm14\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movapd %%xmm12, %%xmm4\n"
                    "movapd %%xmm12, %%xmm6\n"
                    "addpd %%xmm14, %%xmm4\n"
                    "subpd %%xmm14, %%xmm6\n"
                    "movapd %%xmm13, %%xmm5\n"
                    "movapd %%xmm13, %%xmm7\n"
                    "addpd %%xmm15, %%xmm5\n"
                    "subpd %%xmm15, %%xmm7\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm12\n"
                    "addpd %%xmm4, %%xmm8\n"
                    "subpd %%xmm4, %%xmm12\n"
                    "movapd %%xmm1, %%xmm9\n"
                    "movapd %%xmm1, %%xmm13\n"
                    "addpd %%xmm5, %%xmm9\n"
                    "subpd %%xmm5, %%xmm13\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm14\n"
                    "addpd %%xmm6, %%xmm10\n"
                    "subpd %%xmm6, %%xmm14\n"
                    "movapd %%xmm3, %%xmm11\n"
                    "movapd %%xmm3, %%xmm15\n"
                    "addpd %%xmm7, %%xmm11\n"
                    "subpd %%xmm7, %%xmm15\n"
                    "movupd %%xmm8, (%0)\n"
                    "movupd %%xmm9, (%1)\n"
                    "movupd %%xmm10, (%2)\n"
                    "movupd %%xmm11, (%3)\n"
                    "movupd %%xmm12, (%4)\n"
                    "movupd %%xmm13, (%5)\n"
                    "movupd %%xmm14, (%6)\n"
                    "movupd %%xmm15, (%7)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
    if (depth == 24) {
        /* Top level: 24 = 22 + 2 remaining levels, so only FOUR sub-blocks
           are merged here with a radix-4 combine at stride 4194304 (two
           add/sub rounds on xmm0-xmm3 instead of three on eight regs). */
        helper_double_24_recursive(buf + 0, 22);
        helper_double_24_recursive(buf + 4194304, 22);
        helper_double_24_recursive(buf + 8388608, 22);
        helper_double_24_recursive(buf + 12582912, 22);
        for (int j = 0; j < 16777216; j += 16777216) {
            for (int k = 0; k < 4194304; k += 2) {
                __asm__ volatile (
                    "movupd (%0), %%xmm0\n"
                    "movupd (%1), %%xmm1\n"
                    "movupd (%2), %%xmm2\n"
                    "movupd (%3), %%xmm3\n"
                    "movapd %%xmm0, %%xmm8\n"
                    "movapd %%xmm0, %%xmm9\n"
                    "addpd %%xmm1, %%xmm8\n"
                    "subpd %%xmm1, %%xmm9\n"
                    "movapd %%xmm2, %%xmm10\n"
                    "movapd %%xmm2, %%xmm11\n"
                    "addpd %%xmm3, %%xmm10\n"
                    "subpd %%xmm3, %%xmm11\n"
                    "movapd %%xmm8, %%xmm0\n"
                    "movapd %%xmm8, %%xmm2\n"
                    "addpd %%xmm10, %%xmm0\n"
                    "subpd %%xmm10, %%xmm2\n"
                    "movapd %%xmm9, %%xmm1\n"
                    "movapd %%xmm9, %%xmm3\n"
                    "addpd %%xmm11, %%xmm1\n"
                    "subpd %%xmm11, %%xmm3\n"
                    "movupd %%xmm0, (%0)\n"
                    "movupd %%xmm1, (%1)\n"
                    "movupd %%xmm2, (%2)\n"
                    "movupd %%xmm3, (%3)\n"
                    :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
                );
            }
        }
        return;
    }
}
void helper_double_24(double *buf);
/*
 * Public entry point: in-place transform of a 2^24-element double buffer.
 * Delegates to the recursive driver starting at the full depth.
 */
void helper_double_24(double *buf) {
    enum { LOG2_N = 24 }; /* buffer holds 2^24 doubles */
    helper_double_24_recursive(buf, LOG2_N);
}
21670 void helper_double_25_recursive(double *buf, int depth);
helper_double_25_recursive(double * buf,int depth)21671 void helper_double_25_recursive(double *buf, int depth) {
21672 if (depth == 10) {
21673 for (int j = 0; j < 1024; j += 16) {
21674 for (int k = 0; k < 2; k += 2) {
21675 __asm__ volatile (
21676 "movupd (%0), %%xmm0\n"
21677 "movupd (%1), %%xmm1\n"
21678 "movupd (%2), %%xmm2\n"
21679 "movupd (%3), %%xmm3\n"
21680 "movupd (%4), %%xmm4\n"
21681 "movupd (%5), %%xmm5\n"
21682 "movupd (%6), %%xmm6\n"
21683 "movupd (%7), %%xmm7\n"
21684 "movapd %%xmm0, %%xmm8\n"
21685 "haddpd %%xmm8, %%xmm8\n"
21686 "movapd %%xmm0, %%xmm9\n"
21687 "hsubpd %%xmm9, %%xmm9\n"
21688 "blendpd $1, %%xmm8, %%xmm9\n"
21689 "movapd %%xmm9, %%xmm0\n"
21690 "movapd %%xmm1, %%xmm8\n"
21691 "haddpd %%xmm8, %%xmm8\n"
21692 "movapd %%xmm1, %%xmm9\n"
21693 "hsubpd %%xmm9, %%xmm9\n"
21694 "blendpd $1, %%xmm8, %%xmm9\n"
21695 "movapd %%xmm9, %%xmm1\n"
21696 "movapd %%xmm2, %%xmm8\n"
21697 "haddpd %%xmm8, %%xmm8\n"
21698 "movapd %%xmm2, %%xmm9\n"
21699 "hsubpd %%xmm9, %%xmm9\n"
21700 "blendpd $1, %%xmm8, %%xmm9\n"
21701 "movapd %%xmm9, %%xmm2\n"
21702 "movapd %%xmm3, %%xmm8\n"
21703 "haddpd %%xmm8, %%xmm8\n"
21704 "movapd %%xmm3, %%xmm9\n"
21705 "hsubpd %%xmm9, %%xmm9\n"
21706 "blendpd $1, %%xmm8, %%xmm9\n"
21707 "movapd %%xmm9, %%xmm3\n"
21708 "movapd %%xmm4, %%xmm8\n"
21709 "haddpd %%xmm8, %%xmm8\n"
21710 "movapd %%xmm4, %%xmm9\n"
21711 "hsubpd %%xmm9, %%xmm9\n"
21712 "blendpd $1, %%xmm8, %%xmm9\n"
21713 "movapd %%xmm9, %%xmm4\n"
21714 "movapd %%xmm5, %%xmm8\n"
21715 "haddpd %%xmm8, %%xmm8\n"
21716 "movapd %%xmm5, %%xmm9\n"
21717 "hsubpd %%xmm9, %%xmm9\n"
21718 "blendpd $1, %%xmm8, %%xmm9\n"
21719 "movapd %%xmm9, %%xmm5\n"
21720 "movapd %%xmm6, %%xmm8\n"
21721 "haddpd %%xmm8, %%xmm8\n"
21722 "movapd %%xmm6, %%xmm9\n"
21723 "hsubpd %%xmm9, %%xmm9\n"
21724 "blendpd $1, %%xmm8, %%xmm9\n"
21725 "movapd %%xmm9, %%xmm6\n"
21726 "movapd %%xmm7, %%xmm8\n"
21727 "haddpd %%xmm8, %%xmm8\n"
21728 "movapd %%xmm7, %%xmm9\n"
21729 "hsubpd %%xmm9, %%xmm9\n"
21730 "blendpd $1, %%xmm8, %%xmm9\n"
21731 "movapd %%xmm9, %%xmm7\n"
21732 "movapd %%xmm0, %%xmm8\n"
21733 "movapd %%xmm0, %%xmm9\n"
21734 "addpd %%xmm1, %%xmm8\n"
21735 "subpd %%xmm1, %%xmm9\n"
21736 "movapd %%xmm2, %%xmm10\n"
21737 "movapd %%xmm2, %%xmm11\n"
21738 "addpd %%xmm3, %%xmm10\n"
21739 "subpd %%xmm3, %%xmm11\n"
21740 "movapd %%xmm4, %%xmm12\n"
21741 "movapd %%xmm4, %%xmm13\n"
21742 "addpd %%xmm5, %%xmm12\n"
21743 "subpd %%xmm5, %%xmm13\n"
21744 "movapd %%xmm6, %%xmm14\n"
21745 "movapd %%xmm6, %%xmm15\n"
21746 "addpd %%xmm7, %%xmm14\n"
21747 "subpd %%xmm7, %%xmm15\n"
21748 "movapd %%xmm8, %%xmm0\n"
21749 "movapd %%xmm8, %%xmm2\n"
21750 "addpd %%xmm10, %%xmm0\n"
21751 "subpd %%xmm10, %%xmm2\n"
21752 "movapd %%xmm9, %%xmm1\n"
21753 "movapd %%xmm9, %%xmm3\n"
21754 "addpd %%xmm11, %%xmm1\n"
21755 "subpd %%xmm11, %%xmm3\n"
21756 "movapd %%xmm12, %%xmm4\n"
21757 "movapd %%xmm12, %%xmm6\n"
21758 "addpd %%xmm14, %%xmm4\n"
21759 "subpd %%xmm14, %%xmm6\n"
21760 "movapd %%xmm13, %%xmm5\n"
21761 "movapd %%xmm13, %%xmm7\n"
21762 "addpd %%xmm15, %%xmm5\n"
21763 "subpd %%xmm15, %%xmm7\n"
21764 "movapd %%xmm0, %%xmm8\n"
21765 "movapd %%xmm0, %%xmm12\n"
21766 "addpd %%xmm4, %%xmm8\n"
21767 "subpd %%xmm4, %%xmm12\n"
21768 "movapd %%xmm1, %%xmm9\n"
21769 "movapd %%xmm1, %%xmm13\n"
21770 "addpd %%xmm5, %%xmm9\n"
21771 "subpd %%xmm5, %%xmm13\n"
21772 "movapd %%xmm2, %%xmm10\n"
21773 "movapd %%xmm2, %%xmm14\n"
21774 "addpd %%xmm6, %%xmm10\n"
21775 "subpd %%xmm6, %%xmm14\n"
21776 "movapd %%xmm3, %%xmm11\n"
21777 "movapd %%xmm3, %%xmm15\n"
21778 "addpd %%xmm7, %%xmm11\n"
21779 "subpd %%xmm7, %%xmm15\n"
21780 "movupd %%xmm8, (%0)\n"
21781 "movupd %%xmm9, (%1)\n"
21782 "movupd %%xmm10, (%2)\n"
21783 "movupd %%xmm11, (%3)\n"
21784 "movupd %%xmm12, (%4)\n"
21785 "movupd %%xmm13, (%5)\n"
21786 "movupd %%xmm14, (%6)\n"
21787 "movupd %%xmm15, (%7)\n"
21788 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21789 );
21790 }
21791 }
21792 for (int j = 0; j < 1024; j += 128) {
21793 for (int k = 0; k < 16; k += 2) {
21794 __asm__ volatile (
21795 "movupd (%0), %%xmm0\n"
21796 "movupd (%1), %%xmm1\n"
21797 "movupd (%2), %%xmm2\n"
21798 "movupd (%3), %%xmm3\n"
21799 "movupd (%4), %%xmm4\n"
21800 "movupd (%5), %%xmm5\n"
21801 "movupd (%6), %%xmm6\n"
21802 "movupd (%7), %%xmm7\n"
21803 "movapd %%xmm0, %%xmm8\n"
21804 "movapd %%xmm0, %%xmm9\n"
21805 "addpd %%xmm1, %%xmm8\n"
21806 "subpd %%xmm1, %%xmm9\n"
21807 "movapd %%xmm2, %%xmm10\n"
21808 "movapd %%xmm2, %%xmm11\n"
21809 "addpd %%xmm3, %%xmm10\n"
21810 "subpd %%xmm3, %%xmm11\n"
21811 "movapd %%xmm4, %%xmm12\n"
21812 "movapd %%xmm4, %%xmm13\n"
21813 "addpd %%xmm5, %%xmm12\n"
21814 "subpd %%xmm5, %%xmm13\n"
21815 "movapd %%xmm6, %%xmm14\n"
21816 "movapd %%xmm6, %%xmm15\n"
21817 "addpd %%xmm7, %%xmm14\n"
21818 "subpd %%xmm7, %%xmm15\n"
21819 "movapd %%xmm8, %%xmm0\n"
21820 "movapd %%xmm8, %%xmm2\n"
21821 "addpd %%xmm10, %%xmm0\n"
21822 "subpd %%xmm10, %%xmm2\n"
21823 "movapd %%xmm9, %%xmm1\n"
21824 "movapd %%xmm9, %%xmm3\n"
21825 "addpd %%xmm11, %%xmm1\n"
21826 "subpd %%xmm11, %%xmm3\n"
21827 "movapd %%xmm12, %%xmm4\n"
21828 "movapd %%xmm12, %%xmm6\n"
21829 "addpd %%xmm14, %%xmm4\n"
21830 "subpd %%xmm14, %%xmm6\n"
21831 "movapd %%xmm13, %%xmm5\n"
21832 "movapd %%xmm13, %%xmm7\n"
21833 "addpd %%xmm15, %%xmm5\n"
21834 "subpd %%xmm15, %%xmm7\n"
21835 "movapd %%xmm0, %%xmm8\n"
21836 "movapd %%xmm0, %%xmm12\n"
21837 "addpd %%xmm4, %%xmm8\n"
21838 "subpd %%xmm4, %%xmm12\n"
21839 "movapd %%xmm1, %%xmm9\n"
21840 "movapd %%xmm1, %%xmm13\n"
21841 "addpd %%xmm5, %%xmm9\n"
21842 "subpd %%xmm5, %%xmm13\n"
21843 "movapd %%xmm2, %%xmm10\n"
21844 "movapd %%xmm2, %%xmm14\n"
21845 "addpd %%xmm6, %%xmm10\n"
21846 "subpd %%xmm6, %%xmm14\n"
21847 "movapd %%xmm3, %%xmm11\n"
21848 "movapd %%xmm3, %%xmm15\n"
21849 "addpd %%xmm7, %%xmm11\n"
21850 "subpd %%xmm7, %%xmm15\n"
21851 "movupd %%xmm8, (%0)\n"
21852 "movupd %%xmm9, (%1)\n"
21853 "movupd %%xmm10, (%2)\n"
21854 "movupd %%xmm11, (%3)\n"
21855 "movupd %%xmm12, (%4)\n"
21856 "movupd %%xmm13, (%5)\n"
21857 "movupd %%xmm14, (%6)\n"
21858 "movupd %%xmm15, (%7)\n"
21859 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48), "r"(buf + j + k + 64), "r"(buf + j + k + 80), "r"(buf + j + k + 96), "r"(buf + j + k + 112) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21860 );
21861 }
21862 }
21863 for (int j = 0; j < 1024; j += 1024) {
21864 for (int k = 0; k < 128; k += 2) {
21865 __asm__ volatile (
21866 "movupd (%0), %%xmm0\n"
21867 "movupd (%1), %%xmm1\n"
21868 "movupd (%2), %%xmm2\n"
21869 "movupd (%3), %%xmm3\n"
21870 "movupd (%4), %%xmm4\n"
21871 "movupd (%5), %%xmm5\n"
21872 "movupd (%6), %%xmm6\n"
21873 "movupd (%7), %%xmm7\n"
21874 "movapd %%xmm0, %%xmm8\n"
21875 "movapd %%xmm0, %%xmm9\n"
21876 "addpd %%xmm1, %%xmm8\n"
21877 "subpd %%xmm1, %%xmm9\n"
21878 "movapd %%xmm2, %%xmm10\n"
21879 "movapd %%xmm2, %%xmm11\n"
21880 "addpd %%xmm3, %%xmm10\n"
21881 "subpd %%xmm3, %%xmm11\n"
21882 "movapd %%xmm4, %%xmm12\n"
21883 "movapd %%xmm4, %%xmm13\n"
21884 "addpd %%xmm5, %%xmm12\n"
21885 "subpd %%xmm5, %%xmm13\n"
21886 "movapd %%xmm6, %%xmm14\n"
21887 "movapd %%xmm6, %%xmm15\n"
21888 "addpd %%xmm7, %%xmm14\n"
21889 "subpd %%xmm7, %%xmm15\n"
21890 "movapd %%xmm8, %%xmm0\n"
21891 "movapd %%xmm8, %%xmm2\n"
21892 "addpd %%xmm10, %%xmm0\n"
21893 "subpd %%xmm10, %%xmm2\n"
21894 "movapd %%xmm9, %%xmm1\n"
21895 "movapd %%xmm9, %%xmm3\n"
21896 "addpd %%xmm11, %%xmm1\n"
21897 "subpd %%xmm11, %%xmm3\n"
21898 "movapd %%xmm12, %%xmm4\n"
21899 "movapd %%xmm12, %%xmm6\n"
21900 "addpd %%xmm14, %%xmm4\n"
21901 "subpd %%xmm14, %%xmm6\n"
21902 "movapd %%xmm13, %%xmm5\n"
21903 "movapd %%xmm13, %%xmm7\n"
21904 "addpd %%xmm15, %%xmm5\n"
21905 "subpd %%xmm15, %%xmm7\n"
21906 "movapd %%xmm0, %%xmm8\n"
21907 "movapd %%xmm0, %%xmm12\n"
21908 "addpd %%xmm4, %%xmm8\n"
21909 "subpd %%xmm4, %%xmm12\n"
21910 "movapd %%xmm1, %%xmm9\n"
21911 "movapd %%xmm1, %%xmm13\n"
21912 "addpd %%xmm5, %%xmm9\n"
21913 "subpd %%xmm5, %%xmm13\n"
21914 "movapd %%xmm2, %%xmm10\n"
21915 "movapd %%xmm2, %%xmm14\n"
21916 "addpd %%xmm6, %%xmm10\n"
21917 "subpd %%xmm6, %%xmm14\n"
21918 "movapd %%xmm3, %%xmm11\n"
21919 "movapd %%xmm3, %%xmm15\n"
21920 "addpd %%xmm7, %%xmm11\n"
21921 "subpd %%xmm7, %%xmm15\n"
21922 "movupd %%xmm8, (%0)\n"
21923 "movupd %%xmm9, (%1)\n"
21924 "movupd %%xmm10, (%2)\n"
21925 "movupd %%xmm11, (%3)\n"
21926 "movupd %%xmm12, (%4)\n"
21927 "movupd %%xmm13, (%5)\n"
21928 "movupd %%xmm14, (%6)\n"
21929 "movupd %%xmm15, (%7)\n"
21930 :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
21931 );
21932 }
21933 }
21934 return;
21935 }
21936 if (depth == 13) {
21937 helper_double_25_recursive(buf + 0, 10);
21938 helper_double_25_recursive(buf + 1024, 10);
21939 helper_double_25_recursive(buf + 2048, 10);
21940 helper_double_25_recursive(buf + 3072, 10);
21941 helper_double_25_recursive(buf + 4096, 10);
21942 helper_double_25_recursive(buf + 5120, 10);
21943 helper_double_25_recursive(buf + 6144, 10);
21944 helper_double_25_recursive(buf + 7168, 10);
21945 for (int j = 0; j < 8192; j += 8192) {
21946 for (int k = 0; k < 1024; k += 2) {
21947 __asm__ volatile (
21948 "movupd (%0), %%xmm0\n"
21949 "movupd (%1), %%xmm1\n"
21950 "movupd (%2), %%xmm2\n"
21951 "movupd (%3), %%xmm3\n"
21952 "movupd (%4), %%xmm4\n"
21953 "movupd (%5), %%xmm5\n"
21954 "movupd (%6), %%xmm6\n"
21955 "movupd (%7), %%xmm7\n"
21956 "movapd %%xmm0, %%xmm8\n"
21957 "movapd %%xmm0, %%xmm9\n"
21958 "addpd %%xmm1, %%xmm8\n"
21959 "subpd %%xmm1, %%xmm9\n"
21960 "movapd %%xmm2, %%xmm10\n"
21961 "movapd %%xmm2, %%xmm11\n"
21962 "addpd %%xmm3, %%xmm10\n"
21963 "subpd %%xmm3, %%xmm11\n"
21964 "movapd %%xmm4, %%xmm12\n"
21965 "movapd %%xmm4, %%xmm13\n"
21966 "addpd %%xmm5, %%xmm12\n"
21967 "subpd %%xmm5, %%xmm13\n"
21968 "movapd %%xmm6, %%xmm14\n"
21969 "movapd %%xmm6, %%xmm15\n"
21970 "addpd %%xmm7, %%xmm14\n"
21971 "subpd %%xmm7, %%xmm15\n"
21972 "movapd %%xmm8, %%xmm0\n"
21973 "movapd %%xmm8, %%xmm2\n"
21974 "addpd %%xmm10, %%xmm0\n"
21975 "subpd %%xmm10, %%xmm2\n"
21976 "movapd %%xmm9, %%xmm1\n"
21977 "movapd %%xmm9, %%xmm3\n"
21978 "addpd %%xmm11, %%xmm1\n"
21979 "subpd %%xmm11, %%xmm3\n"
21980 "movapd %%xmm12, %%xmm4\n"
21981 "movapd %%xmm12, %%xmm6\n"
21982 "addpd %%xmm14, %%xmm4\n"
21983 "subpd %%xmm14, %%xmm6\n"
21984 "movapd %%xmm13, %%xmm5\n"
21985 "movapd %%xmm13, %%xmm7\n"
21986 "addpd %%xmm15, %%xmm5\n"
21987 "subpd %%xmm15, %%xmm7\n"
21988 "movapd %%xmm0, %%xmm8\n"
21989 "movapd %%xmm0, %%xmm12\n"
21990 "addpd %%xmm4, %%xmm8\n"
21991 "subpd %%xmm4, %%xmm12\n"
21992 "movapd %%xmm1, %%xmm9\n"
21993 "movapd %%xmm1, %%xmm13\n"
21994 "addpd %%xmm5, %%xmm9\n"
21995 "subpd %%xmm5, %%xmm13\n"
21996 "movapd %%xmm2, %%xmm10\n"
21997 "movapd %%xmm2, %%xmm14\n"
21998 "addpd %%xmm6, %%xmm10\n"
21999 "subpd %%xmm6, %%xmm14\n"
22000 "movapd %%xmm3, %%xmm11\n"
22001 "movapd %%xmm3, %%xmm15\n"
22002 "addpd %%xmm7, %%xmm11\n"
22003 "subpd %%xmm7, %%xmm15\n"
22004 "movupd %%xmm8, (%0)\n"
22005 "movupd %%xmm9, (%1)\n"
22006 "movupd %%xmm10, (%2)\n"
22007 "movupd %%xmm11, (%3)\n"
22008 "movupd %%xmm12, (%4)\n"
22009 "movupd %%xmm13, (%5)\n"
22010 "movupd %%xmm14, (%6)\n"
22011 "movupd %%xmm15, (%7)\n"
22012 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22013 );
22014 }
22015 }
22016 return;
22017 }
22018 if (depth == 16) {
22019 helper_double_25_recursive(buf + 0, 13);
22020 helper_double_25_recursive(buf + 8192, 13);
22021 helper_double_25_recursive(buf + 16384, 13);
22022 helper_double_25_recursive(buf + 24576, 13);
22023 helper_double_25_recursive(buf + 32768, 13);
22024 helper_double_25_recursive(buf + 40960, 13);
22025 helper_double_25_recursive(buf + 49152, 13);
22026 helper_double_25_recursive(buf + 57344, 13);
22027 for (int j = 0; j < 65536; j += 65536) {
22028 for (int k = 0; k < 8192; k += 2) {
22029 __asm__ volatile (
22030 "movupd (%0), %%xmm0\n"
22031 "movupd (%1), %%xmm1\n"
22032 "movupd (%2), %%xmm2\n"
22033 "movupd (%3), %%xmm3\n"
22034 "movupd (%4), %%xmm4\n"
22035 "movupd (%5), %%xmm5\n"
22036 "movupd (%6), %%xmm6\n"
22037 "movupd (%7), %%xmm7\n"
22038 "movapd %%xmm0, %%xmm8\n"
22039 "movapd %%xmm0, %%xmm9\n"
22040 "addpd %%xmm1, %%xmm8\n"
22041 "subpd %%xmm1, %%xmm9\n"
22042 "movapd %%xmm2, %%xmm10\n"
22043 "movapd %%xmm2, %%xmm11\n"
22044 "addpd %%xmm3, %%xmm10\n"
22045 "subpd %%xmm3, %%xmm11\n"
22046 "movapd %%xmm4, %%xmm12\n"
22047 "movapd %%xmm4, %%xmm13\n"
22048 "addpd %%xmm5, %%xmm12\n"
22049 "subpd %%xmm5, %%xmm13\n"
22050 "movapd %%xmm6, %%xmm14\n"
22051 "movapd %%xmm6, %%xmm15\n"
22052 "addpd %%xmm7, %%xmm14\n"
22053 "subpd %%xmm7, %%xmm15\n"
22054 "movapd %%xmm8, %%xmm0\n"
22055 "movapd %%xmm8, %%xmm2\n"
22056 "addpd %%xmm10, %%xmm0\n"
22057 "subpd %%xmm10, %%xmm2\n"
22058 "movapd %%xmm9, %%xmm1\n"
22059 "movapd %%xmm9, %%xmm3\n"
22060 "addpd %%xmm11, %%xmm1\n"
22061 "subpd %%xmm11, %%xmm3\n"
22062 "movapd %%xmm12, %%xmm4\n"
22063 "movapd %%xmm12, %%xmm6\n"
22064 "addpd %%xmm14, %%xmm4\n"
22065 "subpd %%xmm14, %%xmm6\n"
22066 "movapd %%xmm13, %%xmm5\n"
22067 "movapd %%xmm13, %%xmm7\n"
22068 "addpd %%xmm15, %%xmm5\n"
22069 "subpd %%xmm15, %%xmm7\n"
22070 "movapd %%xmm0, %%xmm8\n"
22071 "movapd %%xmm0, %%xmm12\n"
22072 "addpd %%xmm4, %%xmm8\n"
22073 "subpd %%xmm4, %%xmm12\n"
22074 "movapd %%xmm1, %%xmm9\n"
22075 "movapd %%xmm1, %%xmm13\n"
22076 "addpd %%xmm5, %%xmm9\n"
22077 "subpd %%xmm5, %%xmm13\n"
22078 "movapd %%xmm2, %%xmm10\n"
22079 "movapd %%xmm2, %%xmm14\n"
22080 "addpd %%xmm6, %%xmm10\n"
22081 "subpd %%xmm6, %%xmm14\n"
22082 "movapd %%xmm3, %%xmm11\n"
22083 "movapd %%xmm3, %%xmm15\n"
22084 "addpd %%xmm7, %%xmm11\n"
22085 "subpd %%xmm7, %%xmm15\n"
22086 "movupd %%xmm8, (%0)\n"
22087 "movupd %%xmm9, (%1)\n"
22088 "movupd %%xmm10, (%2)\n"
22089 "movupd %%xmm11, (%3)\n"
22090 "movupd %%xmm12, (%4)\n"
22091 "movupd %%xmm13, (%5)\n"
22092 "movupd %%xmm14, (%6)\n"
22093 "movupd %%xmm15, (%7)\n"
22094 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22095 );
22096 }
22097 }
22098 return;
22099 }
22100 if (depth == 19) {
22101 helper_double_25_recursive(buf + 0, 16);
22102 helper_double_25_recursive(buf + 65536, 16);
22103 helper_double_25_recursive(buf + 131072, 16);
22104 helper_double_25_recursive(buf + 196608, 16);
22105 helper_double_25_recursive(buf + 262144, 16);
22106 helper_double_25_recursive(buf + 327680, 16);
22107 helper_double_25_recursive(buf + 393216, 16);
22108 helper_double_25_recursive(buf + 458752, 16);
22109 for (int j = 0; j < 524288; j += 524288) {
22110 for (int k = 0; k < 65536; k += 2) {
22111 __asm__ volatile (
22112 "movupd (%0), %%xmm0\n"
22113 "movupd (%1), %%xmm1\n"
22114 "movupd (%2), %%xmm2\n"
22115 "movupd (%3), %%xmm3\n"
22116 "movupd (%4), %%xmm4\n"
22117 "movupd (%5), %%xmm5\n"
22118 "movupd (%6), %%xmm6\n"
22119 "movupd (%7), %%xmm7\n"
22120 "movapd %%xmm0, %%xmm8\n"
22121 "movapd %%xmm0, %%xmm9\n"
22122 "addpd %%xmm1, %%xmm8\n"
22123 "subpd %%xmm1, %%xmm9\n"
22124 "movapd %%xmm2, %%xmm10\n"
22125 "movapd %%xmm2, %%xmm11\n"
22126 "addpd %%xmm3, %%xmm10\n"
22127 "subpd %%xmm3, %%xmm11\n"
22128 "movapd %%xmm4, %%xmm12\n"
22129 "movapd %%xmm4, %%xmm13\n"
22130 "addpd %%xmm5, %%xmm12\n"
22131 "subpd %%xmm5, %%xmm13\n"
22132 "movapd %%xmm6, %%xmm14\n"
22133 "movapd %%xmm6, %%xmm15\n"
22134 "addpd %%xmm7, %%xmm14\n"
22135 "subpd %%xmm7, %%xmm15\n"
22136 "movapd %%xmm8, %%xmm0\n"
22137 "movapd %%xmm8, %%xmm2\n"
22138 "addpd %%xmm10, %%xmm0\n"
22139 "subpd %%xmm10, %%xmm2\n"
22140 "movapd %%xmm9, %%xmm1\n"
22141 "movapd %%xmm9, %%xmm3\n"
22142 "addpd %%xmm11, %%xmm1\n"
22143 "subpd %%xmm11, %%xmm3\n"
22144 "movapd %%xmm12, %%xmm4\n"
22145 "movapd %%xmm12, %%xmm6\n"
22146 "addpd %%xmm14, %%xmm4\n"
22147 "subpd %%xmm14, %%xmm6\n"
22148 "movapd %%xmm13, %%xmm5\n"
22149 "movapd %%xmm13, %%xmm7\n"
22150 "addpd %%xmm15, %%xmm5\n"
22151 "subpd %%xmm15, %%xmm7\n"
22152 "movapd %%xmm0, %%xmm8\n"
22153 "movapd %%xmm0, %%xmm12\n"
22154 "addpd %%xmm4, %%xmm8\n"
22155 "subpd %%xmm4, %%xmm12\n"
22156 "movapd %%xmm1, %%xmm9\n"
22157 "movapd %%xmm1, %%xmm13\n"
22158 "addpd %%xmm5, %%xmm9\n"
22159 "subpd %%xmm5, %%xmm13\n"
22160 "movapd %%xmm2, %%xmm10\n"
22161 "movapd %%xmm2, %%xmm14\n"
22162 "addpd %%xmm6, %%xmm10\n"
22163 "subpd %%xmm6, %%xmm14\n"
22164 "movapd %%xmm3, %%xmm11\n"
22165 "movapd %%xmm3, %%xmm15\n"
22166 "addpd %%xmm7, %%xmm11\n"
22167 "subpd %%xmm7, %%xmm15\n"
22168 "movupd %%xmm8, (%0)\n"
22169 "movupd %%xmm9, (%1)\n"
22170 "movupd %%xmm10, (%2)\n"
22171 "movupd %%xmm11, (%3)\n"
22172 "movupd %%xmm12, (%4)\n"
22173 "movupd %%xmm13, (%5)\n"
22174 "movupd %%xmm14, (%6)\n"
22175 "movupd %%xmm15, (%7)\n"
22176 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22177 );
22178 }
22179 }
22180 return;
22181 }
22182 if (depth == 22) {
22183 helper_double_25_recursive(buf + 0, 19);
22184 helper_double_25_recursive(buf + 524288, 19);
22185 helper_double_25_recursive(buf + 1048576, 19);
22186 helper_double_25_recursive(buf + 1572864, 19);
22187 helper_double_25_recursive(buf + 2097152, 19);
22188 helper_double_25_recursive(buf + 2621440, 19);
22189 helper_double_25_recursive(buf + 3145728, 19);
22190 helper_double_25_recursive(buf + 3670016, 19);
22191 for (int j = 0; j < 4194304; j += 4194304) {
22192 for (int k = 0; k < 524288; k += 2) {
22193 __asm__ volatile (
22194 "movupd (%0), %%xmm0\n"
22195 "movupd (%1), %%xmm1\n"
22196 "movupd (%2), %%xmm2\n"
22197 "movupd (%3), %%xmm3\n"
22198 "movupd (%4), %%xmm4\n"
22199 "movupd (%5), %%xmm5\n"
22200 "movupd (%6), %%xmm6\n"
22201 "movupd (%7), %%xmm7\n"
22202 "movapd %%xmm0, %%xmm8\n"
22203 "movapd %%xmm0, %%xmm9\n"
22204 "addpd %%xmm1, %%xmm8\n"
22205 "subpd %%xmm1, %%xmm9\n"
22206 "movapd %%xmm2, %%xmm10\n"
22207 "movapd %%xmm2, %%xmm11\n"
22208 "addpd %%xmm3, %%xmm10\n"
22209 "subpd %%xmm3, %%xmm11\n"
22210 "movapd %%xmm4, %%xmm12\n"
22211 "movapd %%xmm4, %%xmm13\n"
22212 "addpd %%xmm5, %%xmm12\n"
22213 "subpd %%xmm5, %%xmm13\n"
22214 "movapd %%xmm6, %%xmm14\n"
22215 "movapd %%xmm6, %%xmm15\n"
22216 "addpd %%xmm7, %%xmm14\n"
22217 "subpd %%xmm7, %%xmm15\n"
22218 "movapd %%xmm8, %%xmm0\n"
22219 "movapd %%xmm8, %%xmm2\n"
22220 "addpd %%xmm10, %%xmm0\n"
22221 "subpd %%xmm10, %%xmm2\n"
22222 "movapd %%xmm9, %%xmm1\n"
22223 "movapd %%xmm9, %%xmm3\n"
22224 "addpd %%xmm11, %%xmm1\n"
22225 "subpd %%xmm11, %%xmm3\n"
22226 "movapd %%xmm12, %%xmm4\n"
22227 "movapd %%xmm12, %%xmm6\n"
22228 "addpd %%xmm14, %%xmm4\n"
22229 "subpd %%xmm14, %%xmm6\n"
22230 "movapd %%xmm13, %%xmm5\n"
22231 "movapd %%xmm13, %%xmm7\n"
22232 "addpd %%xmm15, %%xmm5\n"
22233 "subpd %%xmm15, %%xmm7\n"
22234 "movapd %%xmm0, %%xmm8\n"
22235 "movapd %%xmm0, %%xmm12\n"
22236 "addpd %%xmm4, %%xmm8\n"
22237 "subpd %%xmm4, %%xmm12\n"
22238 "movapd %%xmm1, %%xmm9\n"
22239 "movapd %%xmm1, %%xmm13\n"
22240 "addpd %%xmm5, %%xmm9\n"
22241 "subpd %%xmm5, %%xmm13\n"
22242 "movapd %%xmm2, %%xmm10\n"
22243 "movapd %%xmm2, %%xmm14\n"
22244 "addpd %%xmm6, %%xmm10\n"
22245 "subpd %%xmm6, %%xmm14\n"
22246 "movapd %%xmm3, %%xmm11\n"
22247 "movapd %%xmm3, %%xmm15\n"
22248 "addpd %%xmm7, %%xmm11\n"
22249 "subpd %%xmm7, %%xmm15\n"
22250 "movupd %%xmm8, (%0)\n"
22251 "movupd %%xmm9, (%1)\n"
22252 "movupd %%xmm10, (%2)\n"
22253 "movupd %%xmm11, (%3)\n"
22254 "movupd %%xmm12, (%4)\n"
22255 "movupd %%xmm13, (%5)\n"
22256 "movupd %%xmm14, (%6)\n"
22257 "movupd %%xmm15, (%7)\n"
22258 :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22259 );
22260 }
22261 }
22262 return;
22263 }
22264 if (depth == 25) {
22265 helper_double_25_recursive(buf + 0, 22);
22266 helper_double_25_recursive(buf + 4194304, 22);
22267 helper_double_25_recursive(buf + 8388608, 22);
22268 helper_double_25_recursive(buf + 12582912, 22);
22269 helper_double_25_recursive(buf + 16777216, 22);
22270 helper_double_25_recursive(buf + 20971520, 22);
22271 helper_double_25_recursive(buf + 25165824, 22);
22272 helper_double_25_recursive(buf + 29360128, 22);
22273 for (int j = 0; j < 33554432; j += 33554432) {
22274 for (int k = 0; k < 4194304; k += 2) {
22275 __asm__ volatile (
22276 "movupd (%0), %%xmm0\n"
22277 "movupd (%1), %%xmm1\n"
22278 "movupd (%2), %%xmm2\n"
22279 "movupd (%3), %%xmm3\n"
22280 "movupd (%4), %%xmm4\n"
22281 "movupd (%5), %%xmm5\n"
22282 "movupd (%6), %%xmm6\n"
22283 "movupd (%7), %%xmm7\n"
22284 "movapd %%xmm0, %%xmm8\n"
22285 "movapd %%xmm0, %%xmm9\n"
22286 "addpd %%xmm1, %%xmm8\n"
22287 "subpd %%xmm1, %%xmm9\n"
22288 "movapd %%xmm2, %%xmm10\n"
22289 "movapd %%xmm2, %%xmm11\n"
22290 "addpd %%xmm3, %%xmm10\n"
22291 "subpd %%xmm3, %%xmm11\n"
22292 "movapd %%xmm4, %%xmm12\n"
22293 "movapd %%xmm4, %%xmm13\n"
22294 "addpd %%xmm5, %%xmm12\n"
22295 "subpd %%xmm5, %%xmm13\n"
22296 "movapd %%xmm6, %%xmm14\n"
22297 "movapd %%xmm6, %%xmm15\n"
22298 "addpd %%xmm7, %%xmm14\n"
22299 "subpd %%xmm7, %%xmm15\n"
22300 "movapd %%xmm8, %%xmm0\n"
22301 "movapd %%xmm8, %%xmm2\n"
22302 "addpd %%xmm10, %%xmm0\n"
22303 "subpd %%xmm10, %%xmm2\n"
22304 "movapd %%xmm9, %%xmm1\n"
22305 "movapd %%xmm9, %%xmm3\n"
22306 "addpd %%xmm11, %%xmm1\n"
22307 "subpd %%xmm11, %%xmm3\n"
22308 "movapd %%xmm12, %%xmm4\n"
22309 "movapd %%xmm12, %%xmm6\n"
22310 "addpd %%xmm14, %%xmm4\n"
22311 "subpd %%xmm14, %%xmm6\n"
22312 "movapd %%xmm13, %%xmm5\n"
22313 "movapd %%xmm13, %%xmm7\n"
22314 "addpd %%xmm15, %%xmm5\n"
22315 "subpd %%xmm15, %%xmm7\n"
22316 "movapd %%xmm0, %%xmm8\n"
22317 "movapd %%xmm0, %%xmm12\n"
22318 "addpd %%xmm4, %%xmm8\n"
22319 "subpd %%xmm4, %%xmm12\n"
22320 "movapd %%xmm1, %%xmm9\n"
22321 "movapd %%xmm1, %%xmm13\n"
22322 "addpd %%xmm5, %%xmm9\n"
22323 "subpd %%xmm5, %%xmm13\n"
22324 "movapd %%xmm2, %%xmm10\n"
22325 "movapd %%xmm2, %%xmm14\n"
22326 "addpd %%xmm6, %%xmm10\n"
22327 "subpd %%xmm6, %%xmm14\n"
22328 "movapd %%xmm3, %%xmm11\n"
22329 "movapd %%xmm3, %%xmm15\n"
22330 "addpd %%xmm7, %%xmm11\n"
22331 "subpd %%xmm7, %%xmm15\n"
22332 "movupd %%xmm8, (%0)\n"
22333 "movupd %%xmm9, (%1)\n"
22334 "movupd %%xmm10, (%2)\n"
22335 "movupd %%xmm11, (%3)\n"
22336 "movupd %%xmm12, (%4)\n"
22337 "movupd %%xmm13, (%5)\n"
22338 "movupd %%xmm14, (%6)\n"
22339 "movupd %%xmm15, (%7)\n"
22340 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22341 );
22342 }
22343 }
22344 return;
22345 }
22346 }
22347 void helper_double_25(double *buf);
/*
 * In-place transform of a 2^25-element double buffer.
 *
 * Thin public entry point: delegates to helper_double_25_recursive with
 * the top-level depth of 25. Per the depth==25 case of the recursive
 * helper (which iterates j over 33554432 elements), buf must point to
 * 2^25 = 33554432 doubles; the result is written back into buf. The
 * butterflies are plain add/sub passes, so the output carries no
 * normalization factor — NOTE(review): presumed unnormalized
 * Walsh–Hadamard transform per fht.h; confirm against the header's
 * contract.
 */
helper_double_25(double * buf)22348 void helper_double_25(double *buf) {
22349 helper_double_25_recursive(buf, 25);
22350 }
22351 void helper_double_26_recursive(double *buf, int depth);
helper_double_26_recursive(double * buf,int depth)22352 void helper_double_26_recursive(double *buf, int depth) {
22353 if (depth == 5) {
22354 for (int j = 0; j < 32; j += 16) {
22355 for (int k = 0; k < 2; k += 2) {
22356 __asm__ volatile (
22357 "movupd (%0), %%xmm0\n"
22358 "movupd (%1), %%xmm1\n"
22359 "movupd (%2), %%xmm2\n"
22360 "movupd (%3), %%xmm3\n"
22361 "movupd (%4), %%xmm4\n"
22362 "movupd (%5), %%xmm5\n"
22363 "movupd (%6), %%xmm6\n"
22364 "movupd (%7), %%xmm7\n"
22365 "movapd %%xmm0, %%xmm8\n"
22366 "haddpd %%xmm8, %%xmm8\n"
22367 "movapd %%xmm0, %%xmm9\n"
22368 "hsubpd %%xmm9, %%xmm9\n"
22369 "blendpd $1, %%xmm8, %%xmm9\n"
22370 "movapd %%xmm9, %%xmm0\n"
22371 "movapd %%xmm1, %%xmm8\n"
22372 "haddpd %%xmm8, %%xmm8\n"
22373 "movapd %%xmm1, %%xmm9\n"
22374 "hsubpd %%xmm9, %%xmm9\n"
22375 "blendpd $1, %%xmm8, %%xmm9\n"
22376 "movapd %%xmm9, %%xmm1\n"
22377 "movapd %%xmm2, %%xmm8\n"
22378 "haddpd %%xmm8, %%xmm8\n"
22379 "movapd %%xmm2, %%xmm9\n"
22380 "hsubpd %%xmm9, %%xmm9\n"
22381 "blendpd $1, %%xmm8, %%xmm9\n"
22382 "movapd %%xmm9, %%xmm2\n"
22383 "movapd %%xmm3, %%xmm8\n"
22384 "haddpd %%xmm8, %%xmm8\n"
22385 "movapd %%xmm3, %%xmm9\n"
22386 "hsubpd %%xmm9, %%xmm9\n"
22387 "blendpd $1, %%xmm8, %%xmm9\n"
22388 "movapd %%xmm9, %%xmm3\n"
22389 "movapd %%xmm4, %%xmm8\n"
22390 "haddpd %%xmm8, %%xmm8\n"
22391 "movapd %%xmm4, %%xmm9\n"
22392 "hsubpd %%xmm9, %%xmm9\n"
22393 "blendpd $1, %%xmm8, %%xmm9\n"
22394 "movapd %%xmm9, %%xmm4\n"
22395 "movapd %%xmm5, %%xmm8\n"
22396 "haddpd %%xmm8, %%xmm8\n"
22397 "movapd %%xmm5, %%xmm9\n"
22398 "hsubpd %%xmm9, %%xmm9\n"
22399 "blendpd $1, %%xmm8, %%xmm9\n"
22400 "movapd %%xmm9, %%xmm5\n"
22401 "movapd %%xmm6, %%xmm8\n"
22402 "haddpd %%xmm8, %%xmm8\n"
22403 "movapd %%xmm6, %%xmm9\n"
22404 "hsubpd %%xmm9, %%xmm9\n"
22405 "blendpd $1, %%xmm8, %%xmm9\n"
22406 "movapd %%xmm9, %%xmm6\n"
22407 "movapd %%xmm7, %%xmm8\n"
22408 "haddpd %%xmm8, %%xmm8\n"
22409 "movapd %%xmm7, %%xmm9\n"
22410 "hsubpd %%xmm9, %%xmm9\n"
22411 "blendpd $1, %%xmm8, %%xmm9\n"
22412 "movapd %%xmm9, %%xmm7\n"
22413 "movapd %%xmm0, %%xmm8\n"
22414 "movapd %%xmm0, %%xmm9\n"
22415 "addpd %%xmm1, %%xmm8\n"
22416 "subpd %%xmm1, %%xmm9\n"
22417 "movapd %%xmm2, %%xmm10\n"
22418 "movapd %%xmm2, %%xmm11\n"
22419 "addpd %%xmm3, %%xmm10\n"
22420 "subpd %%xmm3, %%xmm11\n"
22421 "movapd %%xmm4, %%xmm12\n"
22422 "movapd %%xmm4, %%xmm13\n"
22423 "addpd %%xmm5, %%xmm12\n"
22424 "subpd %%xmm5, %%xmm13\n"
22425 "movapd %%xmm6, %%xmm14\n"
22426 "movapd %%xmm6, %%xmm15\n"
22427 "addpd %%xmm7, %%xmm14\n"
22428 "subpd %%xmm7, %%xmm15\n"
22429 "movapd %%xmm8, %%xmm0\n"
22430 "movapd %%xmm8, %%xmm2\n"
22431 "addpd %%xmm10, %%xmm0\n"
22432 "subpd %%xmm10, %%xmm2\n"
22433 "movapd %%xmm9, %%xmm1\n"
22434 "movapd %%xmm9, %%xmm3\n"
22435 "addpd %%xmm11, %%xmm1\n"
22436 "subpd %%xmm11, %%xmm3\n"
22437 "movapd %%xmm12, %%xmm4\n"
22438 "movapd %%xmm12, %%xmm6\n"
22439 "addpd %%xmm14, %%xmm4\n"
22440 "subpd %%xmm14, %%xmm6\n"
22441 "movapd %%xmm13, %%xmm5\n"
22442 "movapd %%xmm13, %%xmm7\n"
22443 "addpd %%xmm15, %%xmm5\n"
22444 "subpd %%xmm15, %%xmm7\n"
22445 "movapd %%xmm0, %%xmm8\n"
22446 "movapd %%xmm0, %%xmm12\n"
22447 "addpd %%xmm4, %%xmm8\n"
22448 "subpd %%xmm4, %%xmm12\n"
22449 "movapd %%xmm1, %%xmm9\n"
22450 "movapd %%xmm1, %%xmm13\n"
22451 "addpd %%xmm5, %%xmm9\n"
22452 "subpd %%xmm5, %%xmm13\n"
22453 "movapd %%xmm2, %%xmm10\n"
22454 "movapd %%xmm2, %%xmm14\n"
22455 "addpd %%xmm6, %%xmm10\n"
22456 "subpd %%xmm6, %%xmm14\n"
22457 "movapd %%xmm3, %%xmm11\n"
22458 "movapd %%xmm3, %%xmm15\n"
22459 "addpd %%xmm7, %%xmm11\n"
22460 "subpd %%xmm7, %%xmm15\n"
22461 "movupd %%xmm8, (%0)\n"
22462 "movupd %%xmm9, (%1)\n"
22463 "movupd %%xmm10, (%2)\n"
22464 "movupd %%xmm11, (%3)\n"
22465 "movupd %%xmm12, (%4)\n"
22466 "movupd %%xmm13, (%5)\n"
22467 "movupd %%xmm14, (%6)\n"
22468 "movupd %%xmm15, (%7)\n"
22469 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22470 );
22471 }
22472 }
22473 for (int j = 0; j < 32; j += 32) {
22474 for (int k = 0; k < 16; k += 2) {
22475 __asm__ volatile (
22476 "movupd (%0), %%xmm0\n"
22477 "movupd (%1), %%xmm1\n"
22478 "movapd %%xmm0, %%xmm8\n"
22479 "movapd %%xmm0, %%xmm9\n"
22480 "addpd %%xmm1, %%xmm8\n"
22481 "subpd %%xmm1, %%xmm9\n"
22482 "movupd %%xmm8, (%0)\n"
22483 "movupd %%xmm9, (%1)\n"
22484 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22485 );
22486 }
22487 }
22488 return;
22489 }
22490 if (depth == 8) {
22491 helper_double_26_recursive(buf + 0, 5);
22492 helper_double_26_recursive(buf + 32, 5);
22493 helper_double_26_recursive(buf + 64, 5);
22494 helper_double_26_recursive(buf + 96, 5);
22495 helper_double_26_recursive(buf + 128, 5);
22496 helper_double_26_recursive(buf + 160, 5);
22497 helper_double_26_recursive(buf + 192, 5);
22498 helper_double_26_recursive(buf + 224, 5);
22499 for (int j = 0; j < 256; j += 256) {
22500 for (int k = 0; k < 32; k += 2) {
22501 __asm__ volatile (
22502 "movupd (%0), %%xmm0\n"
22503 "movupd (%1), %%xmm1\n"
22504 "movupd (%2), %%xmm2\n"
22505 "movupd (%3), %%xmm3\n"
22506 "movupd (%4), %%xmm4\n"
22507 "movupd (%5), %%xmm5\n"
22508 "movupd (%6), %%xmm6\n"
22509 "movupd (%7), %%xmm7\n"
22510 "movapd %%xmm0, %%xmm8\n"
22511 "movapd %%xmm0, %%xmm9\n"
22512 "addpd %%xmm1, %%xmm8\n"
22513 "subpd %%xmm1, %%xmm9\n"
22514 "movapd %%xmm2, %%xmm10\n"
22515 "movapd %%xmm2, %%xmm11\n"
22516 "addpd %%xmm3, %%xmm10\n"
22517 "subpd %%xmm3, %%xmm11\n"
22518 "movapd %%xmm4, %%xmm12\n"
22519 "movapd %%xmm4, %%xmm13\n"
22520 "addpd %%xmm5, %%xmm12\n"
22521 "subpd %%xmm5, %%xmm13\n"
22522 "movapd %%xmm6, %%xmm14\n"
22523 "movapd %%xmm6, %%xmm15\n"
22524 "addpd %%xmm7, %%xmm14\n"
22525 "subpd %%xmm7, %%xmm15\n"
22526 "movapd %%xmm8, %%xmm0\n"
22527 "movapd %%xmm8, %%xmm2\n"
22528 "addpd %%xmm10, %%xmm0\n"
22529 "subpd %%xmm10, %%xmm2\n"
22530 "movapd %%xmm9, %%xmm1\n"
22531 "movapd %%xmm9, %%xmm3\n"
22532 "addpd %%xmm11, %%xmm1\n"
22533 "subpd %%xmm11, %%xmm3\n"
22534 "movapd %%xmm12, %%xmm4\n"
22535 "movapd %%xmm12, %%xmm6\n"
22536 "addpd %%xmm14, %%xmm4\n"
22537 "subpd %%xmm14, %%xmm6\n"
22538 "movapd %%xmm13, %%xmm5\n"
22539 "movapd %%xmm13, %%xmm7\n"
22540 "addpd %%xmm15, %%xmm5\n"
22541 "subpd %%xmm15, %%xmm7\n"
22542 "movapd %%xmm0, %%xmm8\n"
22543 "movapd %%xmm0, %%xmm12\n"
22544 "addpd %%xmm4, %%xmm8\n"
22545 "subpd %%xmm4, %%xmm12\n"
22546 "movapd %%xmm1, %%xmm9\n"
22547 "movapd %%xmm1, %%xmm13\n"
22548 "addpd %%xmm5, %%xmm9\n"
22549 "subpd %%xmm5, %%xmm13\n"
22550 "movapd %%xmm2, %%xmm10\n"
22551 "movapd %%xmm2, %%xmm14\n"
22552 "addpd %%xmm6, %%xmm10\n"
22553 "subpd %%xmm6, %%xmm14\n"
22554 "movapd %%xmm3, %%xmm11\n"
22555 "movapd %%xmm3, %%xmm15\n"
22556 "addpd %%xmm7, %%xmm11\n"
22557 "subpd %%xmm7, %%xmm15\n"
22558 "movupd %%xmm8, (%0)\n"
22559 "movupd %%xmm9, (%1)\n"
22560 "movupd %%xmm10, (%2)\n"
22561 "movupd %%xmm11, (%3)\n"
22562 "movupd %%xmm12, (%4)\n"
22563 "movupd %%xmm13, (%5)\n"
22564 "movupd %%xmm14, (%6)\n"
22565 "movupd %%xmm15, (%7)\n"
22566 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22567 );
22568 }
22569 }
22570 return;
22571 }
22572 if (depth == 11) {
22573 helper_double_26_recursive(buf + 0, 8);
22574 helper_double_26_recursive(buf + 256, 8);
22575 helper_double_26_recursive(buf + 512, 8);
22576 helper_double_26_recursive(buf + 768, 8);
22577 helper_double_26_recursive(buf + 1024, 8);
22578 helper_double_26_recursive(buf + 1280, 8);
22579 helper_double_26_recursive(buf + 1536, 8);
22580 helper_double_26_recursive(buf + 1792, 8);
22581 for (int j = 0; j < 2048; j += 2048) {
22582 for (int k = 0; k < 256; k += 2) {
22583 __asm__ volatile (
22584 "movupd (%0), %%xmm0\n"
22585 "movupd (%1), %%xmm1\n"
22586 "movupd (%2), %%xmm2\n"
22587 "movupd (%3), %%xmm3\n"
22588 "movupd (%4), %%xmm4\n"
22589 "movupd (%5), %%xmm5\n"
22590 "movupd (%6), %%xmm6\n"
22591 "movupd (%7), %%xmm7\n"
22592 "movapd %%xmm0, %%xmm8\n"
22593 "movapd %%xmm0, %%xmm9\n"
22594 "addpd %%xmm1, %%xmm8\n"
22595 "subpd %%xmm1, %%xmm9\n"
22596 "movapd %%xmm2, %%xmm10\n"
22597 "movapd %%xmm2, %%xmm11\n"
22598 "addpd %%xmm3, %%xmm10\n"
22599 "subpd %%xmm3, %%xmm11\n"
22600 "movapd %%xmm4, %%xmm12\n"
22601 "movapd %%xmm4, %%xmm13\n"
22602 "addpd %%xmm5, %%xmm12\n"
22603 "subpd %%xmm5, %%xmm13\n"
22604 "movapd %%xmm6, %%xmm14\n"
22605 "movapd %%xmm6, %%xmm15\n"
22606 "addpd %%xmm7, %%xmm14\n"
22607 "subpd %%xmm7, %%xmm15\n"
22608 "movapd %%xmm8, %%xmm0\n"
22609 "movapd %%xmm8, %%xmm2\n"
22610 "addpd %%xmm10, %%xmm0\n"
22611 "subpd %%xmm10, %%xmm2\n"
22612 "movapd %%xmm9, %%xmm1\n"
22613 "movapd %%xmm9, %%xmm3\n"
22614 "addpd %%xmm11, %%xmm1\n"
22615 "subpd %%xmm11, %%xmm3\n"
22616 "movapd %%xmm12, %%xmm4\n"
22617 "movapd %%xmm12, %%xmm6\n"
22618 "addpd %%xmm14, %%xmm4\n"
22619 "subpd %%xmm14, %%xmm6\n"
22620 "movapd %%xmm13, %%xmm5\n"
22621 "movapd %%xmm13, %%xmm7\n"
22622 "addpd %%xmm15, %%xmm5\n"
22623 "subpd %%xmm15, %%xmm7\n"
22624 "movapd %%xmm0, %%xmm8\n"
22625 "movapd %%xmm0, %%xmm12\n"
22626 "addpd %%xmm4, %%xmm8\n"
22627 "subpd %%xmm4, %%xmm12\n"
22628 "movapd %%xmm1, %%xmm9\n"
22629 "movapd %%xmm1, %%xmm13\n"
22630 "addpd %%xmm5, %%xmm9\n"
22631 "subpd %%xmm5, %%xmm13\n"
22632 "movapd %%xmm2, %%xmm10\n"
22633 "movapd %%xmm2, %%xmm14\n"
22634 "addpd %%xmm6, %%xmm10\n"
22635 "subpd %%xmm6, %%xmm14\n"
22636 "movapd %%xmm3, %%xmm11\n"
22637 "movapd %%xmm3, %%xmm15\n"
22638 "addpd %%xmm7, %%xmm11\n"
22639 "subpd %%xmm7, %%xmm15\n"
22640 "movupd %%xmm8, (%0)\n"
22641 "movupd %%xmm9, (%1)\n"
22642 "movupd %%xmm10, (%2)\n"
22643 "movupd %%xmm11, (%3)\n"
22644 "movupd %%xmm12, (%4)\n"
22645 "movupd %%xmm13, (%5)\n"
22646 "movupd %%xmm14, (%6)\n"
22647 "movupd %%xmm15, (%7)\n"
22648 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22649 );
22650 }
22651 }
22652 return;
22653 }
22654 if (depth == 14) {
22655 helper_double_26_recursive(buf + 0, 11);
22656 helper_double_26_recursive(buf + 2048, 11);
22657 helper_double_26_recursive(buf + 4096, 11);
22658 helper_double_26_recursive(buf + 6144, 11);
22659 helper_double_26_recursive(buf + 8192, 11);
22660 helper_double_26_recursive(buf + 10240, 11);
22661 helper_double_26_recursive(buf + 12288, 11);
22662 helper_double_26_recursive(buf + 14336, 11);
22663 for (int j = 0; j < 16384; j += 16384) {
22664 for (int k = 0; k < 2048; k += 2) {
22665 __asm__ volatile (
22666 "movupd (%0), %%xmm0\n"
22667 "movupd (%1), %%xmm1\n"
22668 "movupd (%2), %%xmm2\n"
22669 "movupd (%3), %%xmm3\n"
22670 "movupd (%4), %%xmm4\n"
22671 "movupd (%5), %%xmm5\n"
22672 "movupd (%6), %%xmm6\n"
22673 "movupd (%7), %%xmm7\n"
22674 "movapd %%xmm0, %%xmm8\n"
22675 "movapd %%xmm0, %%xmm9\n"
22676 "addpd %%xmm1, %%xmm8\n"
22677 "subpd %%xmm1, %%xmm9\n"
22678 "movapd %%xmm2, %%xmm10\n"
22679 "movapd %%xmm2, %%xmm11\n"
22680 "addpd %%xmm3, %%xmm10\n"
22681 "subpd %%xmm3, %%xmm11\n"
22682 "movapd %%xmm4, %%xmm12\n"
22683 "movapd %%xmm4, %%xmm13\n"
22684 "addpd %%xmm5, %%xmm12\n"
22685 "subpd %%xmm5, %%xmm13\n"
22686 "movapd %%xmm6, %%xmm14\n"
22687 "movapd %%xmm6, %%xmm15\n"
22688 "addpd %%xmm7, %%xmm14\n"
22689 "subpd %%xmm7, %%xmm15\n"
22690 "movapd %%xmm8, %%xmm0\n"
22691 "movapd %%xmm8, %%xmm2\n"
22692 "addpd %%xmm10, %%xmm0\n"
22693 "subpd %%xmm10, %%xmm2\n"
22694 "movapd %%xmm9, %%xmm1\n"
22695 "movapd %%xmm9, %%xmm3\n"
22696 "addpd %%xmm11, %%xmm1\n"
22697 "subpd %%xmm11, %%xmm3\n"
22698 "movapd %%xmm12, %%xmm4\n"
22699 "movapd %%xmm12, %%xmm6\n"
22700 "addpd %%xmm14, %%xmm4\n"
22701 "subpd %%xmm14, %%xmm6\n"
22702 "movapd %%xmm13, %%xmm5\n"
22703 "movapd %%xmm13, %%xmm7\n"
22704 "addpd %%xmm15, %%xmm5\n"
22705 "subpd %%xmm15, %%xmm7\n"
22706 "movapd %%xmm0, %%xmm8\n"
22707 "movapd %%xmm0, %%xmm12\n"
22708 "addpd %%xmm4, %%xmm8\n"
22709 "subpd %%xmm4, %%xmm12\n"
22710 "movapd %%xmm1, %%xmm9\n"
22711 "movapd %%xmm1, %%xmm13\n"
22712 "addpd %%xmm5, %%xmm9\n"
22713 "subpd %%xmm5, %%xmm13\n"
22714 "movapd %%xmm2, %%xmm10\n"
22715 "movapd %%xmm2, %%xmm14\n"
22716 "addpd %%xmm6, %%xmm10\n"
22717 "subpd %%xmm6, %%xmm14\n"
22718 "movapd %%xmm3, %%xmm11\n"
22719 "movapd %%xmm3, %%xmm15\n"
22720 "addpd %%xmm7, %%xmm11\n"
22721 "subpd %%xmm7, %%xmm15\n"
22722 "movupd %%xmm8, (%0)\n"
22723 "movupd %%xmm9, (%1)\n"
22724 "movupd %%xmm10, (%2)\n"
22725 "movupd %%xmm11, (%3)\n"
22726 "movupd %%xmm12, (%4)\n"
22727 "movupd %%xmm13, (%5)\n"
22728 "movupd %%xmm14, (%6)\n"
22729 "movupd %%xmm15, (%7)\n"
22730 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22731 );
22732 }
22733 }
22734 return;
22735 }
22736 if (depth == 17) {
22737 helper_double_26_recursive(buf + 0, 14);
22738 helper_double_26_recursive(buf + 16384, 14);
22739 helper_double_26_recursive(buf + 32768, 14);
22740 helper_double_26_recursive(buf + 49152, 14);
22741 helper_double_26_recursive(buf + 65536, 14);
22742 helper_double_26_recursive(buf + 81920, 14);
22743 helper_double_26_recursive(buf + 98304, 14);
22744 helper_double_26_recursive(buf + 114688, 14);
22745 for (int j = 0; j < 131072; j += 131072) {
22746 for (int k = 0; k < 16384; k += 2) {
22747 __asm__ volatile (
22748 "movupd (%0), %%xmm0\n"
22749 "movupd (%1), %%xmm1\n"
22750 "movupd (%2), %%xmm2\n"
22751 "movupd (%3), %%xmm3\n"
22752 "movupd (%4), %%xmm4\n"
22753 "movupd (%5), %%xmm5\n"
22754 "movupd (%6), %%xmm6\n"
22755 "movupd (%7), %%xmm7\n"
22756 "movapd %%xmm0, %%xmm8\n"
22757 "movapd %%xmm0, %%xmm9\n"
22758 "addpd %%xmm1, %%xmm8\n"
22759 "subpd %%xmm1, %%xmm9\n"
22760 "movapd %%xmm2, %%xmm10\n"
22761 "movapd %%xmm2, %%xmm11\n"
22762 "addpd %%xmm3, %%xmm10\n"
22763 "subpd %%xmm3, %%xmm11\n"
22764 "movapd %%xmm4, %%xmm12\n"
22765 "movapd %%xmm4, %%xmm13\n"
22766 "addpd %%xmm5, %%xmm12\n"
22767 "subpd %%xmm5, %%xmm13\n"
22768 "movapd %%xmm6, %%xmm14\n"
22769 "movapd %%xmm6, %%xmm15\n"
22770 "addpd %%xmm7, %%xmm14\n"
22771 "subpd %%xmm7, %%xmm15\n"
22772 "movapd %%xmm8, %%xmm0\n"
22773 "movapd %%xmm8, %%xmm2\n"
22774 "addpd %%xmm10, %%xmm0\n"
22775 "subpd %%xmm10, %%xmm2\n"
22776 "movapd %%xmm9, %%xmm1\n"
22777 "movapd %%xmm9, %%xmm3\n"
22778 "addpd %%xmm11, %%xmm1\n"
22779 "subpd %%xmm11, %%xmm3\n"
22780 "movapd %%xmm12, %%xmm4\n"
22781 "movapd %%xmm12, %%xmm6\n"
22782 "addpd %%xmm14, %%xmm4\n"
22783 "subpd %%xmm14, %%xmm6\n"
22784 "movapd %%xmm13, %%xmm5\n"
22785 "movapd %%xmm13, %%xmm7\n"
22786 "addpd %%xmm15, %%xmm5\n"
22787 "subpd %%xmm15, %%xmm7\n"
22788 "movapd %%xmm0, %%xmm8\n"
22789 "movapd %%xmm0, %%xmm12\n"
22790 "addpd %%xmm4, %%xmm8\n"
22791 "subpd %%xmm4, %%xmm12\n"
22792 "movapd %%xmm1, %%xmm9\n"
22793 "movapd %%xmm1, %%xmm13\n"
22794 "addpd %%xmm5, %%xmm9\n"
22795 "subpd %%xmm5, %%xmm13\n"
22796 "movapd %%xmm2, %%xmm10\n"
22797 "movapd %%xmm2, %%xmm14\n"
22798 "addpd %%xmm6, %%xmm10\n"
22799 "subpd %%xmm6, %%xmm14\n"
22800 "movapd %%xmm3, %%xmm11\n"
22801 "movapd %%xmm3, %%xmm15\n"
22802 "addpd %%xmm7, %%xmm11\n"
22803 "subpd %%xmm7, %%xmm15\n"
22804 "movupd %%xmm8, (%0)\n"
22805 "movupd %%xmm9, (%1)\n"
22806 "movupd %%xmm10, (%2)\n"
22807 "movupd %%xmm11, (%3)\n"
22808 "movupd %%xmm12, (%4)\n"
22809 "movupd %%xmm13, (%5)\n"
22810 "movupd %%xmm14, (%6)\n"
22811 "movupd %%xmm15, (%7)\n"
22812 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22813 );
22814 }
22815 }
22816 return;
22817 }
22818 if (depth == 20) {
22819 helper_double_26_recursive(buf + 0, 17);
22820 helper_double_26_recursive(buf + 131072, 17);
22821 helper_double_26_recursive(buf + 262144, 17);
22822 helper_double_26_recursive(buf + 393216, 17);
22823 helper_double_26_recursive(buf + 524288, 17);
22824 helper_double_26_recursive(buf + 655360, 17);
22825 helper_double_26_recursive(buf + 786432, 17);
22826 helper_double_26_recursive(buf + 917504, 17);
22827 for (int j = 0; j < 1048576; j += 1048576) {
22828 for (int k = 0; k < 131072; k += 2) {
22829 __asm__ volatile (
22830 "movupd (%0), %%xmm0\n"
22831 "movupd (%1), %%xmm1\n"
22832 "movupd (%2), %%xmm2\n"
22833 "movupd (%3), %%xmm3\n"
22834 "movupd (%4), %%xmm4\n"
22835 "movupd (%5), %%xmm5\n"
22836 "movupd (%6), %%xmm6\n"
22837 "movupd (%7), %%xmm7\n"
22838 "movapd %%xmm0, %%xmm8\n"
22839 "movapd %%xmm0, %%xmm9\n"
22840 "addpd %%xmm1, %%xmm8\n"
22841 "subpd %%xmm1, %%xmm9\n"
22842 "movapd %%xmm2, %%xmm10\n"
22843 "movapd %%xmm2, %%xmm11\n"
22844 "addpd %%xmm3, %%xmm10\n"
22845 "subpd %%xmm3, %%xmm11\n"
22846 "movapd %%xmm4, %%xmm12\n"
22847 "movapd %%xmm4, %%xmm13\n"
22848 "addpd %%xmm5, %%xmm12\n"
22849 "subpd %%xmm5, %%xmm13\n"
22850 "movapd %%xmm6, %%xmm14\n"
22851 "movapd %%xmm6, %%xmm15\n"
22852 "addpd %%xmm7, %%xmm14\n"
22853 "subpd %%xmm7, %%xmm15\n"
22854 "movapd %%xmm8, %%xmm0\n"
22855 "movapd %%xmm8, %%xmm2\n"
22856 "addpd %%xmm10, %%xmm0\n"
22857 "subpd %%xmm10, %%xmm2\n"
22858 "movapd %%xmm9, %%xmm1\n"
22859 "movapd %%xmm9, %%xmm3\n"
22860 "addpd %%xmm11, %%xmm1\n"
22861 "subpd %%xmm11, %%xmm3\n"
22862 "movapd %%xmm12, %%xmm4\n"
22863 "movapd %%xmm12, %%xmm6\n"
22864 "addpd %%xmm14, %%xmm4\n"
22865 "subpd %%xmm14, %%xmm6\n"
22866 "movapd %%xmm13, %%xmm5\n"
22867 "movapd %%xmm13, %%xmm7\n"
22868 "addpd %%xmm15, %%xmm5\n"
22869 "subpd %%xmm15, %%xmm7\n"
22870 "movapd %%xmm0, %%xmm8\n"
22871 "movapd %%xmm0, %%xmm12\n"
22872 "addpd %%xmm4, %%xmm8\n"
22873 "subpd %%xmm4, %%xmm12\n"
22874 "movapd %%xmm1, %%xmm9\n"
22875 "movapd %%xmm1, %%xmm13\n"
22876 "addpd %%xmm5, %%xmm9\n"
22877 "subpd %%xmm5, %%xmm13\n"
22878 "movapd %%xmm2, %%xmm10\n"
22879 "movapd %%xmm2, %%xmm14\n"
22880 "addpd %%xmm6, %%xmm10\n"
22881 "subpd %%xmm6, %%xmm14\n"
22882 "movapd %%xmm3, %%xmm11\n"
22883 "movapd %%xmm3, %%xmm15\n"
22884 "addpd %%xmm7, %%xmm11\n"
22885 "subpd %%xmm7, %%xmm15\n"
22886 "movupd %%xmm8, (%0)\n"
22887 "movupd %%xmm9, (%1)\n"
22888 "movupd %%xmm10, (%2)\n"
22889 "movupd %%xmm11, (%3)\n"
22890 "movupd %%xmm12, (%4)\n"
22891 "movupd %%xmm13, (%5)\n"
22892 "movupd %%xmm14, (%6)\n"
22893 "movupd %%xmm15, (%7)\n"
22894 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22895 );
22896 }
22897 }
22898 return;
22899 }
22900 if (depth == 23) {
22901 helper_double_26_recursive(buf + 0, 20);
22902 helper_double_26_recursive(buf + 1048576, 20);
22903 helper_double_26_recursive(buf + 2097152, 20);
22904 helper_double_26_recursive(buf + 3145728, 20);
22905 helper_double_26_recursive(buf + 4194304, 20);
22906 helper_double_26_recursive(buf + 5242880, 20);
22907 helper_double_26_recursive(buf + 6291456, 20);
22908 helper_double_26_recursive(buf + 7340032, 20);
22909 for (int j = 0; j < 8388608; j += 8388608) {
22910 for (int k = 0; k < 1048576; k += 2) {
22911 __asm__ volatile (
22912 "movupd (%0), %%xmm0\n"
22913 "movupd (%1), %%xmm1\n"
22914 "movupd (%2), %%xmm2\n"
22915 "movupd (%3), %%xmm3\n"
22916 "movupd (%4), %%xmm4\n"
22917 "movupd (%5), %%xmm5\n"
22918 "movupd (%6), %%xmm6\n"
22919 "movupd (%7), %%xmm7\n"
22920 "movapd %%xmm0, %%xmm8\n"
22921 "movapd %%xmm0, %%xmm9\n"
22922 "addpd %%xmm1, %%xmm8\n"
22923 "subpd %%xmm1, %%xmm9\n"
22924 "movapd %%xmm2, %%xmm10\n"
22925 "movapd %%xmm2, %%xmm11\n"
22926 "addpd %%xmm3, %%xmm10\n"
22927 "subpd %%xmm3, %%xmm11\n"
22928 "movapd %%xmm4, %%xmm12\n"
22929 "movapd %%xmm4, %%xmm13\n"
22930 "addpd %%xmm5, %%xmm12\n"
22931 "subpd %%xmm5, %%xmm13\n"
22932 "movapd %%xmm6, %%xmm14\n"
22933 "movapd %%xmm6, %%xmm15\n"
22934 "addpd %%xmm7, %%xmm14\n"
22935 "subpd %%xmm7, %%xmm15\n"
22936 "movapd %%xmm8, %%xmm0\n"
22937 "movapd %%xmm8, %%xmm2\n"
22938 "addpd %%xmm10, %%xmm0\n"
22939 "subpd %%xmm10, %%xmm2\n"
22940 "movapd %%xmm9, %%xmm1\n"
22941 "movapd %%xmm9, %%xmm3\n"
22942 "addpd %%xmm11, %%xmm1\n"
22943 "subpd %%xmm11, %%xmm3\n"
22944 "movapd %%xmm12, %%xmm4\n"
22945 "movapd %%xmm12, %%xmm6\n"
22946 "addpd %%xmm14, %%xmm4\n"
22947 "subpd %%xmm14, %%xmm6\n"
22948 "movapd %%xmm13, %%xmm5\n"
22949 "movapd %%xmm13, %%xmm7\n"
22950 "addpd %%xmm15, %%xmm5\n"
22951 "subpd %%xmm15, %%xmm7\n"
22952 "movapd %%xmm0, %%xmm8\n"
22953 "movapd %%xmm0, %%xmm12\n"
22954 "addpd %%xmm4, %%xmm8\n"
22955 "subpd %%xmm4, %%xmm12\n"
22956 "movapd %%xmm1, %%xmm9\n"
22957 "movapd %%xmm1, %%xmm13\n"
22958 "addpd %%xmm5, %%xmm9\n"
22959 "subpd %%xmm5, %%xmm13\n"
22960 "movapd %%xmm2, %%xmm10\n"
22961 "movapd %%xmm2, %%xmm14\n"
22962 "addpd %%xmm6, %%xmm10\n"
22963 "subpd %%xmm6, %%xmm14\n"
22964 "movapd %%xmm3, %%xmm11\n"
22965 "movapd %%xmm3, %%xmm15\n"
22966 "addpd %%xmm7, %%xmm11\n"
22967 "subpd %%xmm7, %%xmm15\n"
22968 "movupd %%xmm8, (%0)\n"
22969 "movupd %%xmm9, (%1)\n"
22970 "movupd %%xmm10, (%2)\n"
22971 "movupd %%xmm11, (%3)\n"
22972 "movupd %%xmm12, (%4)\n"
22973 "movupd %%xmm13, (%5)\n"
22974 "movupd %%xmm14, (%6)\n"
22975 "movupd %%xmm15, (%7)\n"
22976 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
22977 );
22978 }
22979 }
22980 return;
22981 }
22982 if (depth == 26) {
22983 helper_double_26_recursive(buf + 0, 23);
22984 helper_double_26_recursive(buf + 8388608, 23);
22985 helper_double_26_recursive(buf + 16777216, 23);
22986 helper_double_26_recursive(buf + 25165824, 23);
22987 helper_double_26_recursive(buf + 33554432, 23);
22988 helper_double_26_recursive(buf + 41943040, 23);
22989 helper_double_26_recursive(buf + 50331648, 23);
22990 helper_double_26_recursive(buf + 58720256, 23);
22991 for (int j = 0; j < 67108864; j += 67108864) {
22992 for (int k = 0; k < 8388608; k += 2) {
22993 __asm__ volatile (
22994 "movupd (%0), %%xmm0\n"
22995 "movupd (%1), %%xmm1\n"
22996 "movupd (%2), %%xmm2\n"
22997 "movupd (%3), %%xmm3\n"
22998 "movupd (%4), %%xmm4\n"
22999 "movupd (%5), %%xmm5\n"
23000 "movupd (%6), %%xmm6\n"
23001 "movupd (%7), %%xmm7\n"
23002 "movapd %%xmm0, %%xmm8\n"
23003 "movapd %%xmm0, %%xmm9\n"
23004 "addpd %%xmm1, %%xmm8\n"
23005 "subpd %%xmm1, %%xmm9\n"
23006 "movapd %%xmm2, %%xmm10\n"
23007 "movapd %%xmm2, %%xmm11\n"
23008 "addpd %%xmm3, %%xmm10\n"
23009 "subpd %%xmm3, %%xmm11\n"
23010 "movapd %%xmm4, %%xmm12\n"
23011 "movapd %%xmm4, %%xmm13\n"
23012 "addpd %%xmm5, %%xmm12\n"
23013 "subpd %%xmm5, %%xmm13\n"
23014 "movapd %%xmm6, %%xmm14\n"
23015 "movapd %%xmm6, %%xmm15\n"
23016 "addpd %%xmm7, %%xmm14\n"
23017 "subpd %%xmm7, %%xmm15\n"
23018 "movapd %%xmm8, %%xmm0\n"
23019 "movapd %%xmm8, %%xmm2\n"
23020 "addpd %%xmm10, %%xmm0\n"
23021 "subpd %%xmm10, %%xmm2\n"
23022 "movapd %%xmm9, %%xmm1\n"
23023 "movapd %%xmm9, %%xmm3\n"
23024 "addpd %%xmm11, %%xmm1\n"
23025 "subpd %%xmm11, %%xmm3\n"
23026 "movapd %%xmm12, %%xmm4\n"
23027 "movapd %%xmm12, %%xmm6\n"
23028 "addpd %%xmm14, %%xmm4\n"
23029 "subpd %%xmm14, %%xmm6\n"
23030 "movapd %%xmm13, %%xmm5\n"
23031 "movapd %%xmm13, %%xmm7\n"
23032 "addpd %%xmm15, %%xmm5\n"
23033 "subpd %%xmm15, %%xmm7\n"
23034 "movapd %%xmm0, %%xmm8\n"
23035 "movapd %%xmm0, %%xmm12\n"
23036 "addpd %%xmm4, %%xmm8\n"
23037 "subpd %%xmm4, %%xmm12\n"
23038 "movapd %%xmm1, %%xmm9\n"
23039 "movapd %%xmm1, %%xmm13\n"
23040 "addpd %%xmm5, %%xmm9\n"
23041 "subpd %%xmm5, %%xmm13\n"
23042 "movapd %%xmm2, %%xmm10\n"
23043 "movapd %%xmm2, %%xmm14\n"
23044 "addpd %%xmm6, %%xmm10\n"
23045 "subpd %%xmm6, %%xmm14\n"
23046 "movapd %%xmm3, %%xmm11\n"
23047 "movapd %%xmm3, %%xmm15\n"
23048 "addpd %%xmm7, %%xmm11\n"
23049 "subpd %%xmm7, %%xmm15\n"
23050 "movupd %%xmm8, (%0)\n"
23051 "movupd %%xmm9, (%1)\n"
23052 "movupd %%xmm10, (%2)\n"
23053 "movupd %%xmm11, (%3)\n"
23054 "movupd %%xmm12, (%4)\n"
23055 "movupd %%xmm13, (%5)\n"
23056 "movupd %%xmm14, (%6)\n"
23057 "movupd %%xmm15, (%7)\n"
23058 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23059 );
23060 }
23061 }
23062 return;
23063 }
23064 }
void helper_double_26(double *buf);
/*
 * Public entry point for the 2^26-element double-precision transform.
 * Simply hands the buffer to the recursive kernel at its maximum depth;
 * buf must hold 67108864 (= 2^26) doubles and is transformed in place.
 */
void helper_double_26(double *buf) {
    const int full_depth = 26; /* log2 of the transform length */
    helper_double_26_recursive(buf, full_depth);
}
23069 void helper_double_27_recursive(double *buf, int depth);
helper_double_27_recursive(double * buf,int depth)23070 void helper_double_27_recursive(double *buf, int depth) {
23071 if (depth == 6) {
23072 for (int j = 0; j < 64; j += 16) {
23073 for (int k = 0; k < 2; k += 2) {
23074 __asm__ volatile (
23075 "movupd (%0), %%xmm0\n"
23076 "movupd (%1), %%xmm1\n"
23077 "movupd (%2), %%xmm2\n"
23078 "movupd (%3), %%xmm3\n"
23079 "movupd (%4), %%xmm4\n"
23080 "movupd (%5), %%xmm5\n"
23081 "movupd (%6), %%xmm6\n"
23082 "movupd (%7), %%xmm7\n"
23083 "movapd %%xmm0, %%xmm8\n"
23084 "haddpd %%xmm8, %%xmm8\n"
23085 "movapd %%xmm0, %%xmm9\n"
23086 "hsubpd %%xmm9, %%xmm9\n"
23087 "blendpd $1, %%xmm8, %%xmm9\n"
23088 "movapd %%xmm9, %%xmm0\n"
23089 "movapd %%xmm1, %%xmm8\n"
23090 "haddpd %%xmm8, %%xmm8\n"
23091 "movapd %%xmm1, %%xmm9\n"
23092 "hsubpd %%xmm9, %%xmm9\n"
23093 "blendpd $1, %%xmm8, %%xmm9\n"
23094 "movapd %%xmm9, %%xmm1\n"
23095 "movapd %%xmm2, %%xmm8\n"
23096 "haddpd %%xmm8, %%xmm8\n"
23097 "movapd %%xmm2, %%xmm9\n"
23098 "hsubpd %%xmm9, %%xmm9\n"
23099 "blendpd $1, %%xmm8, %%xmm9\n"
23100 "movapd %%xmm9, %%xmm2\n"
23101 "movapd %%xmm3, %%xmm8\n"
23102 "haddpd %%xmm8, %%xmm8\n"
23103 "movapd %%xmm3, %%xmm9\n"
23104 "hsubpd %%xmm9, %%xmm9\n"
23105 "blendpd $1, %%xmm8, %%xmm9\n"
23106 "movapd %%xmm9, %%xmm3\n"
23107 "movapd %%xmm4, %%xmm8\n"
23108 "haddpd %%xmm8, %%xmm8\n"
23109 "movapd %%xmm4, %%xmm9\n"
23110 "hsubpd %%xmm9, %%xmm9\n"
23111 "blendpd $1, %%xmm8, %%xmm9\n"
23112 "movapd %%xmm9, %%xmm4\n"
23113 "movapd %%xmm5, %%xmm8\n"
23114 "haddpd %%xmm8, %%xmm8\n"
23115 "movapd %%xmm5, %%xmm9\n"
23116 "hsubpd %%xmm9, %%xmm9\n"
23117 "blendpd $1, %%xmm8, %%xmm9\n"
23118 "movapd %%xmm9, %%xmm5\n"
23119 "movapd %%xmm6, %%xmm8\n"
23120 "haddpd %%xmm8, %%xmm8\n"
23121 "movapd %%xmm6, %%xmm9\n"
23122 "hsubpd %%xmm9, %%xmm9\n"
23123 "blendpd $1, %%xmm8, %%xmm9\n"
23124 "movapd %%xmm9, %%xmm6\n"
23125 "movapd %%xmm7, %%xmm8\n"
23126 "haddpd %%xmm8, %%xmm8\n"
23127 "movapd %%xmm7, %%xmm9\n"
23128 "hsubpd %%xmm9, %%xmm9\n"
23129 "blendpd $1, %%xmm8, %%xmm9\n"
23130 "movapd %%xmm9, %%xmm7\n"
23131 "movapd %%xmm0, %%xmm8\n"
23132 "movapd %%xmm0, %%xmm9\n"
23133 "addpd %%xmm1, %%xmm8\n"
23134 "subpd %%xmm1, %%xmm9\n"
23135 "movapd %%xmm2, %%xmm10\n"
23136 "movapd %%xmm2, %%xmm11\n"
23137 "addpd %%xmm3, %%xmm10\n"
23138 "subpd %%xmm3, %%xmm11\n"
23139 "movapd %%xmm4, %%xmm12\n"
23140 "movapd %%xmm4, %%xmm13\n"
23141 "addpd %%xmm5, %%xmm12\n"
23142 "subpd %%xmm5, %%xmm13\n"
23143 "movapd %%xmm6, %%xmm14\n"
23144 "movapd %%xmm6, %%xmm15\n"
23145 "addpd %%xmm7, %%xmm14\n"
23146 "subpd %%xmm7, %%xmm15\n"
23147 "movapd %%xmm8, %%xmm0\n"
23148 "movapd %%xmm8, %%xmm2\n"
23149 "addpd %%xmm10, %%xmm0\n"
23150 "subpd %%xmm10, %%xmm2\n"
23151 "movapd %%xmm9, %%xmm1\n"
23152 "movapd %%xmm9, %%xmm3\n"
23153 "addpd %%xmm11, %%xmm1\n"
23154 "subpd %%xmm11, %%xmm3\n"
23155 "movapd %%xmm12, %%xmm4\n"
23156 "movapd %%xmm12, %%xmm6\n"
23157 "addpd %%xmm14, %%xmm4\n"
23158 "subpd %%xmm14, %%xmm6\n"
23159 "movapd %%xmm13, %%xmm5\n"
23160 "movapd %%xmm13, %%xmm7\n"
23161 "addpd %%xmm15, %%xmm5\n"
23162 "subpd %%xmm15, %%xmm7\n"
23163 "movapd %%xmm0, %%xmm8\n"
23164 "movapd %%xmm0, %%xmm12\n"
23165 "addpd %%xmm4, %%xmm8\n"
23166 "subpd %%xmm4, %%xmm12\n"
23167 "movapd %%xmm1, %%xmm9\n"
23168 "movapd %%xmm1, %%xmm13\n"
23169 "addpd %%xmm5, %%xmm9\n"
23170 "subpd %%xmm5, %%xmm13\n"
23171 "movapd %%xmm2, %%xmm10\n"
23172 "movapd %%xmm2, %%xmm14\n"
23173 "addpd %%xmm6, %%xmm10\n"
23174 "subpd %%xmm6, %%xmm14\n"
23175 "movapd %%xmm3, %%xmm11\n"
23176 "movapd %%xmm3, %%xmm15\n"
23177 "addpd %%xmm7, %%xmm11\n"
23178 "subpd %%xmm7, %%xmm15\n"
23179 "movupd %%xmm8, (%0)\n"
23180 "movupd %%xmm9, (%1)\n"
23181 "movupd %%xmm10, (%2)\n"
23182 "movupd %%xmm11, (%3)\n"
23183 "movupd %%xmm12, (%4)\n"
23184 "movupd %%xmm13, (%5)\n"
23185 "movupd %%xmm14, (%6)\n"
23186 "movupd %%xmm15, (%7)\n"
23187 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6), "r"(buf + j + k + 8), "r"(buf + j + k + 10), "r"(buf + j + k + 12), "r"(buf + j + k + 14) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23188 );
23189 }
23190 }
23191 for (int j = 0; j < 64; j += 64) {
23192 for (int k = 0; k < 16; k += 2) {
23193 __asm__ volatile (
23194 "movupd (%0), %%xmm0\n"
23195 "movupd (%1), %%xmm1\n"
23196 "movupd (%2), %%xmm2\n"
23197 "movupd (%3), %%xmm3\n"
23198 "movapd %%xmm0, %%xmm8\n"
23199 "movapd %%xmm0, %%xmm9\n"
23200 "addpd %%xmm1, %%xmm8\n"
23201 "subpd %%xmm1, %%xmm9\n"
23202 "movapd %%xmm2, %%xmm10\n"
23203 "movapd %%xmm2, %%xmm11\n"
23204 "addpd %%xmm3, %%xmm10\n"
23205 "subpd %%xmm3, %%xmm11\n"
23206 "movapd %%xmm8, %%xmm0\n"
23207 "movapd %%xmm8, %%xmm2\n"
23208 "addpd %%xmm10, %%xmm0\n"
23209 "subpd %%xmm10, %%xmm2\n"
23210 "movapd %%xmm9, %%xmm1\n"
23211 "movapd %%xmm9, %%xmm3\n"
23212 "addpd %%xmm11, %%xmm1\n"
23213 "subpd %%xmm11, %%xmm3\n"
23214 "movupd %%xmm0, (%0)\n"
23215 "movupd %%xmm1, (%1)\n"
23216 "movupd %%xmm2, (%2)\n"
23217 "movupd %%xmm3, (%3)\n"
23218 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16), "r"(buf + j + k + 32), "r"(buf + j + k + 48) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23219 );
23220 }
23221 }
23222 return;
23223 }
23224 if (depth == 9) {
23225 helper_double_27_recursive(buf + 0, 6);
23226 helper_double_27_recursive(buf + 64, 6);
23227 helper_double_27_recursive(buf + 128, 6);
23228 helper_double_27_recursive(buf + 192, 6);
23229 helper_double_27_recursive(buf + 256, 6);
23230 helper_double_27_recursive(buf + 320, 6);
23231 helper_double_27_recursive(buf + 384, 6);
23232 helper_double_27_recursive(buf + 448, 6);
23233 for (int j = 0; j < 512; j += 512) {
23234 for (int k = 0; k < 64; k += 2) {
23235 __asm__ volatile (
23236 "movupd (%0), %%xmm0\n"
23237 "movupd (%1), %%xmm1\n"
23238 "movupd (%2), %%xmm2\n"
23239 "movupd (%3), %%xmm3\n"
23240 "movupd (%4), %%xmm4\n"
23241 "movupd (%5), %%xmm5\n"
23242 "movupd (%6), %%xmm6\n"
23243 "movupd (%7), %%xmm7\n"
23244 "movapd %%xmm0, %%xmm8\n"
23245 "movapd %%xmm0, %%xmm9\n"
23246 "addpd %%xmm1, %%xmm8\n"
23247 "subpd %%xmm1, %%xmm9\n"
23248 "movapd %%xmm2, %%xmm10\n"
23249 "movapd %%xmm2, %%xmm11\n"
23250 "addpd %%xmm3, %%xmm10\n"
23251 "subpd %%xmm3, %%xmm11\n"
23252 "movapd %%xmm4, %%xmm12\n"
23253 "movapd %%xmm4, %%xmm13\n"
23254 "addpd %%xmm5, %%xmm12\n"
23255 "subpd %%xmm5, %%xmm13\n"
23256 "movapd %%xmm6, %%xmm14\n"
23257 "movapd %%xmm6, %%xmm15\n"
23258 "addpd %%xmm7, %%xmm14\n"
23259 "subpd %%xmm7, %%xmm15\n"
23260 "movapd %%xmm8, %%xmm0\n"
23261 "movapd %%xmm8, %%xmm2\n"
23262 "addpd %%xmm10, %%xmm0\n"
23263 "subpd %%xmm10, %%xmm2\n"
23264 "movapd %%xmm9, %%xmm1\n"
23265 "movapd %%xmm9, %%xmm3\n"
23266 "addpd %%xmm11, %%xmm1\n"
23267 "subpd %%xmm11, %%xmm3\n"
23268 "movapd %%xmm12, %%xmm4\n"
23269 "movapd %%xmm12, %%xmm6\n"
23270 "addpd %%xmm14, %%xmm4\n"
23271 "subpd %%xmm14, %%xmm6\n"
23272 "movapd %%xmm13, %%xmm5\n"
23273 "movapd %%xmm13, %%xmm7\n"
23274 "addpd %%xmm15, %%xmm5\n"
23275 "subpd %%xmm15, %%xmm7\n"
23276 "movapd %%xmm0, %%xmm8\n"
23277 "movapd %%xmm0, %%xmm12\n"
23278 "addpd %%xmm4, %%xmm8\n"
23279 "subpd %%xmm4, %%xmm12\n"
23280 "movapd %%xmm1, %%xmm9\n"
23281 "movapd %%xmm1, %%xmm13\n"
23282 "addpd %%xmm5, %%xmm9\n"
23283 "subpd %%xmm5, %%xmm13\n"
23284 "movapd %%xmm2, %%xmm10\n"
23285 "movapd %%xmm2, %%xmm14\n"
23286 "addpd %%xmm6, %%xmm10\n"
23287 "subpd %%xmm6, %%xmm14\n"
23288 "movapd %%xmm3, %%xmm11\n"
23289 "movapd %%xmm3, %%xmm15\n"
23290 "addpd %%xmm7, %%xmm11\n"
23291 "subpd %%xmm7, %%xmm15\n"
23292 "movupd %%xmm8, (%0)\n"
23293 "movupd %%xmm9, (%1)\n"
23294 "movupd %%xmm10, (%2)\n"
23295 "movupd %%xmm11, (%3)\n"
23296 "movupd %%xmm12, (%4)\n"
23297 "movupd %%xmm13, (%5)\n"
23298 "movupd %%xmm14, (%6)\n"
23299 "movupd %%xmm15, (%7)\n"
23300 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23301 );
23302 }
23303 }
23304 return;
23305 }
23306 if (depth == 12) {
23307 helper_double_27_recursive(buf + 0, 9);
23308 helper_double_27_recursive(buf + 512, 9);
23309 helper_double_27_recursive(buf + 1024, 9);
23310 helper_double_27_recursive(buf + 1536, 9);
23311 helper_double_27_recursive(buf + 2048, 9);
23312 helper_double_27_recursive(buf + 2560, 9);
23313 helper_double_27_recursive(buf + 3072, 9);
23314 helper_double_27_recursive(buf + 3584, 9);
23315 for (int j = 0; j < 4096; j += 4096) {
23316 for (int k = 0; k < 512; k += 2) {
23317 __asm__ volatile (
23318 "movupd (%0), %%xmm0\n"
23319 "movupd (%1), %%xmm1\n"
23320 "movupd (%2), %%xmm2\n"
23321 "movupd (%3), %%xmm3\n"
23322 "movupd (%4), %%xmm4\n"
23323 "movupd (%5), %%xmm5\n"
23324 "movupd (%6), %%xmm6\n"
23325 "movupd (%7), %%xmm7\n"
23326 "movapd %%xmm0, %%xmm8\n"
23327 "movapd %%xmm0, %%xmm9\n"
23328 "addpd %%xmm1, %%xmm8\n"
23329 "subpd %%xmm1, %%xmm9\n"
23330 "movapd %%xmm2, %%xmm10\n"
23331 "movapd %%xmm2, %%xmm11\n"
23332 "addpd %%xmm3, %%xmm10\n"
23333 "subpd %%xmm3, %%xmm11\n"
23334 "movapd %%xmm4, %%xmm12\n"
23335 "movapd %%xmm4, %%xmm13\n"
23336 "addpd %%xmm5, %%xmm12\n"
23337 "subpd %%xmm5, %%xmm13\n"
23338 "movapd %%xmm6, %%xmm14\n"
23339 "movapd %%xmm6, %%xmm15\n"
23340 "addpd %%xmm7, %%xmm14\n"
23341 "subpd %%xmm7, %%xmm15\n"
23342 "movapd %%xmm8, %%xmm0\n"
23343 "movapd %%xmm8, %%xmm2\n"
23344 "addpd %%xmm10, %%xmm0\n"
23345 "subpd %%xmm10, %%xmm2\n"
23346 "movapd %%xmm9, %%xmm1\n"
23347 "movapd %%xmm9, %%xmm3\n"
23348 "addpd %%xmm11, %%xmm1\n"
23349 "subpd %%xmm11, %%xmm3\n"
23350 "movapd %%xmm12, %%xmm4\n"
23351 "movapd %%xmm12, %%xmm6\n"
23352 "addpd %%xmm14, %%xmm4\n"
23353 "subpd %%xmm14, %%xmm6\n"
23354 "movapd %%xmm13, %%xmm5\n"
23355 "movapd %%xmm13, %%xmm7\n"
23356 "addpd %%xmm15, %%xmm5\n"
23357 "subpd %%xmm15, %%xmm7\n"
23358 "movapd %%xmm0, %%xmm8\n"
23359 "movapd %%xmm0, %%xmm12\n"
23360 "addpd %%xmm4, %%xmm8\n"
23361 "subpd %%xmm4, %%xmm12\n"
23362 "movapd %%xmm1, %%xmm9\n"
23363 "movapd %%xmm1, %%xmm13\n"
23364 "addpd %%xmm5, %%xmm9\n"
23365 "subpd %%xmm5, %%xmm13\n"
23366 "movapd %%xmm2, %%xmm10\n"
23367 "movapd %%xmm2, %%xmm14\n"
23368 "addpd %%xmm6, %%xmm10\n"
23369 "subpd %%xmm6, %%xmm14\n"
23370 "movapd %%xmm3, %%xmm11\n"
23371 "movapd %%xmm3, %%xmm15\n"
23372 "addpd %%xmm7, %%xmm11\n"
23373 "subpd %%xmm7, %%xmm15\n"
23374 "movupd %%xmm8, (%0)\n"
23375 "movupd %%xmm9, (%1)\n"
23376 "movupd %%xmm10, (%2)\n"
23377 "movupd %%xmm11, (%3)\n"
23378 "movupd %%xmm12, (%4)\n"
23379 "movupd %%xmm13, (%5)\n"
23380 "movupd %%xmm14, (%6)\n"
23381 "movupd %%xmm15, (%7)\n"
23382 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23383 );
23384 }
23385 }
23386 return;
23387 }
23388 if (depth == 15) {
23389 helper_double_27_recursive(buf + 0, 12);
23390 helper_double_27_recursive(buf + 4096, 12);
23391 helper_double_27_recursive(buf + 8192, 12);
23392 helper_double_27_recursive(buf + 12288, 12);
23393 helper_double_27_recursive(buf + 16384, 12);
23394 helper_double_27_recursive(buf + 20480, 12);
23395 helper_double_27_recursive(buf + 24576, 12);
23396 helper_double_27_recursive(buf + 28672, 12);
23397 for (int j = 0; j < 32768; j += 32768) {
23398 for (int k = 0; k < 4096; k += 2) {
23399 __asm__ volatile (
23400 "movupd (%0), %%xmm0\n"
23401 "movupd (%1), %%xmm1\n"
23402 "movupd (%2), %%xmm2\n"
23403 "movupd (%3), %%xmm3\n"
23404 "movupd (%4), %%xmm4\n"
23405 "movupd (%5), %%xmm5\n"
23406 "movupd (%6), %%xmm6\n"
23407 "movupd (%7), %%xmm7\n"
23408 "movapd %%xmm0, %%xmm8\n"
23409 "movapd %%xmm0, %%xmm9\n"
23410 "addpd %%xmm1, %%xmm8\n"
23411 "subpd %%xmm1, %%xmm9\n"
23412 "movapd %%xmm2, %%xmm10\n"
23413 "movapd %%xmm2, %%xmm11\n"
23414 "addpd %%xmm3, %%xmm10\n"
23415 "subpd %%xmm3, %%xmm11\n"
23416 "movapd %%xmm4, %%xmm12\n"
23417 "movapd %%xmm4, %%xmm13\n"
23418 "addpd %%xmm5, %%xmm12\n"
23419 "subpd %%xmm5, %%xmm13\n"
23420 "movapd %%xmm6, %%xmm14\n"
23421 "movapd %%xmm6, %%xmm15\n"
23422 "addpd %%xmm7, %%xmm14\n"
23423 "subpd %%xmm7, %%xmm15\n"
23424 "movapd %%xmm8, %%xmm0\n"
23425 "movapd %%xmm8, %%xmm2\n"
23426 "addpd %%xmm10, %%xmm0\n"
23427 "subpd %%xmm10, %%xmm2\n"
23428 "movapd %%xmm9, %%xmm1\n"
23429 "movapd %%xmm9, %%xmm3\n"
23430 "addpd %%xmm11, %%xmm1\n"
23431 "subpd %%xmm11, %%xmm3\n"
23432 "movapd %%xmm12, %%xmm4\n"
23433 "movapd %%xmm12, %%xmm6\n"
23434 "addpd %%xmm14, %%xmm4\n"
23435 "subpd %%xmm14, %%xmm6\n"
23436 "movapd %%xmm13, %%xmm5\n"
23437 "movapd %%xmm13, %%xmm7\n"
23438 "addpd %%xmm15, %%xmm5\n"
23439 "subpd %%xmm15, %%xmm7\n"
23440 "movapd %%xmm0, %%xmm8\n"
23441 "movapd %%xmm0, %%xmm12\n"
23442 "addpd %%xmm4, %%xmm8\n"
23443 "subpd %%xmm4, %%xmm12\n"
23444 "movapd %%xmm1, %%xmm9\n"
23445 "movapd %%xmm1, %%xmm13\n"
23446 "addpd %%xmm5, %%xmm9\n"
23447 "subpd %%xmm5, %%xmm13\n"
23448 "movapd %%xmm2, %%xmm10\n"
23449 "movapd %%xmm2, %%xmm14\n"
23450 "addpd %%xmm6, %%xmm10\n"
23451 "subpd %%xmm6, %%xmm14\n"
23452 "movapd %%xmm3, %%xmm11\n"
23453 "movapd %%xmm3, %%xmm15\n"
23454 "addpd %%xmm7, %%xmm11\n"
23455 "subpd %%xmm7, %%xmm15\n"
23456 "movupd %%xmm8, (%0)\n"
23457 "movupd %%xmm9, (%1)\n"
23458 "movupd %%xmm10, (%2)\n"
23459 "movupd %%xmm11, (%3)\n"
23460 "movupd %%xmm12, (%4)\n"
23461 "movupd %%xmm13, (%5)\n"
23462 "movupd %%xmm14, (%6)\n"
23463 "movupd %%xmm15, (%7)\n"
23464 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23465 );
23466 }
23467 }
23468 return;
23469 }
23470 if (depth == 18) {
23471 helper_double_27_recursive(buf + 0, 15);
23472 helper_double_27_recursive(buf + 32768, 15);
23473 helper_double_27_recursive(buf + 65536, 15);
23474 helper_double_27_recursive(buf + 98304, 15);
23475 helper_double_27_recursive(buf + 131072, 15);
23476 helper_double_27_recursive(buf + 163840, 15);
23477 helper_double_27_recursive(buf + 196608, 15);
23478 helper_double_27_recursive(buf + 229376, 15);
23479 for (int j = 0; j < 262144; j += 262144) {
23480 for (int k = 0; k < 32768; k += 2) {
23481 __asm__ volatile (
23482 "movupd (%0), %%xmm0\n"
23483 "movupd (%1), %%xmm1\n"
23484 "movupd (%2), %%xmm2\n"
23485 "movupd (%3), %%xmm3\n"
23486 "movupd (%4), %%xmm4\n"
23487 "movupd (%5), %%xmm5\n"
23488 "movupd (%6), %%xmm6\n"
23489 "movupd (%7), %%xmm7\n"
23490 "movapd %%xmm0, %%xmm8\n"
23491 "movapd %%xmm0, %%xmm9\n"
23492 "addpd %%xmm1, %%xmm8\n"
23493 "subpd %%xmm1, %%xmm9\n"
23494 "movapd %%xmm2, %%xmm10\n"
23495 "movapd %%xmm2, %%xmm11\n"
23496 "addpd %%xmm3, %%xmm10\n"
23497 "subpd %%xmm3, %%xmm11\n"
23498 "movapd %%xmm4, %%xmm12\n"
23499 "movapd %%xmm4, %%xmm13\n"
23500 "addpd %%xmm5, %%xmm12\n"
23501 "subpd %%xmm5, %%xmm13\n"
23502 "movapd %%xmm6, %%xmm14\n"
23503 "movapd %%xmm6, %%xmm15\n"
23504 "addpd %%xmm7, %%xmm14\n"
23505 "subpd %%xmm7, %%xmm15\n"
23506 "movapd %%xmm8, %%xmm0\n"
23507 "movapd %%xmm8, %%xmm2\n"
23508 "addpd %%xmm10, %%xmm0\n"
23509 "subpd %%xmm10, %%xmm2\n"
23510 "movapd %%xmm9, %%xmm1\n"
23511 "movapd %%xmm9, %%xmm3\n"
23512 "addpd %%xmm11, %%xmm1\n"
23513 "subpd %%xmm11, %%xmm3\n"
23514 "movapd %%xmm12, %%xmm4\n"
23515 "movapd %%xmm12, %%xmm6\n"
23516 "addpd %%xmm14, %%xmm4\n"
23517 "subpd %%xmm14, %%xmm6\n"
23518 "movapd %%xmm13, %%xmm5\n"
23519 "movapd %%xmm13, %%xmm7\n"
23520 "addpd %%xmm15, %%xmm5\n"
23521 "subpd %%xmm15, %%xmm7\n"
23522 "movapd %%xmm0, %%xmm8\n"
23523 "movapd %%xmm0, %%xmm12\n"
23524 "addpd %%xmm4, %%xmm8\n"
23525 "subpd %%xmm4, %%xmm12\n"
23526 "movapd %%xmm1, %%xmm9\n"
23527 "movapd %%xmm1, %%xmm13\n"
23528 "addpd %%xmm5, %%xmm9\n"
23529 "subpd %%xmm5, %%xmm13\n"
23530 "movapd %%xmm2, %%xmm10\n"
23531 "movapd %%xmm2, %%xmm14\n"
23532 "addpd %%xmm6, %%xmm10\n"
23533 "subpd %%xmm6, %%xmm14\n"
23534 "movapd %%xmm3, %%xmm11\n"
23535 "movapd %%xmm3, %%xmm15\n"
23536 "addpd %%xmm7, %%xmm11\n"
23537 "subpd %%xmm7, %%xmm15\n"
23538 "movupd %%xmm8, (%0)\n"
23539 "movupd %%xmm9, (%1)\n"
23540 "movupd %%xmm10, (%2)\n"
23541 "movupd %%xmm11, (%3)\n"
23542 "movupd %%xmm12, (%4)\n"
23543 "movupd %%xmm13, (%5)\n"
23544 "movupd %%xmm14, (%6)\n"
23545 "movupd %%xmm15, (%7)\n"
23546 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23547 );
23548 }
23549 }
23550 return;
23551 }
23552 if (depth == 21) {
23553 helper_double_27_recursive(buf + 0, 18);
23554 helper_double_27_recursive(buf + 262144, 18);
23555 helper_double_27_recursive(buf + 524288, 18);
23556 helper_double_27_recursive(buf + 786432, 18);
23557 helper_double_27_recursive(buf + 1048576, 18);
23558 helper_double_27_recursive(buf + 1310720, 18);
23559 helper_double_27_recursive(buf + 1572864, 18);
23560 helper_double_27_recursive(buf + 1835008, 18);
23561 for (int j = 0; j < 2097152; j += 2097152) {
23562 for (int k = 0; k < 262144; k += 2) {
23563 __asm__ volatile (
23564 "movupd (%0), %%xmm0\n"
23565 "movupd (%1), %%xmm1\n"
23566 "movupd (%2), %%xmm2\n"
23567 "movupd (%3), %%xmm3\n"
23568 "movupd (%4), %%xmm4\n"
23569 "movupd (%5), %%xmm5\n"
23570 "movupd (%6), %%xmm6\n"
23571 "movupd (%7), %%xmm7\n"
23572 "movapd %%xmm0, %%xmm8\n"
23573 "movapd %%xmm0, %%xmm9\n"
23574 "addpd %%xmm1, %%xmm8\n"
23575 "subpd %%xmm1, %%xmm9\n"
23576 "movapd %%xmm2, %%xmm10\n"
23577 "movapd %%xmm2, %%xmm11\n"
23578 "addpd %%xmm3, %%xmm10\n"
23579 "subpd %%xmm3, %%xmm11\n"
23580 "movapd %%xmm4, %%xmm12\n"
23581 "movapd %%xmm4, %%xmm13\n"
23582 "addpd %%xmm5, %%xmm12\n"
23583 "subpd %%xmm5, %%xmm13\n"
23584 "movapd %%xmm6, %%xmm14\n"
23585 "movapd %%xmm6, %%xmm15\n"
23586 "addpd %%xmm7, %%xmm14\n"
23587 "subpd %%xmm7, %%xmm15\n"
23588 "movapd %%xmm8, %%xmm0\n"
23589 "movapd %%xmm8, %%xmm2\n"
23590 "addpd %%xmm10, %%xmm0\n"
23591 "subpd %%xmm10, %%xmm2\n"
23592 "movapd %%xmm9, %%xmm1\n"
23593 "movapd %%xmm9, %%xmm3\n"
23594 "addpd %%xmm11, %%xmm1\n"
23595 "subpd %%xmm11, %%xmm3\n"
23596 "movapd %%xmm12, %%xmm4\n"
23597 "movapd %%xmm12, %%xmm6\n"
23598 "addpd %%xmm14, %%xmm4\n"
23599 "subpd %%xmm14, %%xmm6\n"
23600 "movapd %%xmm13, %%xmm5\n"
23601 "movapd %%xmm13, %%xmm7\n"
23602 "addpd %%xmm15, %%xmm5\n"
23603 "subpd %%xmm15, %%xmm7\n"
23604 "movapd %%xmm0, %%xmm8\n"
23605 "movapd %%xmm0, %%xmm12\n"
23606 "addpd %%xmm4, %%xmm8\n"
23607 "subpd %%xmm4, %%xmm12\n"
23608 "movapd %%xmm1, %%xmm9\n"
23609 "movapd %%xmm1, %%xmm13\n"
23610 "addpd %%xmm5, %%xmm9\n"
23611 "subpd %%xmm5, %%xmm13\n"
23612 "movapd %%xmm2, %%xmm10\n"
23613 "movapd %%xmm2, %%xmm14\n"
23614 "addpd %%xmm6, %%xmm10\n"
23615 "subpd %%xmm6, %%xmm14\n"
23616 "movapd %%xmm3, %%xmm11\n"
23617 "movapd %%xmm3, %%xmm15\n"
23618 "addpd %%xmm7, %%xmm11\n"
23619 "subpd %%xmm7, %%xmm15\n"
23620 "movupd %%xmm8, (%0)\n"
23621 "movupd %%xmm9, (%1)\n"
23622 "movupd %%xmm10, (%2)\n"
23623 "movupd %%xmm11, (%3)\n"
23624 "movupd %%xmm12, (%4)\n"
23625 "movupd %%xmm13, (%5)\n"
23626 "movupd %%xmm14, (%6)\n"
23627 "movupd %%xmm15, (%7)\n"
23628 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23629 );
23630 }
23631 }
23632 return;
23633 }
23634 if (depth == 24) {
23635 helper_double_27_recursive(buf + 0, 21);
23636 helper_double_27_recursive(buf + 2097152, 21);
23637 helper_double_27_recursive(buf + 4194304, 21);
23638 helper_double_27_recursive(buf + 6291456, 21);
23639 helper_double_27_recursive(buf + 8388608, 21);
23640 helper_double_27_recursive(buf + 10485760, 21);
23641 helper_double_27_recursive(buf + 12582912, 21);
23642 helper_double_27_recursive(buf + 14680064, 21);
23643 for (int j = 0; j < 16777216; j += 16777216) {
23644 for (int k = 0; k < 2097152; k += 2) {
23645 __asm__ volatile (
23646 "movupd (%0), %%xmm0\n"
23647 "movupd (%1), %%xmm1\n"
23648 "movupd (%2), %%xmm2\n"
23649 "movupd (%3), %%xmm3\n"
23650 "movupd (%4), %%xmm4\n"
23651 "movupd (%5), %%xmm5\n"
23652 "movupd (%6), %%xmm6\n"
23653 "movupd (%7), %%xmm7\n"
23654 "movapd %%xmm0, %%xmm8\n"
23655 "movapd %%xmm0, %%xmm9\n"
23656 "addpd %%xmm1, %%xmm8\n"
23657 "subpd %%xmm1, %%xmm9\n"
23658 "movapd %%xmm2, %%xmm10\n"
23659 "movapd %%xmm2, %%xmm11\n"
23660 "addpd %%xmm3, %%xmm10\n"
23661 "subpd %%xmm3, %%xmm11\n"
23662 "movapd %%xmm4, %%xmm12\n"
23663 "movapd %%xmm4, %%xmm13\n"
23664 "addpd %%xmm5, %%xmm12\n"
23665 "subpd %%xmm5, %%xmm13\n"
23666 "movapd %%xmm6, %%xmm14\n"
23667 "movapd %%xmm6, %%xmm15\n"
23668 "addpd %%xmm7, %%xmm14\n"
23669 "subpd %%xmm7, %%xmm15\n"
23670 "movapd %%xmm8, %%xmm0\n"
23671 "movapd %%xmm8, %%xmm2\n"
23672 "addpd %%xmm10, %%xmm0\n"
23673 "subpd %%xmm10, %%xmm2\n"
23674 "movapd %%xmm9, %%xmm1\n"
23675 "movapd %%xmm9, %%xmm3\n"
23676 "addpd %%xmm11, %%xmm1\n"
23677 "subpd %%xmm11, %%xmm3\n"
23678 "movapd %%xmm12, %%xmm4\n"
23679 "movapd %%xmm12, %%xmm6\n"
23680 "addpd %%xmm14, %%xmm4\n"
23681 "subpd %%xmm14, %%xmm6\n"
23682 "movapd %%xmm13, %%xmm5\n"
23683 "movapd %%xmm13, %%xmm7\n"
23684 "addpd %%xmm15, %%xmm5\n"
23685 "subpd %%xmm15, %%xmm7\n"
23686 "movapd %%xmm0, %%xmm8\n"
23687 "movapd %%xmm0, %%xmm12\n"
23688 "addpd %%xmm4, %%xmm8\n"
23689 "subpd %%xmm4, %%xmm12\n"
23690 "movapd %%xmm1, %%xmm9\n"
23691 "movapd %%xmm1, %%xmm13\n"
23692 "addpd %%xmm5, %%xmm9\n"
23693 "subpd %%xmm5, %%xmm13\n"
23694 "movapd %%xmm2, %%xmm10\n"
23695 "movapd %%xmm2, %%xmm14\n"
23696 "addpd %%xmm6, %%xmm10\n"
23697 "subpd %%xmm6, %%xmm14\n"
23698 "movapd %%xmm3, %%xmm11\n"
23699 "movapd %%xmm3, %%xmm15\n"
23700 "addpd %%xmm7, %%xmm11\n"
23701 "subpd %%xmm7, %%xmm15\n"
23702 "movupd %%xmm8, (%0)\n"
23703 "movupd %%xmm9, (%1)\n"
23704 "movupd %%xmm10, (%2)\n"
23705 "movupd %%xmm11, (%3)\n"
23706 "movupd %%xmm12, (%4)\n"
23707 "movupd %%xmm13, (%5)\n"
23708 "movupd %%xmm14, (%6)\n"
23709 "movupd %%xmm15, (%7)\n"
23710 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23711 );
23712 }
23713 }
23714 return;
23715 }
23716 if (depth == 27) {
23717 helper_double_27_recursive(buf + 0, 24);
23718 helper_double_27_recursive(buf + 16777216, 24);
23719 helper_double_27_recursive(buf + 33554432, 24);
23720 helper_double_27_recursive(buf + 50331648, 24);
23721 helper_double_27_recursive(buf + 67108864, 24);
23722 helper_double_27_recursive(buf + 83886080, 24);
23723 helper_double_27_recursive(buf + 100663296, 24);
23724 helper_double_27_recursive(buf + 117440512, 24);
23725 for (int j = 0; j < 134217728; j += 134217728) {
23726 for (int k = 0; k < 16777216; k += 2) {
23727 __asm__ volatile (
23728 "movupd (%0), %%xmm0\n"
23729 "movupd (%1), %%xmm1\n"
23730 "movupd (%2), %%xmm2\n"
23731 "movupd (%3), %%xmm3\n"
23732 "movupd (%4), %%xmm4\n"
23733 "movupd (%5), %%xmm5\n"
23734 "movupd (%6), %%xmm6\n"
23735 "movupd (%7), %%xmm7\n"
23736 "movapd %%xmm0, %%xmm8\n"
23737 "movapd %%xmm0, %%xmm9\n"
23738 "addpd %%xmm1, %%xmm8\n"
23739 "subpd %%xmm1, %%xmm9\n"
23740 "movapd %%xmm2, %%xmm10\n"
23741 "movapd %%xmm2, %%xmm11\n"
23742 "addpd %%xmm3, %%xmm10\n"
23743 "subpd %%xmm3, %%xmm11\n"
23744 "movapd %%xmm4, %%xmm12\n"
23745 "movapd %%xmm4, %%xmm13\n"
23746 "addpd %%xmm5, %%xmm12\n"
23747 "subpd %%xmm5, %%xmm13\n"
23748 "movapd %%xmm6, %%xmm14\n"
23749 "movapd %%xmm6, %%xmm15\n"
23750 "addpd %%xmm7, %%xmm14\n"
23751 "subpd %%xmm7, %%xmm15\n"
23752 "movapd %%xmm8, %%xmm0\n"
23753 "movapd %%xmm8, %%xmm2\n"
23754 "addpd %%xmm10, %%xmm0\n"
23755 "subpd %%xmm10, %%xmm2\n"
23756 "movapd %%xmm9, %%xmm1\n"
23757 "movapd %%xmm9, %%xmm3\n"
23758 "addpd %%xmm11, %%xmm1\n"
23759 "subpd %%xmm11, %%xmm3\n"
23760 "movapd %%xmm12, %%xmm4\n"
23761 "movapd %%xmm12, %%xmm6\n"
23762 "addpd %%xmm14, %%xmm4\n"
23763 "subpd %%xmm14, %%xmm6\n"
23764 "movapd %%xmm13, %%xmm5\n"
23765 "movapd %%xmm13, %%xmm7\n"
23766 "addpd %%xmm15, %%xmm5\n"
23767 "subpd %%xmm15, %%xmm7\n"
23768 "movapd %%xmm0, %%xmm8\n"
23769 "movapd %%xmm0, %%xmm12\n"
23770 "addpd %%xmm4, %%xmm8\n"
23771 "subpd %%xmm4, %%xmm12\n"
23772 "movapd %%xmm1, %%xmm9\n"
23773 "movapd %%xmm1, %%xmm13\n"
23774 "addpd %%xmm5, %%xmm9\n"
23775 "subpd %%xmm5, %%xmm13\n"
23776 "movapd %%xmm2, %%xmm10\n"
23777 "movapd %%xmm2, %%xmm14\n"
23778 "addpd %%xmm6, %%xmm10\n"
23779 "subpd %%xmm6, %%xmm14\n"
23780 "movapd %%xmm3, %%xmm11\n"
23781 "movapd %%xmm3, %%xmm15\n"
23782 "addpd %%xmm7, %%xmm11\n"
23783 "subpd %%xmm7, %%xmm15\n"
23784 "movupd %%xmm8, (%0)\n"
23785 "movupd %%xmm9, (%1)\n"
23786 "movupd %%xmm10, (%2)\n"
23787 "movupd %%xmm11, (%3)\n"
23788 "movupd %%xmm12, (%4)\n"
23789 "movupd %%xmm13, (%5)\n"
23790 "movupd %%xmm14, (%6)\n"
23791 "movupd %%xmm15, (%7)\n"
23792 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
23793 );
23794 }
23795 }
23796 return;
23797 }
23798 }
void helper_double_27(double *buf);
/*
 * In-place unnormalized Fast Hadamard Transform of a contiguous block of
 * 2^27 doubles. Thin entry point: all work is delegated to the recursive
 * worker with the full depth.
 */
void helper_double_27(double *buf) {
    enum { LOG2_N = 27 }; /* transform size is 2^27 elements */
    helper_double_27_recursive(buf, LOG2_N);
}
void helper_double_28_recursive(double *buf, int depth);
/*
 * In-place unnormalized Fast Hadamard Transform over 2^depth doubles.
 *
 * buf   - pointer to 2^depth contiguous doubles; overwritten with the result.
 * depth - log2 of the transform length. The original generated code only
 *         handled depth in {14, 17, 20, 23, 26, 28} and silently did nothing
 *         otherwise; this version generalizes to any depth >= 0 (depth == 0
 *         is a no-op, as a 1-point transform is the identity).
 *
 * Results are bit-identical to the unrolled SSE version: every output of a
 * Hadamard butterfly level depends only on the previous level's values at
 * the same pair of positions, so regrouping the independent add/sub
 * operations (recursion vs. unrolling vs. plain loops) cannot change any
 * floating-point result.
 *
 * Structure: for large inputs, recurse radix-8 on eight sub-blocks (keeps
 * the working set cache-resident, mirroring the original's blocking), then
 * apply the remaining three butterfly levels. Small inputs (depth <= 14,
 * i.e. <= 128 KiB) are handled by the classic iterative FHT, which modern
 * compilers auto-vectorize.
 */
void helper_double_28_recursive(double *buf, int depth) {
    const size_t n = (size_t)1 << depth;

    if (depth <= 14) {
        /* Iterative base case: one butterfly pass per level, stride h. */
        for (size_t h = 1; h < n; h <<= 1) {
            for (size_t i = 0; i < n; i += h << 1) {
                for (size_t j = i; j < i + h; ++j) {
                    double u = buf[j];
                    double v = buf[j + h];
                    buf[j] = u + v;
                    buf[j + h] = u - v;
                }
            }
        }
        return;
    }

    /* Radix-8 decomposition: transform eight sub-blocks of depth-3 ... */
    const size_t eighth = n >> 3;
    for (int i = 0; i < 8; ++i) {
        helper_double_28_recursive(buf + (size_t)i * eighth, depth - 3);
    }

    /* ... then combine them with the top three butterfly levels. */
    for (size_t h = eighth; h < n; h <<= 1) {
        for (size_t i = 0; i < n; i += h << 1) {
            for (size_t j = i; j < i + h; ++j) {
                double u = buf[j];
                double v = buf[j + h];
                buf[j] = u + v;
                buf[j + h] = u - v;
            }
        }
    }
}
void helper_double_28(double *buf);
/*
 * In-place unnormalized Fast Hadamard Transform of a contiguous block of
 * 2^28 doubles. Thin entry point: all work is delegated to the recursive
 * worker with the full depth.
 */
void helper_double_28(double *buf) {
    enum { LOG2_N = 28 }; /* transform size is 2^28 elements */
    helper_double_28_recursive(buf, LOG2_N);
}
void helper_double_29_recursive(double *buf, int depth);
/*
 * In-place Walsh–Hadamard transform of 2^depth doubles.
 *
 * Rewritten as the classic iterative radix-2 butterfly network.  The
 * original unrolled SSE version evaluates, for every output element,
 * the exact same binary tree of additions and subtractions that the
 * passes below perform: its haddpd/hsubpd/blendpd sequence is the h=1
 * pairwise butterfly, its radix-4 combine is two fused radix-2 levels,
 * and its radix-8 combine is three.  Double-precision add/sub are
 * deterministic, so the floating-point results are bit-identical.
 *
 * Only the depths the generated dispatch chain handles (9, 12, 15, 18,
 * 21, 24, 27, 29) are processed; for any other depth the original
 * falls off the end of its if-chain without touching the buffer, and
 * this version does the same.
 */
void helper_double_29_recursive(double *buf, int depth) {
    switch (depth) {
    case 9:
    case 12:
    case 15:
    case 18:
    case 21:
    case 24:
    case 27:
    case 29:
        break;
    default:
        /* Unrecognized depth: leave buf untouched, like the original. */
        return;
    }
    size_t n = (size_t)1 << depth;
    /* One pass per bit of the index: butterfly elements h apart. */
    for (size_t h = 1; h < n; h <<= 1) {
        for (size_t base = 0; base < n; base += h << 1) {
            for (size_t i = base; i < base + h; ++i) {
                double u = buf[i];
                double v = buf[i + h];
                buf[i] = u + v;
                buf[i + h] = u - v;
            }
        }
    }
}
void helper_double_29(double *buf);
/*
 * Public entry point for the 2^29-point in-place transform.
 * buf must point to 536870912 doubles; all work is delegated to the
 * recursive driver, started at its top depth of 29.
 */
void helper_double_29(double *buf) {
    helper_double_29_recursive(buf, 29);
}
25287 void helper_double_30_recursive(double *buf, int depth);
helper_double_30_recursive(double * buf,int depth)25288 void helper_double_30_recursive(double *buf, int depth) {
25289 if (depth == 3) {
25290 for (int j = 0; j < 8; j += 8) {
25291 for (int k = 0; k < 2; k += 2) {
25292 __asm__ volatile (
25293 "movupd (%0), %%xmm0\n"
25294 "movupd (%1), %%xmm1\n"
25295 "movupd (%2), %%xmm2\n"
25296 "movupd (%3), %%xmm3\n"
25297 "movapd %%xmm0, %%xmm8\n"
25298 "haddpd %%xmm8, %%xmm8\n"
25299 "movapd %%xmm0, %%xmm9\n"
25300 "hsubpd %%xmm9, %%xmm9\n"
25301 "blendpd $1, %%xmm8, %%xmm9\n"
25302 "movapd %%xmm9, %%xmm0\n"
25303 "movapd %%xmm1, %%xmm8\n"
25304 "haddpd %%xmm8, %%xmm8\n"
25305 "movapd %%xmm1, %%xmm9\n"
25306 "hsubpd %%xmm9, %%xmm9\n"
25307 "blendpd $1, %%xmm8, %%xmm9\n"
25308 "movapd %%xmm9, %%xmm1\n"
25309 "movapd %%xmm2, %%xmm8\n"
25310 "haddpd %%xmm8, %%xmm8\n"
25311 "movapd %%xmm2, %%xmm9\n"
25312 "hsubpd %%xmm9, %%xmm9\n"
25313 "blendpd $1, %%xmm8, %%xmm9\n"
25314 "movapd %%xmm9, %%xmm2\n"
25315 "movapd %%xmm3, %%xmm8\n"
25316 "haddpd %%xmm8, %%xmm8\n"
25317 "movapd %%xmm3, %%xmm9\n"
25318 "hsubpd %%xmm9, %%xmm9\n"
25319 "blendpd $1, %%xmm8, %%xmm9\n"
25320 "movapd %%xmm9, %%xmm3\n"
25321 "movapd %%xmm0, %%xmm8\n"
25322 "movapd %%xmm0, %%xmm9\n"
25323 "addpd %%xmm1, %%xmm8\n"
25324 "subpd %%xmm1, %%xmm9\n"
25325 "movapd %%xmm2, %%xmm10\n"
25326 "movapd %%xmm2, %%xmm11\n"
25327 "addpd %%xmm3, %%xmm10\n"
25328 "subpd %%xmm3, %%xmm11\n"
25329 "movapd %%xmm8, %%xmm0\n"
25330 "movapd %%xmm8, %%xmm2\n"
25331 "addpd %%xmm10, %%xmm0\n"
25332 "subpd %%xmm10, %%xmm2\n"
25333 "movapd %%xmm9, %%xmm1\n"
25334 "movapd %%xmm9, %%xmm3\n"
25335 "addpd %%xmm11, %%xmm1\n"
25336 "subpd %%xmm11, %%xmm3\n"
25337 "movupd %%xmm0, (%0)\n"
25338 "movupd %%xmm1, (%1)\n"
25339 "movupd %%xmm2, (%2)\n"
25340 "movupd %%xmm3, (%3)\n"
25341 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2), "r"(buf + j + k + 4), "r"(buf + j + k + 6) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25342 );
25343 }
25344 }
25345 return;
25346 }
25347 if (depth == 6) {
25348 helper_double_30_recursive(buf + 0, 3);
25349 helper_double_30_recursive(buf + 8, 3);
25350 helper_double_30_recursive(buf + 16, 3);
25351 helper_double_30_recursive(buf + 24, 3);
25352 helper_double_30_recursive(buf + 32, 3);
25353 helper_double_30_recursive(buf + 40, 3);
25354 helper_double_30_recursive(buf + 48, 3);
25355 helper_double_30_recursive(buf + 56, 3);
25356 for (int j = 0; j < 64; j += 64) {
25357 for (int k = 0; k < 8; k += 2) {
25358 __asm__ volatile (
25359 "movupd (%0), %%xmm0\n"
25360 "movupd (%1), %%xmm1\n"
25361 "movupd (%2), %%xmm2\n"
25362 "movupd (%3), %%xmm3\n"
25363 "movupd (%4), %%xmm4\n"
25364 "movupd (%5), %%xmm5\n"
25365 "movupd (%6), %%xmm6\n"
25366 "movupd (%7), %%xmm7\n"
25367 "movapd %%xmm0, %%xmm8\n"
25368 "movapd %%xmm0, %%xmm9\n"
25369 "addpd %%xmm1, %%xmm8\n"
25370 "subpd %%xmm1, %%xmm9\n"
25371 "movapd %%xmm2, %%xmm10\n"
25372 "movapd %%xmm2, %%xmm11\n"
25373 "addpd %%xmm3, %%xmm10\n"
25374 "subpd %%xmm3, %%xmm11\n"
25375 "movapd %%xmm4, %%xmm12\n"
25376 "movapd %%xmm4, %%xmm13\n"
25377 "addpd %%xmm5, %%xmm12\n"
25378 "subpd %%xmm5, %%xmm13\n"
25379 "movapd %%xmm6, %%xmm14\n"
25380 "movapd %%xmm6, %%xmm15\n"
25381 "addpd %%xmm7, %%xmm14\n"
25382 "subpd %%xmm7, %%xmm15\n"
25383 "movapd %%xmm8, %%xmm0\n"
25384 "movapd %%xmm8, %%xmm2\n"
25385 "addpd %%xmm10, %%xmm0\n"
25386 "subpd %%xmm10, %%xmm2\n"
25387 "movapd %%xmm9, %%xmm1\n"
25388 "movapd %%xmm9, %%xmm3\n"
25389 "addpd %%xmm11, %%xmm1\n"
25390 "subpd %%xmm11, %%xmm3\n"
25391 "movapd %%xmm12, %%xmm4\n"
25392 "movapd %%xmm12, %%xmm6\n"
25393 "addpd %%xmm14, %%xmm4\n"
25394 "subpd %%xmm14, %%xmm6\n"
25395 "movapd %%xmm13, %%xmm5\n"
25396 "movapd %%xmm13, %%xmm7\n"
25397 "addpd %%xmm15, %%xmm5\n"
25398 "subpd %%xmm15, %%xmm7\n"
25399 "movapd %%xmm0, %%xmm8\n"
25400 "movapd %%xmm0, %%xmm12\n"
25401 "addpd %%xmm4, %%xmm8\n"
25402 "subpd %%xmm4, %%xmm12\n"
25403 "movapd %%xmm1, %%xmm9\n"
25404 "movapd %%xmm1, %%xmm13\n"
25405 "addpd %%xmm5, %%xmm9\n"
25406 "subpd %%xmm5, %%xmm13\n"
25407 "movapd %%xmm2, %%xmm10\n"
25408 "movapd %%xmm2, %%xmm14\n"
25409 "addpd %%xmm6, %%xmm10\n"
25410 "subpd %%xmm6, %%xmm14\n"
25411 "movapd %%xmm3, %%xmm11\n"
25412 "movapd %%xmm3, %%xmm15\n"
25413 "addpd %%xmm7, %%xmm11\n"
25414 "subpd %%xmm7, %%xmm15\n"
25415 "movupd %%xmm8, (%0)\n"
25416 "movupd %%xmm9, (%1)\n"
25417 "movupd %%xmm10, (%2)\n"
25418 "movupd %%xmm11, (%3)\n"
25419 "movupd %%xmm12, (%4)\n"
25420 "movupd %%xmm13, (%5)\n"
25421 "movupd %%xmm14, (%6)\n"
25422 "movupd %%xmm15, (%7)\n"
25423 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25424 );
25425 }
25426 }
25427 return;
25428 }
25429 if (depth == 9) {
25430 helper_double_30_recursive(buf + 0, 6);
25431 helper_double_30_recursive(buf + 64, 6);
25432 helper_double_30_recursive(buf + 128, 6);
25433 helper_double_30_recursive(buf + 192, 6);
25434 helper_double_30_recursive(buf + 256, 6);
25435 helper_double_30_recursive(buf + 320, 6);
25436 helper_double_30_recursive(buf + 384, 6);
25437 helper_double_30_recursive(buf + 448, 6);
25438 for (int j = 0; j < 512; j += 512) {
25439 for (int k = 0; k < 64; k += 2) {
25440 __asm__ volatile (
25441 "movupd (%0), %%xmm0\n"
25442 "movupd (%1), %%xmm1\n"
25443 "movupd (%2), %%xmm2\n"
25444 "movupd (%3), %%xmm3\n"
25445 "movupd (%4), %%xmm4\n"
25446 "movupd (%5), %%xmm5\n"
25447 "movupd (%6), %%xmm6\n"
25448 "movupd (%7), %%xmm7\n"
25449 "movapd %%xmm0, %%xmm8\n"
25450 "movapd %%xmm0, %%xmm9\n"
25451 "addpd %%xmm1, %%xmm8\n"
25452 "subpd %%xmm1, %%xmm9\n"
25453 "movapd %%xmm2, %%xmm10\n"
25454 "movapd %%xmm2, %%xmm11\n"
25455 "addpd %%xmm3, %%xmm10\n"
25456 "subpd %%xmm3, %%xmm11\n"
25457 "movapd %%xmm4, %%xmm12\n"
25458 "movapd %%xmm4, %%xmm13\n"
25459 "addpd %%xmm5, %%xmm12\n"
25460 "subpd %%xmm5, %%xmm13\n"
25461 "movapd %%xmm6, %%xmm14\n"
25462 "movapd %%xmm6, %%xmm15\n"
25463 "addpd %%xmm7, %%xmm14\n"
25464 "subpd %%xmm7, %%xmm15\n"
25465 "movapd %%xmm8, %%xmm0\n"
25466 "movapd %%xmm8, %%xmm2\n"
25467 "addpd %%xmm10, %%xmm0\n"
25468 "subpd %%xmm10, %%xmm2\n"
25469 "movapd %%xmm9, %%xmm1\n"
25470 "movapd %%xmm9, %%xmm3\n"
25471 "addpd %%xmm11, %%xmm1\n"
25472 "subpd %%xmm11, %%xmm3\n"
25473 "movapd %%xmm12, %%xmm4\n"
25474 "movapd %%xmm12, %%xmm6\n"
25475 "addpd %%xmm14, %%xmm4\n"
25476 "subpd %%xmm14, %%xmm6\n"
25477 "movapd %%xmm13, %%xmm5\n"
25478 "movapd %%xmm13, %%xmm7\n"
25479 "addpd %%xmm15, %%xmm5\n"
25480 "subpd %%xmm15, %%xmm7\n"
25481 "movapd %%xmm0, %%xmm8\n"
25482 "movapd %%xmm0, %%xmm12\n"
25483 "addpd %%xmm4, %%xmm8\n"
25484 "subpd %%xmm4, %%xmm12\n"
25485 "movapd %%xmm1, %%xmm9\n"
25486 "movapd %%xmm1, %%xmm13\n"
25487 "addpd %%xmm5, %%xmm9\n"
25488 "subpd %%xmm5, %%xmm13\n"
25489 "movapd %%xmm2, %%xmm10\n"
25490 "movapd %%xmm2, %%xmm14\n"
25491 "addpd %%xmm6, %%xmm10\n"
25492 "subpd %%xmm6, %%xmm14\n"
25493 "movapd %%xmm3, %%xmm11\n"
25494 "movapd %%xmm3, %%xmm15\n"
25495 "addpd %%xmm7, %%xmm11\n"
25496 "subpd %%xmm7, %%xmm15\n"
25497 "movupd %%xmm8, (%0)\n"
25498 "movupd %%xmm9, (%1)\n"
25499 "movupd %%xmm10, (%2)\n"
25500 "movupd %%xmm11, (%3)\n"
25501 "movupd %%xmm12, (%4)\n"
25502 "movupd %%xmm13, (%5)\n"
25503 "movupd %%xmm14, (%6)\n"
25504 "movupd %%xmm15, (%7)\n"
25505 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25506 );
25507 }
25508 }
25509 return;
25510 }
25511 if (depth == 12) {
25512 helper_double_30_recursive(buf + 0, 9);
25513 helper_double_30_recursive(buf + 512, 9);
25514 helper_double_30_recursive(buf + 1024, 9);
25515 helper_double_30_recursive(buf + 1536, 9);
25516 helper_double_30_recursive(buf + 2048, 9);
25517 helper_double_30_recursive(buf + 2560, 9);
25518 helper_double_30_recursive(buf + 3072, 9);
25519 helper_double_30_recursive(buf + 3584, 9);
25520 for (int j = 0; j < 4096; j += 4096) {
25521 for (int k = 0; k < 512; k += 2) {
25522 __asm__ volatile (
25523 "movupd (%0), %%xmm0\n"
25524 "movupd (%1), %%xmm1\n"
25525 "movupd (%2), %%xmm2\n"
25526 "movupd (%3), %%xmm3\n"
25527 "movupd (%4), %%xmm4\n"
25528 "movupd (%5), %%xmm5\n"
25529 "movupd (%6), %%xmm6\n"
25530 "movupd (%7), %%xmm7\n"
25531 "movapd %%xmm0, %%xmm8\n"
25532 "movapd %%xmm0, %%xmm9\n"
25533 "addpd %%xmm1, %%xmm8\n"
25534 "subpd %%xmm1, %%xmm9\n"
25535 "movapd %%xmm2, %%xmm10\n"
25536 "movapd %%xmm2, %%xmm11\n"
25537 "addpd %%xmm3, %%xmm10\n"
25538 "subpd %%xmm3, %%xmm11\n"
25539 "movapd %%xmm4, %%xmm12\n"
25540 "movapd %%xmm4, %%xmm13\n"
25541 "addpd %%xmm5, %%xmm12\n"
25542 "subpd %%xmm5, %%xmm13\n"
25543 "movapd %%xmm6, %%xmm14\n"
25544 "movapd %%xmm6, %%xmm15\n"
25545 "addpd %%xmm7, %%xmm14\n"
25546 "subpd %%xmm7, %%xmm15\n"
25547 "movapd %%xmm8, %%xmm0\n"
25548 "movapd %%xmm8, %%xmm2\n"
25549 "addpd %%xmm10, %%xmm0\n"
25550 "subpd %%xmm10, %%xmm2\n"
25551 "movapd %%xmm9, %%xmm1\n"
25552 "movapd %%xmm9, %%xmm3\n"
25553 "addpd %%xmm11, %%xmm1\n"
25554 "subpd %%xmm11, %%xmm3\n"
25555 "movapd %%xmm12, %%xmm4\n"
25556 "movapd %%xmm12, %%xmm6\n"
25557 "addpd %%xmm14, %%xmm4\n"
25558 "subpd %%xmm14, %%xmm6\n"
25559 "movapd %%xmm13, %%xmm5\n"
25560 "movapd %%xmm13, %%xmm7\n"
25561 "addpd %%xmm15, %%xmm5\n"
25562 "subpd %%xmm15, %%xmm7\n"
25563 "movapd %%xmm0, %%xmm8\n"
25564 "movapd %%xmm0, %%xmm12\n"
25565 "addpd %%xmm4, %%xmm8\n"
25566 "subpd %%xmm4, %%xmm12\n"
25567 "movapd %%xmm1, %%xmm9\n"
25568 "movapd %%xmm1, %%xmm13\n"
25569 "addpd %%xmm5, %%xmm9\n"
25570 "subpd %%xmm5, %%xmm13\n"
25571 "movapd %%xmm2, %%xmm10\n"
25572 "movapd %%xmm2, %%xmm14\n"
25573 "addpd %%xmm6, %%xmm10\n"
25574 "subpd %%xmm6, %%xmm14\n"
25575 "movapd %%xmm3, %%xmm11\n"
25576 "movapd %%xmm3, %%xmm15\n"
25577 "addpd %%xmm7, %%xmm11\n"
25578 "subpd %%xmm7, %%xmm15\n"
25579 "movupd %%xmm8, (%0)\n"
25580 "movupd %%xmm9, (%1)\n"
25581 "movupd %%xmm10, (%2)\n"
25582 "movupd %%xmm11, (%3)\n"
25583 "movupd %%xmm12, (%4)\n"
25584 "movupd %%xmm13, (%5)\n"
25585 "movupd %%xmm14, (%6)\n"
25586 "movupd %%xmm15, (%7)\n"
25587 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25588 );
25589 }
25590 }
25591 return;
25592 }
25593 if (depth == 15) {
25594 helper_double_30_recursive(buf + 0, 12);
25595 helper_double_30_recursive(buf + 4096, 12);
25596 helper_double_30_recursive(buf + 8192, 12);
25597 helper_double_30_recursive(buf + 12288, 12);
25598 helper_double_30_recursive(buf + 16384, 12);
25599 helper_double_30_recursive(buf + 20480, 12);
25600 helper_double_30_recursive(buf + 24576, 12);
25601 helper_double_30_recursive(buf + 28672, 12);
25602 for (int j = 0; j < 32768; j += 32768) {
25603 for (int k = 0; k < 4096; k += 2) {
25604 __asm__ volatile (
25605 "movupd (%0), %%xmm0\n"
25606 "movupd (%1), %%xmm1\n"
25607 "movupd (%2), %%xmm2\n"
25608 "movupd (%3), %%xmm3\n"
25609 "movupd (%4), %%xmm4\n"
25610 "movupd (%5), %%xmm5\n"
25611 "movupd (%6), %%xmm6\n"
25612 "movupd (%7), %%xmm7\n"
25613 "movapd %%xmm0, %%xmm8\n"
25614 "movapd %%xmm0, %%xmm9\n"
25615 "addpd %%xmm1, %%xmm8\n"
25616 "subpd %%xmm1, %%xmm9\n"
25617 "movapd %%xmm2, %%xmm10\n"
25618 "movapd %%xmm2, %%xmm11\n"
25619 "addpd %%xmm3, %%xmm10\n"
25620 "subpd %%xmm3, %%xmm11\n"
25621 "movapd %%xmm4, %%xmm12\n"
25622 "movapd %%xmm4, %%xmm13\n"
25623 "addpd %%xmm5, %%xmm12\n"
25624 "subpd %%xmm5, %%xmm13\n"
25625 "movapd %%xmm6, %%xmm14\n"
25626 "movapd %%xmm6, %%xmm15\n"
25627 "addpd %%xmm7, %%xmm14\n"
25628 "subpd %%xmm7, %%xmm15\n"
25629 "movapd %%xmm8, %%xmm0\n"
25630 "movapd %%xmm8, %%xmm2\n"
25631 "addpd %%xmm10, %%xmm0\n"
25632 "subpd %%xmm10, %%xmm2\n"
25633 "movapd %%xmm9, %%xmm1\n"
25634 "movapd %%xmm9, %%xmm3\n"
25635 "addpd %%xmm11, %%xmm1\n"
25636 "subpd %%xmm11, %%xmm3\n"
25637 "movapd %%xmm12, %%xmm4\n"
25638 "movapd %%xmm12, %%xmm6\n"
25639 "addpd %%xmm14, %%xmm4\n"
25640 "subpd %%xmm14, %%xmm6\n"
25641 "movapd %%xmm13, %%xmm5\n"
25642 "movapd %%xmm13, %%xmm7\n"
25643 "addpd %%xmm15, %%xmm5\n"
25644 "subpd %%xmm15, %%xmm7\n"
25645 "movapd %%xmm0, %%xmm8\n"
25646 "movapd %%xmm0, %%xmm12\n"
25647 "addpd %%xmm4, %%xmm8\n"
25648 "subpd %%xmm4, %%xmm12\n"
25649 "movapd %%xmm1, %%xmm9\n"
25650 "movapd %%xmm1, %%xmm13\n"
25651 "addpd %%xmm5, %%xmm9\n"
25652 "subpd %%xmm5, %%xmm13\n"
25653 "movapd %%xmm2, %%xmm10\n"
25654 "movapd %%xmm2, %%xmm14\n"
25655 "addpd %%xmm6, %%xmm10\n"
25656 "subpd %%xmm6, %%xmm14\n"
25657 "movapd %%xmm3, %%xmm11\n"
25658 "movapd %%xmm3, %%xmm15\n"
25659 "addpd %%xmm7, %%xmm11\n"
25660 "subpd %%xmm7, %%xmm15\n"
25661 "movupd %%xmm8, (%0)\n"
25662 "movupd %%xmm9, (%1)\n"
25663 "movupd %%xmm10, (%2)\n"
25664 "movupd %%xmm11, (%3)\n"
25665 "movupd %%xmm12, (%4)\n"
25666 "movupd %%xmm13, (%5)\n"
25667 "movupd %%xmm14, (%6)\n"
25668 "movupd %%xmm15, (%7)\n"
25669 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25670 );
25671 }
25672 }
25673 return;
25674 }
25675 if (depth == 18) {
25676 helper_double_30_recursive(buf + 0, 15);
25677 helper_double_30_recursive(buf + 32768, 15);
25678 helper_double_30_recursive(buf + 65536, 15);
25679 helper_double_30_recursive(buf + 98304, 15);
25680 helper_double_30_recursive(buf + 131072, 15);
25681 helper_double_30_recursive(buf + 163840, 15);
25682 helper_double_30_recursive(buf + 196608, 15);
25683 helper_double_30_recursive(buf + 229376, 15);
25684 for (int j = 0; j < 262144; j += 262144) {
25685 for (int k = 0; k < 32768; k += 2) {
25686 __asm__ volatile (
25687 "movupd (%0), %%xmm0\n"
25688 "movupd (%1), %%xmm1\n"
25689 "movupd (%2), %%xmm2\n"
25690 "movupd (%3), %%xmm3\n"
25691 "movupd (%4), %%xmm4\n"
25692 "movupd (%5), %%xmm5\n"
25693 "movupd (%6), %%xmm6\n"
25694 "movupd (%7), %%xmm7\n"
25695 "movapd %%xmm0, %%xmm8\n"
25696 "movapd %%xmm0, %%xmm9\n"
25697 "addpd %%xmm1, %%xmm8\n"
25698 "subpd %%xmm1, %%xmm9\n"
25699 "movapd %%xmm2, %%xmm10\n"
25700 "movapd %%xmm2, %%xmm11\n"
25701 "addpd %%xmm3, %%xmm10\n"
25702 "subpd %%xmm3, %%xmm11\n"
25703 "movapd %%xmm4, %%xmm12\n"
25704 "movapd %%xmm4, %%xmm13\n"
25705 "addpd %%xmm5, %%xmm12\n"
25706 "subpd %%xmm5, %%xmm13\n"
25707 "movapd %%xmm6, %%xmm14\n"
25708 "movapd %%xmm6, %%xmm15\n"
25709 "addpd %%xmm7, %%xmm14\n"
25710 "subpd %%xmm7, %%xmm15\n"
25711 "movapd %%xmm8, %%xmm0\n"
25712 "movapd %%xmm8, %%xmm2\n"
25713 "addpd %%xmm10, %%xmm0\n"
25714 "subpd %%xmm10, %%xmm2\n"
25715 "movapd %%xmm9, %%xmm1\n"
25716 "movapd %%xmm9, %%xmm3\n"
25717 "addpd %%xmm11, %%xmm1\n"
25718 "subpd %%xmm11, %%xmm3\n"
25719 "movapd %%xmm12, %%xmm4\n"
25720 "movapd %%xmm12, %%xmm6\n"
25721 "addpd %%xmm14, %%xmm4\n"
25722 "subpd %%xmm14, %%xmm6\n"
25723 "movapd %%xmm13, %%xmm5\n"
25724 "movapd %%xmm13, %%xmm7\n"
25725 "addpd %%xmm15, %%xmm5\n"
25726 "subpd %%xmm15, %%xmm7\n"
25727 "movapd %%xmm0, %%xmm8\n"
25728 "movapd %%xmm0, %%xmm12\n"
25729 "addpd %%xmm4, %%xmm8\n"
25730 "subpd %%xmm4, %%xmm12\n"
25731 "movapd %%xmm1, %%xmm9\n"
25732 "movapd %%xmm1, %%xmm13\n"
25733 "addpd %%xmm5, %%xmm9\n"
25734 "subpd %%xmm5, %%xmm13\n"
25735 "movapd %%xmm2, %%xmm10\n"
25736 "movapd %%xmm2, %%xmm14\n"
25737 "addpd %%xmm6, %%xmm10\n"
25738 "subpd %%xmm6, %%xmm14\n"
25739 "movapd %%xmm3, %%xmm11\n"
25740 "movapd %%xmm3, %%xmm15\n"
25741 "addpd %%xmm7, %%xmm11\n"
25742 "subpd %%xmm7, %%xmm15\n"
25743 "movupd %%xmm8, (%0)\n"
25744 "movupd %%xmm9, (%1)\n"
25745 "movupd %%xmm10, (%2)\n"
25746 "movupd %%xmm11, (%3)\n"
25747 "movupd %%xmm12, (%4)\n"
25748 "movupd %%xmm13, (%5)\n"
25749 "movupd %%xmm14, (%6)\n"
25750 "movupd %%xmm15, (%7)\n"
25751 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25752 );
25753 }
25754 }
25755 return;
25756 }
25757 if (depth == 21) {
25758 helper_double_30_recursive(buf + 0, 18);
25759 helper_double_30_recursive(buf + 262144, 18);
25760 helper_double_30_recursive(buf + 524288, 18);
25761 helper_double_30_recursive(buf + 786432, 18);
25762 helper_double_30_recursive(buf + 1048576, 18);
25763 helper_double_30_recursive(buf + 1310720, 18);
25764 helper_double_30_recursive(buf + 1572864, 18);
25765 helper_double_30_recursive(buf + 1835008, 18);
25766 for (int j = 0; j < 2097152; j += 2097152) {
25767 for (int k = 0; k < 262144; k += 2) {
25768 __asm__ volatile (
25769 "movupd (%0), %%xmm0\n"
25770 "movupd (%1), %%xmm1\n"
25771 "movupd (%2), %%xmm2\n"
25772 "movupd (%3), %%xmm3\n"
25773 "movupd (%4), %%xmm4\n"
25774 "movupd (%5), %%xmm5\n"
25775 "movupd (%6), %%xmm6\n"
25776 "movupd (%7), %%xmm7\n"
25777 "movapd %%xmm0, %%xmm8\n"
25778 "movapd %%xmm0, %%xmm9\n"
25779 "addpd %%xmm1, %%xmm8\n"
25780 "subpd %%xmm1, %%xmm9\n"
25781 "movapd %%xmm2, %%xmm10\n"
25782 "movapd %%xmm2, %%xmm11\n"
25783 "addpd %%xmm3, %%xmm10\n"
25784 "subpd %%xmm3, %%xmm11\n"
25785 "movapd %%xmm4, %%xmm12\n"
25786 "movapd %%xmm4, %%xmm13\n"
25787 "addpd %%xmm5, %%xmm12\n"
25788 "subpd %%xmm5, %%xmm13\n"
25789 "movapd %%xmm6, %%xmm14\n"
25790 "movapd %%xmm6, %%xmm15\n"
25791 "addpd %%xmm7, %%xmm14\n"
25792 "subpd %%xmm7, %%xmm15\n"
25793 "movapd %%xmm8, %%xmm0\n"
25794 "movapd %%xmm8, %%xmm2\n"
25795 "addpd %%xmm10, %%xmm0\n"
25796 "subpd %%xmm10, %%xmm2\n"
25797 "movapd %%xmm9, %%xmm1\n"
25798 "movapd %%xmm9, %%xmm3\n"
25799 "addpd %%xmm11, %%xmm1\n"
25800 "subpd %%xmm11, %%xmm3\n"
25801 "movapd %%xmm12, %%xmm4\n"
25802 "movapd %%xmm12, %%xmm6\n"
25803 "addpd %%xmm14, %%xmm4\n"
25804 "subpd %%xmm14, %%xmm6\n"
25805 "movapd %%xmm13, %%xmm5\n"
25806 "movapd %%xmm13, %%xmm7\n"
25807 "addpd %%xmm15, %%xmm5\n"
25808 "subpd %%xmm15, %%xmm7\n"
25809 "movapd %%xmm0, %%xmm8\n"
25810 "movapd %%xmm0, %%xmm12\n"
25811 "addpd %%xmm4, %%xmm8\n"
25812 "subpd %%xmm4, %%xmm12\n"
25813 "movapd %%xmm1, %%xmm9\n"
25814 "movapd %%xmm1, %%xmm13\n"
25815 "addpd %%xmm5, %%xmm9\n"
25816 "subpd %%xmm5, %%xmm13\n"
25817 "movapd %%xmm2, %%xmm10\n"
25818 "movapd %%xmm2, %%xmm14\n"
25819 "addpd %%xmm6, %%xmm10\n"
25820 "subpd %%xmm6, %%xmm14\n"
25821 "movapd %%xmm3, %%xmm11\n"
25822 "movapd %%xmm3, %%xmm15\n"
25823 "addpd %%xmm7, %%xmm11\n"
25824 "subpd %%xmm7, %%xmm15\n"
25825 "movupd %%xmm8, (%0)\n"
25826 "movupd %%xmm9, (%1)\n"
25827 "movupd %%xmm10, (%2)\n"
25828 "movupd %%xmm11, (%3)\n"
25829 "movupd %%xmm12, (%4)\n"
25830 "movupd %%xmm13, (%5)\n"
25831 "movupd %%xmm14, (%6)\n"
25832 "movupd %%xmm15, (%7)\n"
25833 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25834 );
25835 }
25836 }
25837 return;
25838 }
25839 if (depth == 24) {
25840 helper_double_30_recursive(buf + 0, 21);
25841 helper_double_30_recursive(buf + 2097152, 21);
25842 helper_double_30_recursive(buf + 4194304, 21);
25843 helper_double_30_recursive(buf + 6291456, 21);
25844 helper_double_30_recursive(buf + 8388608, 21);
25845 helper_double_30_recursive(buf + 10485760, 21);
25846 helper_double_30_recursive(buf + 12582912, 21);
25847 helper_double_30_recursive(buf + 14680064, 21);
25848 for (int j = 0; j < 16777216; j += 16777216) {
25849 for (int k = 0; k < 2097152; k += 2) {
25850 __asm__ volatile (
25851 "movupd (%0), %%xmm0\n"
25852 "movupd (%1), %%xmm1\n"
25853 "movupd (%2), %%xmm2\n"
25854 "movupd (%3), %%xmm3\n"
25855 "movupd (%4), %%xmm4\n"
25856 "movupd (%5), %%xmm5\n"
25857 "movupd (%6), %%xmm6\n"
25858 "movupd (%7), %%xmm7\n"
25859 "movapd %%xmm0, %%xmm8\n"
25860 "movapd %%xmm0, %%xmm9\n"
25861 "addpd %%xmm1, %%xmm8\n"
25862 "subpd %%xmm1, %%xmm9\n"
25863 "movapd %%xmm2, %%xmm10\n"
25864 "movapd %%xmm2, %%xmm11\n"
25865 "addpd %%xmm3, %%xmm10\n"
25866 "subpd %%xmm3, %%xmm11\n"
25867 "movapd %%xmm4, %%xmm12\n"
25868 "movapd %%xmm4, %%xmm13\n"
25869 "addpd %%xmm5, %%xmm12\n"
25870 "subpd %%xmm5, %%xmm13\n"
25871 "movapd %%xmm6, %%xmm14\n"
25872 "movapd %%xmm6, %%xmm15\n"
25873 "addpd %%xmm7, %%xmm14\n"
25874 "subpd %%xmm7, %%xmm15\n"
25875 "movapd %%xmm8, %%xmm0\n"
25876 "movapd %%xmm8, %%xmm2\n"
25877 "addpd %%xmm10, %%xmm0\n"
25878 "subpd %%xmm10, %%xmm2\n"
25879 "movapd %%xmm9, %%xmm1\n"
25880 "movapd %%xmm9, %%xmm3\n"
25881 "addpd %%xmm11, %%xmm1\n"
25882 "subpd %%xmm11, %%xmm3\n"
25883 "movapd %%xmm12, %%xmm4\n"
25884 "movapd %%xmm12, %%xmm6\n"
25885 "addpd %%xmm14, %%xmm4\n"
25886 "subpd %%xmm14, %%xmm6\n"
25887 "movapd %%xmm13, %%xmm5\n"
25888 "movapd %%xmm13, %%xmm7\n"
25889 "addpd %%xmm15, %%xmm5\n"
25890 "subpd %%xmm15, %%xmm7\n"
25891 "movapd %%xmm0, %%xmm8\n"
25892 "movapd %%xmm0, %%xmm12\n"
25893 "addpd %%xmm4, %%xmm8\n"
25894 "subpd %%xmm4, %%xmm12\n"
25895 "movapd %%xmm1, %%xmm9\n"
25896 "movapd %%xmm1, %%xmm13\n"
25897 "addpd %%xmm5, %%xmm9\n"
25898 "subpd %%xmm5, %%xmm13\n"
25899 "movapd %%xmm2, %%xmm10\n"
25900 "movapd %%xmm2, %%xmm14\n"
25901 "addpd %%xmm6, %%xmm10\n"
25902 "subpd %%xmm6, %%xmm14\n"
25903 "movapd %%xmm3, %%xmm11\n"
25904 "movapd %%xmm3, %%xmm15\n"
25905 "addpd %%xmm7, %%xmm11\n"
25906 "subpd %%xmm7, %%xmm15\n"
25907 "movupd %%xmm8, (%0)\n"
25908 "movupd %%xmm9, (%1)\n"
25909 "movupd %%xmm10, (%2)\n"
25910 "movupd %%xmm11, (%3)\n"
25911 "movupd %%xmm12, (%4)\n"
25912 "movupd %%xmm13, (%5)\n"
25913 "movupd %%xmm14, (%6)\n"
25914 "movupd %%xmm15, (%7)\n"
25915 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25916 );
25917 }
25918 }
25919 return;
25920 }
25921 if (depth == 27) {
25922 helper_double_30_recursive(buf + 0, 24);
25923 helper_double_30_recursive(buf + 16777216, 24);
25924 helper_double_30_recursive(buf + 33554432, 24);
25925 helper_double_30_recursive(buf + 50331648, 24);
25926 helper_double_30_recursive(buf + 67108864, 24);
25927 helper_double_30_recursive(buf + 83886080, 24);
25928 helper_double_30_recursive(buf + 100663296, 24);
25929 helper_double_30_recursive(buf + 117440512, 24);
25930 for (int j = 0; j < 134217728; j += 134217728) {
25931 for (int k = 0; k < 16777216; k += 2) {
25932 __asm__ volatile (
25933 "movupd (%0), %%xmm0\n"
25934 "movupd (%1), %%xmm1\n"
25935 "movupd (%2), %%xmm2\n"
25936 "movupd (%3), %%xmm3\n"
25937 "movupd (%4), %%xmm4\n"
25938 "movupd (%5), %%xmm5\n"
25939 "movupd (%6), %%xmm6\n"
25940 "movupd (%7), %%xmm7\n"
25941 "movapd %%xmm0, %%xmm8\n"
25942 "movapd %%xmm0, %%xmm9\n"
25943 "addpd %%xmm1, %%xmm8\n"
25944 "subpd %%xmm1, %%xmm9\n"
25945 "movapd %%xmm2, %%xmm10\n"
25946 "movapd %%xmm2, %%xmm11\n"
25947 "addpd %%xmm3, %%xmm10\n"
25948 "subpd %%xmm3, %%xmm11\n"
25949 "movapd %%xmm4, %%xmm12\n"
25950 "movapd %%xmm4, %%xmm13\n"
25951 "addpd %%xmm5, %%xmm12\n"
25952 "subpd %%xmm5, %%xmm13\n"
25953 "movapd %%xmm6, %%xmm14\n"
25954 "movapd %%xmm6, %%xmm15\n"
25955 "addpd %%xmm7, %%xmm14\n"
25956 "subpd %%xmm7, %%xmm15\n"
25957 "movapd %%xmm8, %%xmm0\n"
25958 "movapd %%xmm8, %%xmm2\n"
25959 "addpd %%xmm10, %%xmm0\n"
25960 "subpd %%xmm10, %%xmm2\n"
25961 "movapd %%xmm9, %%xmm1\n"
25962 "movapd %%xmm9, %%xmm3\n"
25963 "addpd %%xmm11, %%xmm1\n"
25964 "subpd %%xmm11, %%xmm3\n"
25965 "movapd %%xmm12, %%xmm4\n"
25966 "movapd %%xmm12, %%xmm6\n"
25967 "addpd %%xmm14, %%xmm4\n"
25968 "subpd %%xmm14, %%xmm6\n"
25969 "movapd %%xmm13, %%xmm5\n"
25970 "movapd %%xmm13, %%xmm7\n"
25971 "addpd %%xmm15, %%xmm5\n"
25972 "subpd %%xmm15, %%xmm7\n"
25973 "movapd %%xmm0, %%xmm8\n"
25974 "movapd %%xmm0, %%xmm12\n"
25975 "addpd %%xmm4, %%xmm8\n"
25976 "subpd %%xmm4, %%xmm12\n"
25977 "movapd %%xmm1, %%xmm9\n"
25978 "movapd %%xmm1, %%xmm13\n"
25979 "addpd %%xmm5, %%xmm9\n"
25980 "subpd %%xmm5, %%xmm13\n"
25981 "movapd %%xmm2, %%xmm10\n"
25982 "movapd %%xmm2, %%xmm14\n"
25983 "addpd %%xmm6, %%xmm10\n"
25984 "subpd %%xmm6, %%xmm14\n"
25985 "movapd %%xmm3, %%xmm11\n"
25986 "movapd %%xmm3, %%xmm15\n"
25987 "addpd %%xmm7, %%xmm11\n"
25988 "subpd %%xmm7, %%xmm15\n"
25989 "movupd %%xmm8, (%0)\n"
25990 "movupd %%xmm9, (%1)\n"
25991 "movupd %%xmm10, (%2)\n"
25992 "movupd %%xmm11, (%3)\n"
25993 "movupd %%xmm12, (%4)\n"
25994 "movupd %%xmm13, (%5)\n"
25995 "movupd %%xmm14, (%6)\n"
25996 "movupd %%xmm15, (%7)\n"
25997 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
25998 );
25999 }
26000 }
26001 return;
26002 }
26003 if (depth == 30) {
26004 helper_double_30_recursive(buf + 0, 27);
26005 helper_double_30_recursive(buf + 134217728, 27);
26006 helper_double_30_recursive(buf + 268435456, 27);
26007 helper_double_30_recursive(buf + 402653184, 27);
26008 helper_double_30_recursive(buf + 536870912, 27);
26009 helper_double_30_recursive(buf + 671088640, 27);
26010 helper_double_30_recursive(buf + 805306368, 27);
26011 helper_double_30_recursive(buf + 939524096, 27);
26012 for (int j = 0; j < 1073741824; j += 1073741824) {
26013 for (int k = 0; k < 134217728; k += 2) {
26014 __asm__ volatile (
26015 "movupd (%0), %%xmm0\n"
26016 "movupd (%1), %%xmm1\n"
26017 "movupd (%2), %%xmm2\n"
26018 "movupd (%3), %%xmm3\n"
26019 "movupd (%4), %%xmm4\n"
26020 "movupd (%5), %%xmm5\n"
26021 "movupd (%6), %%xmm6\n"
26022 "movupd (%7), %%xmm7\n"
26023 "movapd %%xmm0, %%xmm8\n"
26024 "movapd %%xmm0, %%xmm9\n"
26025 "addpd %%xmm1, %%xmm8\n"
26026 "subpd %%xmm1, %%xmm9\n"
26027 "movapd %%xmm2, %%xmm10\n"
26028 "movapd %%xmm2, %%xmm11\n"
26029 "addpd %%xmm3, %%xmm10\n"
26030 "subpd %%xmm3, %%xmm11\n"
26031 "movapd %%xmm4, %%xmm12\n"
26032 "movapd %%xmm4, %%xmm13\n"
26033 "addpd %%xmm5, %%xmm12\n"
26034 "subpd %%xmm5, %%xmm13\n"
26035 "movapd %%xmm6, %%xmm14\n"
26036 "movapd %%xmm6, %%xmm15\n"
26037 "addpd %%xmm7, %%xmm14\n"
26038 "subpd %%xmm7, %%xmm15\n"
26039 "movapd %%xmm8, %%xmm0\n"
26040 "movapd %%xmm8, %%xmm2\n"
26041 "addpd %%xmm10, %%xmm0\n"
26042 "subpd %%xmm10, %%xmm2\n"
26043 "movapd %%xmm9, %%xmm1\n"
26044 "movapd %%xmm9, %%xmm3\n"
26045 "addpd %%xmm11, %%xmm1\n"
26046 "subpd %%xmm11, %%xmm3\n"
26047 "movapd %%xmm12, %%xmm4\n"
26048 "movapd %%xmm12, %%xmm6\n"
26049 "addpd %%xmm14, %%xmm4\n"
26050 "subpd %%xmm14, %%xmm6\n"
26051 "movapd %%xmm13, %%xmm5\n"
26052 "movapd %%xmm13, %%xmm7\n"
26053 "addpd %%xmm15, %%xmm5\n"
26054 "subpd %%xmm15, %%xmm7\n"
26055 "movapd %%xmm0, %%xmm8\n"
26056 "movapd %%xmm0, %%xmm12\n"
26057 "addpd %%xmm4, %%xmm8\n"
26058 "subpd %%xmm4, %%xmm12\n"
26059 "movapd %%xmm1, %%xmm9\n"
26060 "movapd %%xmm1, %%xmm13\n"
26061 "addpd %%xmm5, %%xmm9\n"
26062 "subpd %%xmm5, %%xmm13\n"
26063 "movapd %%xmm2, %%xmm10\n"
26064 "movapd %%xmm2, %%xmm14\n"
26065 "addpd %%xmm6, %%xmm10\n"
26066 "subpd %%xmm6, %%xmm14\n"
26067 "movapd %%xmm3, %%xmm11\n"
26068 "movapd %%xmm3, %%xmm15\n"
26069 "addpd %%xmm7, %%xmm11\n"
26070 "subpd %%xmm7, %%xmm15\n"
26071 "movupd %%xmm8, (%0)\n"
26072 "movupd %%xmm9, (%1)\n"
26073 "movupd %%xmm10, (%2)\n"
26074 "movupd %%xmm11, (%3)\n"
26075 "movupd %%xmm12, (%4)\n"
26076 "movupd %%xmm13, (%5)\n"
26077 "movupd %%xmm14, (%6)\n"
26078 "movupd %%xmm15, (%7)\n"
26079 :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory"
26080 );
26081 }
26082 }
26083 return;
26084 }
26085 }
/* Forward declaration of the public entry point for the 2^30 size. */
void helper_double_30(double *buf);
/*
 * In-place Walsh–Hadamard transform of 2^30 doubles.
 *
 * buf must hold at least (1 << 30) elements (8 GiB); the SSE2 kernels
 * use movupd loads/stores, so no particular alignment of buf appears to
 * be required. No normalization is applied in the visible kernels --
 * presumably the caller scales by 2^-15 if an orthonormal transform is
 * wanted (TODO confirm against the library's public contract).
 */
void helper_double_30(double *buf) {
 helper_double_30_recursive(buf, 30);
}
fht_double(double * buf,int log_n)26090 int fht_double(double *buf, int log_n) {
26091 if (log_n == 0) {
26092 return 0;
26093 }
26094 if (log_n == 1) {
26095 helper_double_1(buf);
26096 return 0;
26097 }
26098 if (log_n == 2) {
26099 helper_double_2(buf);
26100 return 0;
26101 }
26102 if (log_n == 3) {
26103 helper_double_3(buf);
26104 return 0;
26105 }
26106 if (log_n == 4) {
26107 helper_double_4(buf);
26108 return 0;
26109 }
26110 if (log_n == 5) {
26111 helper_double_5(buf);
26112 return 0;
26113 }
26114 if (log_n == 6) {
26115 helper_double_6(buf);
26116 return 0;
26117 }
26118 if (log_n == 7) {
26119 helper_double_7(buf);
26120 return 0;
26121 }
26122 if (log_n == 8) {
26123 helper_double_8(buf);
26124 return 0;
26125 }
26126 if (log_n == 9) {
26127 helper_double_9(buf);
26128 return 0;
26129 }
26130 if (log_n == 10) {
26131 helper_double_10(buf);
26132 return 0;
26133 }
26134 if (log_n == 11) {
26135 helper_double_11(buf);
26136 return 0;
26137 }
26138 if (log_n == 12) {
26139 helper_double_12(buf);
26140 return 0;
26141 }
26142 if (log_n == 13) {
26143 helper_double_13(buf);
26144 return 0;
26145 }
26146 if (log_n == 14) {
26147 helper_double_14(buf);
26148 return 0;
26149 }
26150 if (log_n == 15) {
26151 helper_double_15(buf);
26152 return 0;
26153 }
26154 if (log_n == 16) {
26155 helper_double_16(buf);
26156 return 0;
26157 }
26158 if (log_n == 17) {
26159 helper_double_17(buf);
26160 return 0;
26161 }
26162 if (log_n == 18) {
26163 helper_double_18(buf);
26164 return 0;
26165 }
26166 if (log_n == 19) {
26167 helper_double_19(buf);
26168 return 0;
26169 }
26170 if (log_n == 20) {
26171 helper_double_20(buf);
26172 return 0;
26173 }
26174 if (log_n == 21) {
26175 helper_double_21(buf);
26176 return 0;
26177 }
26178 if (log_n == 22) {
26179 helper_double_22(buf);
26180 return 0;
26181 }
26182 if (log_n == 23) {
26183 helper_double_23(buf);
26184 return 0;
26185 }
26186 if (log_n == 24) {
26187 helper_double_24(buf);
26188 return 0;
26189 }
26190 if (log_n == 25) {
26191 helper_double_25(buf);
26192 return 0;
26193 }
26194 if (log_n == 26) {
26195 helper_double_26(buf);
26196 return 0;
26197 }
26198 if (log_n == 27) {
26199 helper_double_27(buf);
26200 return 0;
26201 }
26202 if (log_n == 28) {
26203 helper_double_28(buf);
26204 return 0;
26205 }
26206 if (log_n == 29) {
26207 helper_double_29(buf);
26208 return 0;
26209 }
26210 if (log_n == 30) {
26211 helper_double_30(buf);
26212 return 0;
26213 }
26214 return 1;
26215 }
26216