1 // @generated
2 #include "fht.h"
// Two-point butterfly on buf[0] and buf[1]: the pair is overwritten with
// (sum, difference), both computed from the values present on entry.
static inline void helper_float_1(float* buf);
static inline void helper_float_1(float* buf) {
  const float first = buf[0];
  const float second = buf[1];
  buf[0] = first + second;
  buf[1] = first - second;
}
14 static inline void helper_float_2(float* buf);
helper_float_2(float * buf)15 static inline void helper_float_2(float* buf) {
16 for (int j = 0; j < 4; j += 4) {
17 __asm__ volatile(
18 "LD1 {v0.4S}, [%0]\n"
19 "TRN1 v16.4S, v0.4S, v0.4S\n"
20 "FNEG v17.4S, v0.4S\n"
21 "TRN2 v17.4S, v0.4S, v17.4S\n"
22 "FADD v0.4S, v16.4S, v17.4S\n"
23 "DUP v16.2D, v0.D[0]\n"
24 "FNEG v17.4S, v0.4S\n"
25 "INS v17.D[0], v0.D[1]\n"
26 "FADD v0.4S, v16.4S, v17.4S\n"
27 "ST1 {v0.4S}, [%0]\n" ::"r"(buf + j)
28 : "%v0",
29 "%v1",
30 "%v2",
31 "%v3",
32 "%v4",
33 "%v5",
34 "%v6",
35 "%v7",
36 "%v8",
37 "%v9",
38 "%v10",
39 "%v11",
40 "%v12",
41 "%v13",
42 "%v14",
43 "%v15",
44 "%v16",
45 "%v17",
46 "%v18",
47 "%v19",
48 "%v20",
49 "%v21",
50 "%v22",
51 "%v23",
52 "%v24",
53 "%v25",
54 "%v26",
55 "%v27",
56 "%v28",
57 "%v29",
58 "%v30",
59 "%v31",
60 "memory");
61 }
62 }
// Depth-dispatched transform step.
//   depth == 2: hands the four floats at buf to helper_float_2.
//   depth == 3: transforms buf[0..3] and buf[4..7] at depth 2, then replaces
//               the two halves with their element-wise sum and difference.
// Any other depth returns without touching buf.
void helper_float_3_recursive(float* buf, int depth);
void helper_float_3_recursive(float* buf, int depth) {
  switch (depth) {
    case 2:
      helper_float_2(buf);
      return;
    case 3: {
      helper_float_3_recursive(buf, 2);
      helper_float_3_recursive(buf + 4, 2);
      float* const lower = buf;
      float* const upper = buf + 4;
      __asm__ volatile(
          "LD1 {v0.4S}, [%0]\n"
          "LD1 {v1.4S}, [%1]\n"
          "FADD v16.4S, v0.4S, v1.4S\n"
          "FSUB v17.4S, v0.4S, v1.4S\n"
          "ST1 {v16.4S}, [%0]\n"
          "ST1 {v17.4S}, [%1]\n"
          : /* no outputs */
          : "r"(lower), "r"(upper)
          : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
            "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
            "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
            "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
            "memory");
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_3_recursive at its top depth (3), which processes the
// eight floats starting at buf in place.
void helper_float_3(float* buf);
void helper_float_3(float* buf) {
  enum { kTopDepth = 3 };
  helper_float_3_recursive(buf, kTopDepth);
}
// Depth-dispatched transform step.
//   depth == 3: hands buf to helper_float_3.
//   depth == 4: processes buf[0..7] and buf[8..15] at depth 3, then replaces
//               the two halves with their element-wise sum and difference,
//               four floats per assembly block.
// Any other depth returns without touching buf.
void helper_float_4_recursive(float* buf, int depth);
void helper_float_4_recursive(float* buf, int depth) {
  switch (depth) {
    case 3:
      helper_float_3(buf);
      return;
    case 4: {
      helper_float_4_recursive(buf, 3);
      helper_float_4_recursive(buf + 8, 3);
      for (int off = 0; off < 8; off += 4) {
        float* const lower = buf + off;
        float* const upper = lower + 8;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n"
            : /* no outputs */
            : "r"(lower), "r"(upper)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_4_recursive at its top depth (4), which processes the
// sixteen floats starting at buf in place.
void helper_float_4(float* buf);
void helper_float_4(float* buf) {
  enum { kTopDepth = 4 };
  helper_float_4_recursive(buf, kTopDepth);
}
// Depth-dispatched transform step.
//   depth == 4: hands buf to helper_float_4.
//   depth == 5: processes buf[0..15] and buf[16..31] at depth 4, then replaces
//               the two halves with their element-wise sum and difference,
//               four floats per assembly block.
// Any other depth returns without touching buf.
void helper_float_5_recursive(float* buf, int depth);
void helper_float_5_recursive(float* buf, int depth) {
  switch (depth) {
    case 4:
      helper_float_4(buf);
      return;
    case 5: {
      helper_float_5_recursive(buf, 4);
      helper_float_5_recursive(buf + 16, 4);
      for (int off = 0; off < 16; off += 4) {
        float* const lower = buf + off;
        float* const upper = lower + 16;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n"
            : /* no outputs */
            : "r"(lower), "r"(upper)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_5_recursive at its top depth (5), which processes the
// 32 floats starting at buf in place.
void helper_float_5(float* buf);
void helper_float_5(float* buf) {
  enum { kTopDepth = 5 };
  helper_float_5_recursive(buf, kTopDepth);
}
// Depth-dispatched transform step.
//   depth == 3: hands buf to helper_float_3.
//   depth == 6: processes each of the eight 8-float chunks of buf[0..63] at
//               depth 3 (in ascending address order), then combines the
//               chunks with three rounds of add/subtract butterflies at
//               strides of 8, 16 and 32 floats, four lanes per assembly block.
// Any other depth returns without touching buf.
void helper_float_6_recursive(float* buf, int depth);
void helper_float_6_recursive(float* buf, int depth) {
  switch (depth) {
    case 3:
      helper_float_3(buf);
      return;
    case 6: {
      for (int base = 0; base < 64; base += 8) {
        helper_float_6_recursive(buf + base, 3);
      }
      for (int off = 0; off < 8; off += 4) {
        float* const p = buf + off;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "LD1 {v2.4S}, [%2]\n"
            "LD1 {v3.4S}, [%3]\n"
            "LD1 {v4.4S}, [%4]\n"
            "LD1 {v5.4S}, [%5]\n"
            "LD1 {v6.4S}, [%6]\n"
            "LD1 {v7.4S}, [%7]\n"
            // Round 1: neighbouring chunks (stride 8 floats).
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "FADD v18.4S, v2.4S, v3.4S\n"
            "FSUB v19.4S, v2.4S, v3.4S\n"
            "FADD v20.4S, v4.4S, v5.4S\n"
            "FSUB v21.4S, v4.4S, v5.4S\n"
            "FADD v22.4S, v6.4S, v7.4S\n"
            "FSUB v23.4S, v6.4S, v7.4S\n"
            // Round 2: chunks two apart (stride 16 floats).
            "FADD v0.4S, v16.4S, v18.4S\n"
            "FSUB v2.4S, v16.4S, v18.4S\n"
            "FADD v1.4S, v17.4S, v19.4S\n"
            "FSUB v3.4S, v17.4S, v19.4S\n"
            "FADD v4.4S, v20.4S, v22.4S\n"
            "FSUB v6.4S, v20.4S, v22.4S\n"
            "FADD v5.4S, v21.4S, v23.4S\n"
            "FSUB v7.4S, v21.4S, v23.4S\n"
            // Round 3: chunks four apart (stride 32 floats).
            "FADD v16.4S, v0.4S, v4.4S\n"
            "FSUB v20.4S, v0.4S, v4.4S\n"
            "FADD v17.4S, v1.4S, v5.4S\n"
            "FSUB v21.4S, v1.4S, v5.4S\n"
            "FADD v18.4S, v2.4S, v6.4S\n"
            "FSUB v22.4S, v2.4S, v6.4S\n"
            "FADD v19.4S, v3.4S, v7.4S\n"
            "FSUB v23.4S, v3.4S, v7.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n"
            "ST1 {v18.4S}, [%2]\n"
            "ST1 {v19.4S}, [%3]\n"
            "ST1 {v20.4S}, [%4]\n"
            "ST1 {v21.4S}, [%5]\n"
            "ST1 {v22.4S}, [%6]\n"
            "ST1 {v23.4S}, [%7]\n"
            : /* no outputs */
            : "r"(p), "r"(p + 8), "r"(p + 16), "r"(p + 24),
              "r"(p + 32), "r"(p + 40), "r"(p + 48), "r"(p + 56)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_6_recursive at its top depth (6), which processes the
// 64 floats starting at buf in place.
void helper_float_6(float* buf);
void helper_float_6(float* buf) {
  enum { kTopDepth = 6 };
  helper_float_6_recursive(buf, kTopDepth);
}
// Depth-dispatched transform step.
//   depth == 3: hands buf to helper_float_3.
//   depth == 7: processes each of the sixteen 8-float chunks of buf[0..127]
//               at depth 3 (in ascending address order), then combines the
//               chunks with four rounds of add/subtract butterflies at
//               strides of 8, 16, 32 and 64 floats, four lanes per assembly
//               block.
// Any other depth returns without touching buf.
void helper_float_7_recursive(float* buf, int depth);
void helper_float_7_recursive(float* buf, int depth) {
  switch (depth) {
    case 3:
      helper_float_3(buf);
      return;
    case 7: {
      for (int base = 0; base < 128; base += 8) {
        helper_float_7_recursive(buf + base, 3);
      }
      for (int off = 0; off < 8; off += 4) {
        float* const p = buf + off;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "LD1 {v2.4S}, [%2]\n"
            "LD1 {v3.4S}, [%3]\n"
            "LD1 {v4.4S}, [%4]\n"
            "LD1 {v5.4S}, [%5]\n"
            "LD1 {v6.4S}, [%6]\n"
            "LD1 {v7.4S}, [%7]\n"
            "LD1 {v8.4S}, [%8]\n"
            "LD1 {v9.4S}, [%9]\n"
            "LD1 {v10.4S}, [%10]\n"
            "LD1 {v11.4S}, [%11]\n"
            "LD1 {v12.4S}, [%12]\n"
            "LD1 {v13.4S}, [%13]\n"
            "LD1 {v14.4S}, [%14]\n"
            "LD1 {v15.4S}, [%15]\n"
            // Round 1: neighbouring chunks (stride 8 floats).
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "FADD v18.4S, v2.4S, v3.4S\n"
            "FSUB v19.4S, v2.4S, v3.4S\n"
            "FADD v20.4S, v4.4S, v5.4S\n"
            "FSUB v21.4S, v4.4S, v5.4S\n"
            "FADD v22.4S, v6.4S, v7.4S\n"
            "FSUB v23.4S, v6.4S, v7.4S\n"
            "FADD v24.4S, v8.4S, v9.4S\n"
            "FSUB v25.4S, v8.4S, v9.4S\n"
            "FADD v26.4S, v10.4S, v11.4S\n"
            "FSUB v27.4S, v10.4S, v11.4S\n"
            "FADD v28.4S, v12.4S, v13.4S\n"
            "FSUB v29.4S, v12.4S, v13.4S\n"
            "FADD v30.4S, v14.4S, v15.4S\n"
            "FSUB v31.4S, v14.4S, v15.4S\n"
            // Round 2: chunks two apart (stride 16 floats).
            "FADD v0.4S, v16.4S, v18.4S\n"
            "FSUB v2.4S, v16.4S, v18.4S\n"
            "FADD v1.4S, v17.4S, v19.4S\n"
            "FSUB v3.4S, v17.4S, v19.4S\n"
            "FADD v4.4S, v20.4S, v22.4S\n"
            "FSUB v6.4S, v20.4S, v22.4S\n"
            "FADD v5.4S, v21.4S, v23.4S\n"
            "FSUB v7.4S, v21.4S, v23.4S\n"
            "FADD v8.4S, v24.4S, v26.4S\n"
            "FSUB v10.4S, v24.4S, v26.4S\n"
            "FADD v9.4S, v25.4S, v27.4S\n"
            "FSUB v11.4S, v25.4S, v27.4S\n"
            "FADD v12.4S, v28.4S, v30.4S\n"
            "FSUB v14.4S, v28.4S, v30.4S\n"
            "FADD v13.4S, v29.4S, v31.4S\n"
            "FSUB v15.4S, v29.4S, v31.4S\n"
            // Round 3: chunks four apart (stride 32 floats).
            "FADD v16.4S, v0.4S, v4.4S\n"
            "FSUB v20.4S, v0.4S, v4.4S\n"
            "FADD v17.4S, v1.4S, v5.4S\n"
            "FSUB v21.4S, v1.4S, v5.4S\n"
            "FADD v18.4S, v2.4S, v6.4S\n"
            "FSUB v22.4S, v2.4S, v6.4S\n"
            "FADD v19.4S, v3.4S, v7.4S\n"
            "FSUB v23.4S, v3.4S, v7.4S\n"
            "FADD v24.4S, v8.4S, v12.4S\n"
            "FSUB v28.4S, v8.4S, v12.4S\n"
            "FADD v25.4S, v9.4S, v13.4S\n"
            "FSUB v29.4S, v9.4S, v13.4S\n"
            "FADD v26.4S, v10.4S, v14.4S\n"
            "FSUB v30.4S, v10.4S, v14.4S\n"
            "FADD v27.4S, v11.4S, v15.4S\n"
            "FSUB v31.4S, v11.4S, v15.4S\n"
            // Round 4: chunks eight apart (stride 64 floats).
            "FADD v0.4S, v16.4S, v24.4S\n"
            "FSUB v8.4S, v16.4S, v24.4S\n"
            "FADD v1.4S, v17.4S, v25.4S\n"
            "FSUB v9.4S, v17.4S, v25.4S\n"
            "FADD v2.4S, v18.4S, v26.4S\n"
            "FSUB v10.4S, v18.4S, v26.4S\n"
            "FADD v3.4S, v19.4S, v27.4S\n"
            "FSUB v11.4S, v19.4S, v27.4S\n"
            "FADD v4.4S, v20.4S, v28.4S\n"
            "FSUB v12.4S, v20.4S, v28.4S\n"
            "FADD v5.4S, v21.4S, v29.4S\n"
            "FSUB v13.4S, v21.4S, v29.4S\n"
            "FADD v6.4S, v22.4S, v30.4S\n"
            "FSUB v14.4S, v22.4S, v30.4S\n"
            "FADD v7.4S, v23.4S, v31.4S\n"
            "FSUB v15.4S, v23.4S, v31.4S\n"
            "ST1 {v0.4S}, [%0]\n"
            "ST1 {v1.4S}, [%1]\n"
            "ST1 {v2.4S}, [%2]\n"
            "ST1 {v3.4S}, [%3]\n"
            "ST1 {v4.4S}, [%4]\n"
            "ST1 {v5.4S}, [%5]\n"
            "ST1 {v6.4S}, [%6]\n"
            "ST1 {v7.4S}, [%7]\n"
            "ST1 {v8.4S}, [%8]\n"
            "ST1 {v9.4S}, [%9]\n"
            "ST1 {v10.4S}, [%10]\n"
            "ST1 {v11.4S}, [%11]\n"
            "ST1 {v12.4S}, [%12]\n"
            "ST1 {v13.4S}, [%13]\n"
            "ST1 {v14.4S}, [%14]\n"
            "ST1 {v15.4S}, [%15]\n"
            : /* no outputs */
            : "r"(p), "r"(p + 8), "r"(p + 16), "r"(p + 24),
              "r"(p + 32), "r"(p + 40), "r"(p + 48), "r"(p + 56),
              "r"(p + 64), "r"(p + 72), "r"(p + 80), "r"(p + 88),
              "r"(p + 96), "r"(p + 104), "r"(p + 112), "r"(p + 120)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_7_recursive at its top depth (7), which processes the
// 128 floats starting at buf in place.
void helper_float_7(float* buf);
void helper_float_7(float* buf) {
  enum { kTopDepth = 7 };
  helper_float_7_recursive(buf, kTopDepth);
}
532 static inline void helper_float_8(float* buf);
helper_float_8(float * buf)533 static inline void helper_float_8(float* buf) {
534 for (int j = 0; j < 256; j += 64) {
535 for (int k = 0; k < 4; k += 4) {
536 __asm__ volatile(
537 "LD1 {v0.4S}, [%0]\n"
538 "LD1 {v1.4S}, [%1]\n"
539 "LD1 {v2.4S}, [%2]\n"
540 "LD1 {v3.4S}, [%3]\n"
541 "LD1 {v4.4S}, [%4]\n"
542 "LD1 {v5.4S}, [%5]\n"
543 "LD1 {v6.4S}, [%6]\n"
544 "LD1 {v7.4S}, [%7]\n"
545 "LD1 {v8.4S}, [%8]\n"
546 "LD1 {v9.4S}, [%9]\n"
547 "LD1 {v10.4S}, [%10]\n"
548 "LD1 {v11.4S}, [%11]\n"
549 "LD1 {v12.4S}, [%12]\n"
550 "LD1 {v13.4S}, [%13]\n"
551 "LD1 {v14.4S}, [%14]\n"
552 "LD1 {v15.4S}, [%15]\n"
553 "TRN1 v16.4S, v0.4S, v0.4S\n"
554 "FNEG v17.4S, v0.4S\n"
555 "TRN2 v17.4S, v0.4S, v17.4S\n"
556 "FADD v0.4S, v16.4S, v17.4S\n"
557 "TRN1 v16.4S, v1.4S, v1.4S\n"
558 "FNEG v17.4S, v1.4S\n"
559 "TRN2 v17.4S, v1.4S, v17.4S\n"
560 "FADD v1.4S, v16.4S, v17.4S\n"
561 "TRN1 v16.4S, v2.4S, v2.4S\n"
562 "FNEG v17.4S, v2.4S\n"
563 "TRN2 v17.4S, v2.4S, v17.4S\n"
564 "FADD v2.4S, v16.4S, v17.4S\n"
565 "TRN1 v16.4S, v3.4S, v3.4S\n"
566 "FNEG v17.4S, v3.4S\n"
567 "TRN2 v17.4S, v3.4S, v17.4S\n"
568 "FADD v3.4S, v16.4S, v17.4S\n"
569 "TRN1 v16.4S, v4.4S, v4.4S\n"
570 "FNEG v17.4S, v4.4S\n"
571 "TRN2 v17.4S, v4.4S, v17.4S\n"
572 "FADD v4.4S, v16.4S, v17.4S\n"
573 "TRN1 v16.4S, v5.4S, v5.4S\n"
574 "FNEG v17.4S, v5.4S\n"
575 "TRN2 v17.4S, v5.4S, v17.4S\n"
576 "FADD v5.4S, v16.4S, v17.4S\n"
577 "TRN1 v16.4S, v6.4S, v6.4S\n"
578 "FNEG v17.4S, v6.4S\n"
579 "TRN2 v17.4S, v6.4S, v17.4S\n"
580 "FADD v6.4S, v16.4S, v17.4S\n"
581 "TRN1 v16.4S, v7.4S, v7.4S\n"
582 "FNEG v17.4S, v7.4S\n"
583 "TRN2 v17.4S, v7.4S, v17.4S\n"
584 "FADD v7.4S, v16.4S, v17.4S\n"
585 "TRN1 v16.4S, v8.4S, v8.4S\n"
586 "FNEG v17.4S, v8.4S\n"
587 "TRN2 v17.4S, v8.4S, v17.4S\n"
588 "FADD v8.4S, v16.4S, v17.4S\n"
589 "TRN1 v16.4S, v9.4S, v9.4S\n"
590 "FNEG v17.4S, v9.4S\n"
591 "TRN2 v17.4S, v9.4S, v17.4S\n"
592 "FADD v9.4S, v16.4S, v17.4S\n"
593 "TRN1 v16.4S, v10.4S, v10.4S\n"
594 "FNEG v17.4S, v10.4S\n"
595 "TRN2 v17.4S, v10.4S, v17.4S\n"
596 "FADD v10.4S, v16.4S, v17.4S\n"
597 "TRN1 v16.4S, v11.4S, v11.4S\n"
598 "FNEG v17.4S, v11.4S\n"
599 "TRN2 v17.4S, v11.4S, v17.4S\n"
600 "FADD v11.4S, v16.4S, v17.4S\n"
601 "TRN1 v16.4S, v12.4S, v12.4S\n"
602 "FNEG v17.4S, v12.4S\n"
603 "TRN2 v17.4S, v12.4S, v17.4S\n"
604 "FADD v12.4S, v16.4S, v17.4S\n"
605 "TRN1 v16.4S, v13.4S, v13.4S\n"
606 "FNEG v17.4S, v13.4S\n"
607 "TRN2 v17.4S, v13.4S, v17.4S\n"
608 "FADD v13.4S, v16.4S, v17.4S\n"
609 "TRN1 v16.4S, v14.4S, v14.4S\n"
610 "FNEG v17.4S, v14.4S\n"
611 "TRN2 v17.4S, v14.4S, v17.4S\n"
612 "FADD v14.4S, v16.4S, v17.4S\n"
613 "TRN1 v16.4S, v15.4S, v15.4S\n"
614 "FNEG v17.4S, v15.4S\n"
615 "TRN2 v17.4S, v15.4S, v17.4S\n"
616 "FADD v15.4S, v16.4S, v17.4S\n"
617 "DUP v16.2D, v0.D[0]\n"
618 "FNEG v17.4S, v0.4S\n"
619 "INS v17.D[0], v0.D[1]\n"
620 "FADD v0.4S, v16.4S, v17.4S\n"
621 "DUP v16.2D, v1.D[0]\n"
622 "FNEG v17.4S, v1.4S\n"
623 "INS v17.D[0], v1.D[1]\n"
624 "FADD v1.4S, v16.4S, v17.4S\n"
625 "DUP v16.2D, v2.D[0]\n"
626 "FNEG v17.4S, v2.4S\n"
627 "INS v17.D[0], v2.D[1]\n"
628 "FADD v2.4S, v16.4S, v17.4S\n"
629 "DUP v16.2D, v3.D[0]\n"
630 "FNEG v17.4S, v3.4S\n"
631 "INS v17.D[0], v3.D[1]\n"
632 "FADD v3.4S, v16.4S, v17.4S\n"
633 "DUP v16.2D, v4.D[0]\n"
634 "FNEG v17.4S, v4.4S\n"
635 "INS v17.D[0], v4.D[1]\n"
636 "FADD v4.4S, v16.4S, v17.4S\n"
637 "DUP v16.2D, v5.D[0]\n"
638 "FNEG v17.4S, v5.4S\n"
639 "INS v17.D[0], v5.D[1]\n"
640 "FADD v5.4S, v16.4S, v17.4S\n"
641 "DUP v16.2D, v6.D[0]\n"
642 "FNEG v17.4S, v6.4S\n"
643 "INS v17.D[0], v6.D[1]\n"
644 "FADD v6.4S, v16.4S, v17.4S\n"
645 "DUP v16.2D, v7.D[0]\n"
646 "FNEG v17.4S, v7.4S\n"
647 "INS v17.D[0], v7.D[1]\n"
648 "FADD v7.4S, v16.4S, v17.4S\n"
649 "DUP v16.2D, v8.D[0]\n"
650 "FNEG v17.4S, v8.4S\n"
651 "INS v17.D[0], v8.D[1]\n"
652 "FADD v8.4S, v16.4S, v17.4S\n"
653 "DUP v16.2D, v9.D[0]\n"
654 "FNEG v17.4S, v9.4S\n"
655 "INS v17.D[0], v9.D[1]\n"
656 "FADD v9.4S, v16.4S, v17.4S\n"
657 "DUP v16.2D, v10.D[0]\n"
658 "FNEG v17.4S, v10.4S\n"
659 "INS v17.D[0], v10.D[1]\n"
660 "FADD v10.4S, v16.4S, v17.4S\n"
661 "DUP v16.2D, v11.D[0]\n"
662 "FNEG v17.4S, v11.4S\n"
663 "INS v17.D[0], v11.D[1]\n"
664 "FADD v11.4S, v16.4S, v17.4S\n"
665 "DUP v16.2D, v12.D[0]\n"
666 "FNEG v17.4S, v12.4S\n"
667 "INS v17.D[0], v12.D[1]\n"
668 "FADD v12.4S, v16.4S, v17.4S\n"
669 "DUP v16.2D, v13.D[0]\n"
670 "FNEG v17.4S, v13.4S\n"
671 "INS v17.D[0], v13.D[1]\n"
672 "FADD v13.4S, v16.4S, v17.4S\n"
673 "DUP v16.2D, v14.D[0]\n"
674 "FNEG v17.4S, v14.4S\n"
675 "INS v17.D[0], v14.D[1]\n"
676 "FADD v14.4S, v16.4S, v17.4S\n"
677 "DUP v16.2D, v15.D[0]\n"
678 "FNEG v17.4S, v15.4S\n"
679 "INS v17.D[0], v15.D[1]\n"
680 "FADD v15.4S, v16.4S, v17.4S\n"
681 "FADD v16.4S, v0.4S, v1.4S\n"
682 "FSUB v17.4S, v0.4S, v1.4S\n"
683 "FADD v18.4S, v2.4S, v3.4S\n"
684 "FSUB v19.4S, v2.4S, v3.4S\n"
685 "FADD v20.4S, v4.4S, v5.4S\n"
686 "FSUB v21.4S, v4.4S, v5.4S\n"
687 "FADD v22.4S, v6.4S, v7.4S\n"
688 "FSUB v23.4S, v6.4S, v7.4S\n"
689 "FADD v24.4S, v8.4S, v9.4S\n"
690 "FSUB v25.4S, v8.4S, v9.4S\n"
691 "FADD v26.4S, v10.4S, v11.4S\n"
692 "FSUB v27.4S, v10.4S, v11.4S\n"
693 "FADD v28.4S, v12.4S, v13.4S\n"
694 "FSUB v29.4S, v12.4S, v13.4S\n"
695 "FADD v30.4S, v14.4S, v15.4S\n"
696 "FSUB v31.4S, v14.4S, v15.4S\n"
697 "FADD v0.4S, v16.4S, v18.4S\n"
698 "FSUB v2.4S, v16.4S, v18.4S\n"
699 "FADD v1.4S, v17.4S, v19.4S\n"
700 "FSUB v3.4S, v17.4S, v19.4S\n"
701 "FADD v4.4S, v20.4S, v22.4S\n"
702 "FSUB v6.4S, v20.4S, v22.4S\n"
703 "FADD v5.4S, v21.4S, v23.4S\n"
704 "FSUB v7.4S, v21.4S, v23.4S\n"
705 "FADD v8.4S, v24.4S, v26.4S\n"
706 "FSUB v10.4S, v24.4S, v26.4S\n"
707 "FADD v9.4S, v25.4S, v27.4S\n"
708 "FSUB v11.4S, v25.4S, v27.4S\n"
709 "FADD v12.4S, v28.4S, v30.4S\n"
710 "FSUB v14.4S, v28.4S, v30.4S\n"
711 "FADD v13.4S, v29.4S, v31.4S\n"
712 "FSUB v15.4S, v29.4S, v31.4S\n"
713 "FADD v16.4S, v0.4S, v4.4S\n"
714 "FSUB v20.4S, v0.4S, v4.4S\n"
715 "FADD v17.4S, v1.4S, v5.4S\n"
716 "FSUB v21.4S, v1.4S, v5.4S\n"
717 "FADD v18.4S, v2.4S, v6.4S\n"
718 "FSUB v22.4S, v2.4S, v6.4S\n"
719 "FADD v19.4S, v3.4S, v7.4S\n"
720 "FSUB v23.4S, v3.4S, v7.4S\n"
721 "FADD v24.4S, v8.4S, v12.4S\n"
722 "FSUB v28.4S, v8.4S, v12.4S\n"
723 "FADD v25.4S, v9.4S, v13.4S\n"
724 "FSUB v29.4S, v9.4S, v13.4S\n"
725 "FADD v26.4S, v10.4S, v14.4S\n"
726 "FSUB v30.4S, v10.4S, v14.4S\n"
727 "FADD v27.4S, v11.4S, v15.4S\n"
728 "FSUB v31.4S, v11.4S, v15.4S\n"
729 "FADD v0.4S, v16.4S, v24.4S\n"
730 "FSUB v8.4S, v16.4S, v24.4S\n"
731 "FADD v1.4S, v17.4S, v25.4S\n"
732 "FSUB v9.4S, v17.4S, v25.4S\n"
733 "FADD v2.4S, v18.4S, v26.4S\n"
734 "FSUB v10.4S, v18.4S, v26.4S\n"
735 "FADD v3.4S, v19.4S, v27.4S\n"
736 "FSUB v11.4S, v19.4S, v27.4S\n"
737 "FADD v4.4S, v20.4S, v28.4S\n"
738 "FSUB v12.4S, v20.4S, v28.4S\n"
739 "FADD v5.4S, v21.4S, v29.4S\n"
740 "FSUB v13.4S, v21.4S, v29.4S\n"
741 "FADD v6.4S, v22.4S, v30.4S\n"
742 "FSUB v14.4S, v22.4S, v30.4S\n"
743 "FADD v7.4S, v23.4S, v31.4S\n"
744 "FSUB v15.4S, v23.4S, v31.4S\n"
745 "ST1 {v0.4S}, [%0]\n"
746 "ST1 {v1.4S}, [%1]\n"
747 "ST1 {v2.4S}, [%2]\n"
748 "ST1 {v3.4S}, [%3]\n"
749 "ST1 {v4.4S}, [%4]\n"
750 "ST1 {v5.4S}, [%5]\n"
751 "ST1 {v6.4S}, [%6]\n"
752 "ST1 {v7.4S}, [%7]\n"
753 "ST1 {v8.4S}, [%8]\n"
754 "ST1 {v9.4S}, [%9]\n"
755 "ST1 {v10.4S}, [%10]\n"
756 "ST1 {v11.4S}, [%11]\n"
757 "ST1 {v12.4S}, [%12]\n"
758 "ST1 {v13.4S}, [%13]\n"
759 "ST1 {v14.4S}, [%14]\n"
760 "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
761 "r"(buf + j + k + 4),
762 "r"(buf + j + k + 8),
763 "r"(buf + j + k + 12),
764 "r"(buf + j + k + 16),
765 "r"(buf + j + k + 20),
766 "r"(buf + j + k + 24),
767 "r"(buf + j + k + 28),
768 "r"(buf + j + k + 32),
769 "r"(buf + j + k + 36),
770 "r"(buf + j + k + 40),
771 "r"(buf + j + k + 44),
772 "r"(buf + j + k + 48),
773 "r"(buf + j + k + 52),
774 "r"(buf + j + k + 56),
775 "r"(buf + j + k + 60)
776 : "%v0",
777 "%v1",
778 "%v2",
779 "%v3",
780 "%v4",
781 "%v5",
782 "%v6",
783 "%v7",
784 "%v8",
785 "%v9",
786 "%v10",
787 "%v11",
788 "%v12",
789 "%v13",
790 "%v14",
791 "%v15",
792 "%v16",
793 "%v17",
794 "%v18",
795 "%v19",
796 "%v20",
797 "%v21",
798 "%v22",
799 "%v23",
800 "%v24",
801 "%v25",
802 "%v26",
803 "%v27",
804 "%v28",
805 "%v29",
806 "%v30",
807 "%v31",
808 "memory");
809 }
810 }
811 for (int j = 0; j < 256; j += 256) {
812 for (int k = 0; k < 64; k += 4) {
813 __asm__ volatile(
814 "LD1 {v0.4S}, [%0]\n"
815 "LD1 {v1.4S}, [%1]\n"
816 "LD1 {v2.4S}, [%2]\n"
817 "LD1 {v3.4S}, [%3]\n"
818 "FADD v16.4S, v0.4S, v1.4S\n"
819 "FSUB v17.4S, v0.4S, v1.4S\n"
820 "FADD v18.4S, v2.4S, v3.4S\n"
821 "FSUB v19.4S, v2.4S, v3.4S\n"
822 "FADD v0.4S, v16.4S, v18.4S\n"
823 "FSUB v2.4S, v16.4S, v18.4S\n"
824 "FADD v1.4S, v17.4S, v19.4S\n"
825 "FSUB v3.4S, v17.4S, v19.4S\n"
826 "ST1 {v0.4S}, [%0]\n"
827 "ST1 {v1.4S}, [%1]\n"
828 "ST1 {v2.4S}, [%2]\n"
829 "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0),
830 "r"(buf + j + k + 64),
831 "r"(buf + j + k + 128),
832 "r"(buf + j + k + 192)
833 : "%v0",
834 "%v1",
835 "%v2",
836 "%v3",
837 "%v4",
838 "%v5",
839 "%v6",
840 "%v7",
841 "%v8",
842 "%v9",
843 "%v10",
844 "%v11",
845 "%v12",
846 "%v13",
847 "%v14",
848 "%v15",
849 "%v16",
850 "%v17",
851 "%v18",
852 "%v19",
853 "%v20",
854 "%v21",
855 "%v22",
856 "%v23",
857 "%v24",
858 "%v25",
859 "%v26",
860 "%v27",
861 "%v28",
862 "%v29",
863 "%v30",
864 "%v31",
865 "memory");
866 }
867 }
868 }
// Depth-dispatched transform step.
//   depth == 8: hands buf to helper_float_8.
//   depth == 9: processes buf[0..255] and buf[256..511] at depth 8, then
//               replaces the two halves with their element-wise sum and
//               difference, four floats per assembly block.
// Any other depth returns without touching buf.
void helper_float_9_recursive(float* buf, int depth);
void helper_float_9_recursive(float* buf, int depth) {
  switch (depth) {
    case 8:
      helper_float_8(buf);
      return;
    case 9: {
      helper_float_9_recursive(buf, 8);
      helper_float_9_recursive(buf + 256, 8);
      for (int off = 0; off < 256; off += 4) {
        float* const lower = buf + off;
        float* const upper = lower + 256;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n"
            : /* no outputs */
            : "r"(lower), "r"(upper)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_9_recursive at its top depth (9), which processes the
// 512 floats starting at buf in place.
void helper_float_9(float* buf);
void helper_float_9(float* buf) {
  enum { kTopDepth = 9 };
  helper_float_9_recursive(buf, kTopDepth);
}
// Depth-dispatched transform step.
//   depth == 8:  hands buf to helper_float_8.
//   depth == 10: processes each of the four 256-float quarters of
//                buf[0..1023] at depth 8 (in ascending address order), then
//                combines them with two rounds of add/subtract butterflies at
//                strides of 256 and 512 floats, four lanes per assembly block.
// Any other depth returns without touching buf.
void helper_float_10_recursive(float* buf, int depth);
void helper_float_10_recursive(float* buf, int depth) {
  switch (depth) {
    case 8:
      helper_float_8(buf);
      return;
    case 10: {
      for (int base = 0; base < 1024; base += 256) {
        helper_float_10_recursive(buf + base, 8);
      }
      for (int off = 0; off < 256; off += 4) {
        float* const q = buf + off;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "LD1 {v2.4S}, [%2]\n"
            "LD1 {v3.4S}, [%3]\n"
            // Round 1: neighbouring quarters (stride 256 floats).
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "FADD v18.4S, v2.4S, v3.4S\n"
            "FSUB v19.4S, v2.4S, v3.4S\n"
            // Round 2: quarters two apart (stride 512 floats).
            "FADD v0.4S, v16.4S, v18.4S\n"
            "FSUB v2.4S, v16.4S, v18.4S\n"
            "FADD v1.4S, v17.4S, v19.4S\n"
            "FSUB v3.4S, v17.4S, v19.4S\n"
            "ST1 {v0.4S}, [%0]\n"
            "ST1 {v1.4S}, [%1]\n"
            "ST1 {v2.4S}, [%2]\n"
            "ST1 {v3.4S}, [%3]\n"
            : /* no outputs */
            : "r"(q), "r"(q + 256), "r"(q + 512), "r"(q + 768)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_10_recursive at its top depth (10), which processes the
// 1024 floats starting at buf in place.
void helper_float_10(float* buf);
void helper_float_10(float* buf) {
  enum { kTopDepth = 10 };
  helper_float_10_recursive(buf, kTopDepth);
}
// Depth-dispatched transform step.
//   depth == 10: hands buf to helper_float_10.
//   depth == 11: processes buf[0..1023] and buf[1024..2047] at depth 10, then
//                replaces the two halves with their element-wise sum and
//                difference, four floats per assembly block.
// Any other depth returns without touching buf.
void helper_float_11_recursive(float* buf, int depth);
void helper_float_11_recursive(float* buf, int depth) {
  switch (depth) {
    case 10:
      helper_float_10(buf);
      return;
    case 11: {
      helper_float_11_recursive(buf, 10);
      helper_float_11_recursive(buf + 1024, 10);
      for (int off = 0; off < 1024; off += 4) {
        float* const lower = buf + off;
        float* const upper = lower + 1024;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n"
            : /* no outputs */
            : "r"(lower), "r"(upper)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_11_recursive at its top depth (11), which processes the
// 2048 floats starting at buf in place.
void helper_float_11(float* buf);
void helper_float_11(float* buf) {
  enum { kTopDepth = 11 };
  helper_float_11_recursive(buf, kTopDepth);
}
// Depth-dispatched transform step.
//   depth == 10: hands buf to helper_float_10.
//   depth == 12: processes each of the four 1024-float quarters of
//                buf[0..4095] at depth 10 (in ascending address order), then
//                combines them with two rounds of add/subtract butterflies at
//                strides of 1024 and 2048 floats, four lanes per assembly
//                block.
// Any other depth returns without touching buf.
void helper_float_12_recursive(float* buf, int depth);
void helper_float_12_recursive(float* buf, int depth) {
  switch (depth) {
    case 10:
      helper_float_10(buf);
      return;
    case 12: {
      for (int base = 0; base < 4096; base += 1024) {
        helper_float_12_recursive(buf + base, 10);
      }
      for (int off = 0; off < 1024; off += 4) {
        float* const q = buf + off;
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "LD1 {v2.4S}, [%2]\n"
            "LD1 {v3.4S}, [%3]\n"
            // Round 1: neighbouring quarters (stride 1024 floats).
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "FADD v18.4S, v2.4S, v3.4S\n"
            "FSUB v19.4S, v2.4S, v3.4S\n"
            // Round 2: quarters two apart (stride 2048 floats).
            "FADD v0.4S, v16.4S, v18.4S\n"
            "FSUB v2.4S, v16.4S, v18.4S\n"
            "FADD v1.4S, v17.4S, v19.4S\n"
            "FSUB v3.4S, v17.4S, v19.4S\n"
            "ST1 {v0.4S}, [%0]\n"
            "ST1 {v1.4S}, [%1]\n"
            "ST1 {v2.4S}, [%2]\n"
            "ST1 {v3.4S}, [%3]\n"
            : /* no outputs */
            : "r"(q), "r"(q + 1024), "r"(q + 2048), "r"(q + 3072)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
      return;
    }
    default:
      return;
  }
}
// Runs helper_float_12_recursive at its top depth (12), which processes the
// 4096 floats starting at buf in place.
void helper_float_12(float* buf);
void helper_float_12(float* buf) {
  enum { kTopDepth = 12 };
  helper_float_12_recursive(buf, kTopDepth);
}
1141 static inline void helper_float_13(float* buf);
helper_float_13(float * buf)1142 static inline void helper_float_13(float* buf) {
1143 for (int j = 0; j < 8192; j += 64) {
1144 for (int k = 0; k < 4; k += 4) {
1145 __asm__ volatile(
1146 "LD1 {v0.4S}, [%0]\n"
1147 "LD1 {v1.4S}, [%1]\n"
1148 "LD1 {v2.4S}, [%2]\n"
1149 "LD1 {v3.4S}, [%3]\n"
1150 "LD1 {v4.4S}, [%4]\n"
1151 "LD1 {v5.4S}, [%5]\n"
1152 "LD1 {v6.4S}, [%6]\n"
1153 "LD1 {v7.4S}, [%7]\n"
1154 "LD1 {v8.4S}, [%8]\n"
1155 "LD1 {v9.4S}, [%9]\n"
1156 "LD1 {v10.4S}, [%10]\n"
1157 "LD1 {v11.4S}, [%11]\n"
1158 "LD1 {v12.4S}, [%12]\n"
1159 "LD1 {v13.4S}, [%13]\n"
1160 "LD1 {v14.4S}, [%14]\n"
1161 "LD1 {v15.4S}, [%15]\n"
1162 "TRN1 v16.4S, v0.4S, v0.4S\n"
1163 "FNEG v17.4S, v0.4S\n"
1164 "TRN2 v17.4S, v0.4S, v17.4S\n"
1165 "FADD v0.4S, v16.4S, v17.4S\n"
1166 "TRN1 v16.4S, v1.4S, v1.4S\n"
1167 "FNEG v17.4S, v1.4S\n"
1168 "TRN2 v17.4S, v1.4S, v17.4S\n"
1169 "FADD v1.4S, v16.4S, v17.4S\n"
1170 "TRN1 v16.4S, v2.4S, v2.4S\n"
1171 "FNEG v17.4S, v2.4S\n"
1172 "TRN2 v17.4S, v2.4S, v17.4S\n"
1173 "FADD v2.4S, v16.4S, v17.4S\n"
1174 "TRN1 v16.4S, v3.4S, v3.4S\n"
1175 "FNEG v17.4S, v3.4S\n"
1176 "TRN2 v17.4S, v3.4S, v17.4S\n"
1177 "FADD v3.4S, v16.4S, v17.4S\n"
1178 "TRN1 v16.4S, v4.4S, v4.4S\n"
1179 "FNEG v17.4S, v4.4S\n"
1180 "TRN2 v17.4S, v4.4S, v17.4S\n"
1181 "FADD v4.4S, v16.4S, v17.4S\n"
1182 "TRN1 v16.4S, v5.4S, v5.4S\n"
1183 "FNEG v17.4S, v5.4S\n"
1184 "TRN2 v17.4S, v5.4S, v17.4S\n"
1185 "FADD v5.4S, v16.4S, v17.4S\n"
1186 "TRN1 v16.4S, v6.4S, v6.4S\n"
1187 "FNEG v17.4S, v6.4S\n"
1188 "TRN2 v17.4S, v6.4S, v17.4S\n"
1189 "FADD v6.4S, v16.4S, v17.4S\n"
1190 "TRN1 v16.4S, v7.4S, v7.4S\n"
1191 "FNEG v17.4S, v7.4S\n"
1192 "TRN2 v17.4S, v7.4S, v17.4S\n"
1193 "FADD v7.4S, v16.4S, v17.4S\n"
1194 "TRN1 v16.4S, v8.4S, v8.4S\n"
1195 "FNEG v17.4S, v8.4S\n"
1196 "TRN2 v17.4S, v8.4S, v17.4S\n"
1197 "FADD v8.4S, v16.4S, v17.4S\n"
1198 "TRN1 v16.4S, v9.4S, v9.4S\n"
1199 "FNEG v17.4S, v9.4S\n"
1200 "TRN2 v17.4S, v9.4S, v17.4S\n"
1201 "FADD v9.4S, v16.4S, v17.4S\n"
1202 "TRN1 v16.4S, v10.4S, v10.4S\n"
1203 "FNEG v17.4S, v10.4S\n"
1204 "TRN2 v17.4S, v10.4S, v17.4S\n"
1205 "FADD v10.4S, v16.4S, v17.4S\n"
1206 "TRN1 v16.4S, v11.4S, v11.4S\n"
1207 "FNEG v17.4S, v11.4S\n"
1208 "TRN2 v17.4S, v11.4S, v17.4S\n"
1209 "FADD v11.4S, v16.4S, v17.4S\n"
1210 "TRN1 v16.4S, v12.4S, v12.4S\n"
1211 "FNEG v17.4S, v12.4S\n"
1212 "TRN2 v17.4S, v12.4S, v17.4S\n"
1213 "FADD v12.4S, v16.4S, v17.4S\n"
1214 "TRN1 v16.4S, v13.4S, v13.4S\n"
1215 "FNEG v17.4S, v13.4S\n"
1216 "TRN2 v17.4S, v13.4S, v17.4S\n"
1217 "FADD v13.4S, v16.4S, v17.4S\n"
1218 "TRN1 v16.4S, v14.4S, v14.4S\n"
1219 "FNEG v17.4S, v14.4S\n"
1220 "TRN2 v17.4S, v14.4S, v17.4S\n"
1221 "FADD v14.4S, v16.4S, v17.4S\n"
1222 "TRN1 v16.4S, v15.4S, v15.4S\n"
1223 "FNEG v17.4S, v15.4S\n"
1224 "TRN2 v17.4S, v15.4S, v17.4S\n"
1225 "FADD v15.4S, v16.4S, v17.4S\n"
1226 "DUP v16.2D, v0.D[0]\n"
1227 "FNEG v17.4S, v0.4S\n"
1228 "INS v17.D[0], v0.D[1]\n"
1229 "FADD v0.4S, v16.4S, v17.4S\n"
1230 "DUP v16.2D, v1.D[0]\n"
1231 "FNEG v17.4S, v1.4S\n"
1232 "INS v17.D[0], v1.D[1]\n"
1233 "FADD v1.4S, v16.4S, v17.4S\n"
1234 "DUP v16.2D, v2.D[0]\n"
1235 "FNEG v17.4S, v2.4S\n"
1236 "INS v17.D[0], v2.D[1]\n"
1237 "FADD v2.4S, v16.4S, v17.4S\n"
1238 "DUP v16.2D, v3.D[0]\n"
1239 "FNEG v17.4S, v3.4S\n"
1240 "INS v17.D[0], v3.D[1]\n"
1241 "FADD v3.4S, v16.4S, v17.4S\n"
1242 "DUP v16.2D, v4.D[0]\n"
1243 "FNEG v17.4S, v4.4S\n"
1244 "INS v17.D[0], v4.D[1]\n"
1245 "FADD v4.4S, v16.4S, v17.4S\n"
1246 "DUP v16.2D, v5.D[0]\n"
1247 "FNEG v17.4S, v5.4S\n"
1248 "INS v17.D[0], v5.D[1]\n"
1249 "FADD v5.4S, v16.4S, v17.4S\n"
1250 "DUP v16.2D, v6.D[0]\n"
1251 "FNEG v17.4S, v6.4S\n"
1252 "INS v17.D[0], v6.D[1]\n"
1253 "FADD v6.4S, v16.4S, v17.4S\n"
1254 "DUP v16.2D, v7.D[0]\n"
1255 "FNEG v17.4S, v7.4S\n"
1256 "INS v17.D[0], v7.D[1]\n"
1257 "FADD v7.4S, v16.4S, v17.4S\n"
1258 "DUP v16.2D, v8.D[0]\n"
1259 "FNEG v17.4S, v8.4S\n"
1260 "INS v17.D[0], v8.D[1]\n"
1261 "FADD v8.4S, v16.4S, v17.4S\n"
1262 "DUP v16.2D, v9.D[0]\n"
1263 "FNEG v17.4S, v9.4S\n"
1264 "INS v17.D[0], v9.D[1]\n"
1265 "FADD v9.4S, v16.4S, v17.4S\n"
1266 "DUP v16.2D, v10.D[0]\n"
1267 "FNEG v17.4S, v10.4S\n"
1268 "INS v17.D[0], v10.D[1]\n"
1269 "FADD v10.4S, v16.4S, v17.4S\n"
1270 "DUP v16.2D, v11.D[0]\n"
1271 "FNEG v17.4S, v11.4S\n"
1272 "INS v17.D[0], v11.D[1]\n"
1273 "FADD v11.4S, v16.4S, v17.4S\n"
1274 "DUP v16.2D, v12.D[0]\n"
1275 "FNEG v17.4S, v12.4S\n"
1276 "INS v17.D[0], v12.D[1]\n"
1277 "FADD v12.4S, v16.4S, v17.4S\n"
1278 "DUP v16.2D, v13.D[0]\n"
1279 "FNEG v17.4S, v13.4S\n"
1280 "INS v17.D[0], v13.D[1]\n"
1281 "FADD v13.4S, v16.4S, v17.4S\n"
1282 "DUP v16.2D, v14.D[0]\n"
1283 "FNEG v17.4S, v14.4S\n"
1284 "INS v17.D[0], v14.D[1]\n"
1285 "FADD v14.4S, v16.4S, v17.4S\n"
1286 "DUP v16.2D, v15.D[0]\n"
1287 "FNEG v17.4S, v15.4S\n"
1288 "INS v17.D[0], v15.D[1]\n"
1289 "FADD v15.4S, v16.4S, v17.4S\n"
1290 "FADD v16.4S, v0.4S, v1.4S\n"
1291 "FSUB v17.4S, v0.4S, v1.4S\n"
1292 "FADD v18.4S, v2.4S, v3.4S\n"
1293 "FSUB v19.4S, v2.4S, v3.4S\n"
1294 "FADD v20.4S, v4.4S, v5.4S\n"
1295 "FSUB v21.4S, v4.4S, v5.4S\n"
1296 "FADD v22.4S, v6.4S, v7.4S\n"
1297 "FSUB v23.4S, v6.4S, v7.4S\n"
1298 "FADD v24.4S, v8.4S, v9.4S\n"
1299 "FSUB v25.4S, v8.4S, v9.4S\n"
1300 "FADD v26.4S, v10.4S, v11.4S\n"
1301 "FSUB v27.4S, v10.4S, v11.4S\n"
1302 "FADD v28.4S, v12.4S, v13.4S\n"
1303 "FSUB v29.4S, v12.4S, v13.4S\n"
1304 "FADD v30.4S, v14.4S, v15.4S\n"
1305 "FSUB v31.4S, v14.4S, v15.4S\n"
1306 "FADD v0.4S, v16.4S, v18.4S\n"
1307 "FSUB v2.4S, v16.4S, v18.4S\n"
1308 "FADD v1.4S, v17.4S, v19.4S\n"
1309 "FSUB v3.4S, v17.4S, v19.4S\n"
1310 "FADD v4.4S, v20.4S, v22.4S\n"
1311 "FSUB v6.4S, v20.4S, v22.4S\n"
1312 "FADD v5.4S, v21.4S, v23.4S\n"
1313 "FSUB v7.4S, v21.4S, v23.4S\n"
1314 "FADD v8.4S, v24.4S, v26.4S\n"
1315 "FSUB v10.4S, v24.4S, v26.4S\n"
1316 "FADD v9.4S, v25.4S, v27.4S\n"
1317 "FSUB v11.4S, v25.4S, v27.4S\n"
1318 "FADD v12.4S, v28.4S, v30.4S\n"
1319 "FSUB v14.4S, v28.4S, v30.4S\n"
1320 "FADD v13.4S, v29.4S, v31.4S\n"
1321 "FSUB v15.4S, v29.4S, v31.4S\n"
1322 "FADD v16.4S, v0.4S, v4.4S\n"
1323 "FSUB v20.4S, v0.4S, v4.4S\n"
1324 "FADD v17.4S, v1.4S, v5.4S\n"
1325 "FSUB v21.4S, v1.4S, v5.4S\n"
1326 "FADD v18.4S, v2.4S, v6.4S\n"
1327 "FSUB v22.4S, v2.4S, v6.4S\n"
1328 "FADD v19.4S, v3.4S, v7.4S\n"
1329 "FSUB v23.4S, v3.4S, v7.4S\n"
1330 "FADD v24.4S, v8.4S, v12.4S\n"
1331 "FSUB v28.4S, v8.4S, v12.4S\n"
1332 "FADD v25.4S, v9.4S, v13.4S\n"
1333 "FSUB v29.4S, v9.4S, v13.4S\n"
1334 "FADD v26.4S, v10.4S, v14.4S\n"
1335 "FSUB v30.4S, v10.4S, v14.4S\n"
1336 "FADD v27.4S, v11.4S, v15.4S\n"
1337 "FSUB v31.4S, v11.4S, v15.4S\n"
1338 "FADD v0.4S, v16.4S, v24.4S\n"
1339 "FSUB v8.4S, v16.4S, v24.4S\n"
1340 "FADD v1.4S, v17.4S, v25.4S\n"
1341 "FSUB v9.4S, v17.4S, v25.4S\n"
1342 "FADD v2.4S, v18.4S, v26.4S\n"
1343 "FSUB v10.4S, v18.4S, v26.4S\n"
1344 "FADD v3.4S, v19.4S, v27.4S\n"
1345 "FSUB v11.4S, v19.4S, v27.4S\n"
1346 "FADD v4.4S, v20.4S, v28.4S\n"
1347 "FSUB v12.4S, v20.4S, v28.4S\n"
1348 "FADD v5.4S, v21.4S, v29.4S\n"
1349 "FSUB v13.4S, v21.4S, v29.4S\n"
1350 "FADD v6.4S, v22.4S, v30.4S\n"
1351 "FSUB v14.4S, v22.4S, v30.4S\n"
1352 "FADD v7.4S, v23.4S, v31.4S\n"
1353 "FSUB v15.4S, v23.4S, v31.4S\n"
1354 "ST1 {v0.4S}, [%0]\n"
1355 "ST1 {v1.4S}, [%1]\n"
1356 "ST1 {v2.4S}, [%2]\n"
1357 "ST1 {v3.4S}, [%3]\n"
1358 "ST1 {v4.4S}, [%4]\n"
1359 "ST1 {v5.4S}, [%5]\n"
1360 "ST1 {v6.4S}, [%6]\n"
1361 "ST1 {v7.4S}, [%7]\n"
1362 "ST1 {v8.4S}, [%8]\n"
1363 "ST1 {v9.4S}, [%9]\n"
1364 "ST1 {v10.4S}, [%10]\n"
1365 "ST1 {v11.4S}, [%11]\n"
1366 "ST1 {v12.4S}, [%12]\n"
1367 "ST1 {v13.4S}, [%13]\n"
1368 "ST1 {v14.4S}, [%14]\n"
1369 "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
1370 "r"(buf + j + k + 4),
1371 "r"(buf + j + k + 8),
1372 "r"(buf + j + k + 12),
1373 "r"(buf + j + k + 16),
1374 "r"(buf + j + k + 20),
1375 "r"(buf + j + k + 24),
1376 "r"(buf + j + k + 28),
1377 "r"(buf + j + k + 32),
1378 "r"(buf + j + k + 36),
1379 "r"(buf + j + k + 40),
1380 "r"(buf + j + k + 44),
1381 "r"(buf + j + k + 48),
1382 "r"(buf + j + k + 52),
1383 "r"(buf + j + k + 56),
1384 "r"(buf + j + k + 60)
1385 : "%v0",
1386 "%v1",
1387 "%v2",
1388 "%v3",
1389 "%v4",
1390 "%v5",
1391 "%v6",
1392 "%v7",
1393 "%v8",
1394 "%v9",
1395 "%v10",
1396 "%v11",
1397 "%v12",
1398 "%v13",
1399 "%v14",
1400 "%v15",
1401 "%v16",
1402 "%v17",
1403 "%v18",
1404 "%v19",
1405 "%v20",
1406 "%v21",
1407 "%v22",
1408 "%v23",
1409 "%v24",
1410 "%v25",
1411 "%v26",
1412 "%v27",
1413 "%v28",
1414 "%v29",
1415 "%v30",
1416 "%v31",
1417 "memory");
1418 }
1419 }
1420 for (int j = 0; j < 8192; j += 1024) {
1421 for (int k = 0; k < 64; k += 4) {
1422 __asm__ volatile(
1423 "LD1 {v0.4S}, [%0]\n"
1424 "LD1 {v1.4S}, [%1]\n"
1425 "LD1 {v2.4S}, [%2]\n"
1426 "LD1 {v3.4S}, [%3]\n"
1427 "LD1 {v4.4S}, [%4]\n"
1428 "LD1 {v5.4S}, [%5]\n"
1429 "LD1 {v6.4S}, [%6]\n"
1430 "LD1 {v7.4S}, [%7]\n"
1431 "LD1 {v8.4S}, [%8]\n"
1432 "LD1 {v9.4S}, [%9]\n"
1433 "LD1 {v10.4S}, [%10]\n"
1434 "LD1 {v11.4S}, [%11]\n"
1435 "LD1 {v12.4S}, [%12]\n"
1436 "LD1 {v13.4S}, [%13]\n"
1437 "LD1 {v14.4S}, [%14]\n"
1438 "LD1 {v15.4S}, [%15]\n"
1439 "FADD v16.4S, v0.4S, v1.4S\n"
1440 "FSUB v17.4S, v0.4S, v1.4S\n"
1441 "FADD v18.4S, v2.4S, v3.4S\n"
1442 "FSUB v19.4S, v2.4S, v3.4S\n"
1443 "FADD v20.4S, v4.4S, v5.4S\n"
1444 "FSUB v21.4S, v4.4S, v5.4S\n"
1445 "FADD v22.4S, v6.4S, v7.4S\n"
1446 "FSUB v23.4S, v6.4S, v7.4S\n"
1447 "FADD v24.4S, v8.4S, v9.4S\n"
1448 "FSUB v25.4S, v8.4S, v9.4S\n"
1449 "FADD v26.4S, v10.4S, v11.4S\n"
1450 "FSUB v27.4S, v10.4S, v11.4S\n"
1451 "FADD v28.4S, v12.4S, v13.4S\n"
1452 "FSUB v29.4S, v12.4S, v13.4S\n"
1453 "FADD v30.4S, v14.4S, v15.4S\n"
1454 "FSUB v31.4S, v14.4S, v15.4S\n"
1455 "FADD v0.4S, v16.4S, v18.4S\n"
1456 "FSUB v2.4S, v16.4S, v18.4S\n"
1457 "FADD v1.4S, v17.4S, v19.4S\n"
1458 "FSUB v3.4S, v17.4S, v19.4S\n"
1459 "FADD v4.4S, v20.4S, v22.4S\n"
1460 "FSUB v6.4S, v20.4S, v22.4S\n"
1461 "FADD v5.4S, v21.4S, v23.4S\n"
1462 "FSUB v7.4S, v21.4S, v23.4S\n"
1463 "FADD v8.4S, v24.4S, v26.4S\n"
1464 "FSUB v10.4S, v24.4S, v26.4S\n"
1465 "FADD v9.4S, v25.4S, v27.4S\n"
1466 "FSUB v11.4S, v25.4S, v27.4S\n"
1467 "FADD v12.4S, v28.4S, v30.4S\n"
1468 "FSUB v14.4S, v28.4S, v30.4S\n"
1469 "FADD v13.4S, v29.4S, v31.4S\n"
1470 "FSUB v15.4S, v29.4S, v31.4S\n"
1471 "FADD v16.4S, v0.4S, v4.4S\n"
1472 "FSUB v20.4S, v0.4S, v4.4S\n"
1473 "FADD v17.4S, v1.4S, v5.4S\n"
1474 "FSUB v21.4S, v1.4S, v5.4S\n"
1475 "FADD v18.4S, v2.4S, v6.4S\n"
1476 "FSUB v22.4S, v2.4S, v6.4S\n"
1477 "FADD v19.4S, v3.4S, v7.4S\n"
1478 "FSUB v23.4S, v3.4S, v7.4S\n"
1479 "FADD v24.4S, v8.4S, v12.4S\n"
1480 "FSUB v28.4S, v8.4S, v12.4S\n"
1481 "FADD v25.4S, v9.4S, v13.4S\n"
1482 "FSUB v29.4S, v9.4S, v13.4S\n"
1483 "FADD v26.4S, v10.4S, v14.4S\n"
1484 "FSUB v30.4S, v10.4S, v14.4S\n"
1485 "FADD v27.4S, v11.4S, v15.4S\n"
1486 "FSUB v31.4S, v11.4S, v15.4S\n"
1487 "FADD v0.4S, v16.4S, v24.4S\n"
1488 "FSUB v8.4S, v16.4S, v24.4S\n"
1489 "FADD v1.4S, v17.4S, v25.4S\n"
1490 "FSUB v9.4S, v17.4S, v25.4S\n"
1491 "FADD v2.4S, v18.4S, v26.4S\n"
1492 "FSUB v10.4S, v18.4S, v26.4S\n"
1493 "FADD v3.4S, v19.4S, v27.4S\n"
1494 "FSUB v11.4S, v19.4S, v27.4S\n"
1495 "FADD v4.4S, v20.4S, v28.4S\n"
1496 "FSUB v12.4S, v20.4S, v28.4S\n"
1497 "FADD v5.4S, v21.4S, v29.4S\n"
1498 "FSUB v13.4S, v21.4S, v29.4S\n"
1499 "FADD v6.4S, v22.4S, v30.4S\n"
1500 "FSUB v14.4S, v22.4S, v30.4S\n"
1501 "FADD v7.4S, v23.4S, v31.4S\n"
1502 "FSUB v15.4S, v23.4S, v31.4S\n"
1503 "ST1 {v0.4S}, [%0]\n"
1504 "ST1 {v1.4S}, [%1]\n"
1505 "ST1 {v2.4S}, [%2]\n"
1506 "ST1 {v3.4S}, [%3]\n"
1507 "ST1 {v4.4S}, [%4]\n"
1508 "ST1 {v5.4S}, [%5]\n"
1509 "ST1 {v6.4S}, [%6]\n"
1510 "ST1 {v7.4S}, [%7]\n"
1511 "ST1 {v8.4S}, [%8]\n"
1512 "ST1 {v9.4S}, [%9]\n"
1513 "ST1 {v10.4S}, [%10]\n"
1514 "ST1 {v11.4S}, [%11]\n"
1515 "ST1 {v12.4S}, [%12]\n"
1516 "ST1 {v13.4S}, [%13]\n"
1517 "ST1 {v14.4S}, [%14]\n"
1518 "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
1519 "r"(buf + j + k + 64),
1520 "r"(buf + j + k + 128),
1521 "r"(buf + j + k + 192),
1522 "r"(buf + j + k + 256),
1523 "r"(buf + j + k + 320),
1524 "r"(buf + j + k + 384),
1525 "r"(buf + j + k + 448),
1526 "r"(buf + j + k + 512),
1527 "r"(buf + j + k + 576),
1528 "r"(buf + j + k + 640),
1529 "r"(buf + j + k + 704),
1530 "r"(buf + j + k + 768),
1531 "r"(buf + j + k + 832),
1532 "r"(buf + j + k + 896),
1533 "r"(buf + j + k + 960)
1534 : "%v0",
1535 "%v1",
1536 "%v2",
1537 "%v3",
1538 "%v4",
1539 "%v5",
1540 "%v6",
1541 "%v7",
1542 "%v8",
1543 "%v9",
1544 "%v10",
1545 "%v11",
1546 "%v12",
1547 "%v13",
1548 "%v14",
1549 "%v15",
1550 "%v16",
1551 "%v17",
1552 "%v18",
1553 "%v19",
1554 "%v20",
1555 "%v21",
1556 "%v22",
1557 "%v23",
1558 "%v24",
1559 "%v25",
1560 "%v26",
1561 "%v27",
1562 "%v28",
1563 "%v29",
1564 "%v30",
1565 "%v31",
1566 "memory");
1567 }
1568 }
1569 for (int j = 0; j < 8192; j += 8192) {
1570 for (int k = 0; k < 1024; k += 4) {
1571 __asm__ volatile(
1572 "LD1 {v0.4S}, [%0]\n"
1573 "LD1 {v1.4S}, [%1]\n"
1574 "LD1 {v2.4S}, [%2]\n"
1575 "LD1 {v3.4S}, [%3]\n"
1576 "LD1 {v4.4S}, [%4]\n"
1577 "LD1 {v5.4S}, [%5]\n"
1578 "LD1 {v6.4S}, [%6]\n"
1579 "LD1 {v7.4S}, [%7]\n"
1580 "FADD v16.4S, v0.4S, v1.4S\n"
1581 "FSUB v17.4S, v0.4S, v1.4S\n"
1582 "FADD v18.4S, v2.4S, v3.4S\n"
1583 "FSUB v19.4S, v2.4S, v3.4S\n"
1584 "FADD v20.4S, v4.4S, v5.4S\n"
1585 "FSUB v21.4S, v4.4S, v5.4S\n"
1586 "FADD v22.4S, v6.4S, v7.4S\n"
1587 "FSUB v23.4S, v6.4S, v7.4S\n"
1588 "FADD v0.4S, v16.4S, v18.4S\n"
1589 "FSUB v2.4S, v16.4S, v18.4S\n"
1590 "FADD v1.4S, v17.4S, v19.4S\n"
1591 "FSUB v3.4S, v17.4S, v19.4S\n"
1592 "FADD v4.4S, v20.4S, v22.4S\n"
1593 "FSUB v6.4S, v20.4S, v22.4S\n"
1594 "FADD v5.4S, v21.4S, v23.4S\n"
1595 "FSUB v7.4S, v21.4S, v23.4S\n"
1596 "FADD v16.4S, v0.4S, v4.4S\n"
1597 "FSUB v20.4S, v0.4S, v4.4S\n"
1598 "FADD v17.4S, v1.4S, v5.4S\n"
1599 "FSUB v21.4S, v1.4S, v5.4S\n"
1600 "FADD v18.4S, v2.4S, v6.4S\n"
1601 "FSUB v22.4S, v2.4S, v6.4S\n"
1602 "FADD v19.4S, v3.4S, v7.4S\n"
1603 "FSUB v23.4S, v3.4S, v7.4S\n"
1604 "ST1 {v16.4S}, [%0]\n"
1605 "ST1 {v17.4S}, [%1]\n"
1606 "ST1 {v18.4S}, [%2]\n"
1607 "ST1 {v19.4S}, [%3]\n"
1608 "ST1 {v20.4S}, [%4]\n"
1609 "ST1 {v21.4S}, [%5]\n"
1610 "ST1 {v22.4S}, [%6]\n"
1611 "ST1 {v23.4S}, [%7]\n" ::"r"(buf + j + k + 0),
1612 "r"(buf + j + k + 1024),
1613 "r"(buf + j + k + 2048),
1614 "r"(buf + j + k + 3072),
1615 "r"(buf + j + k + 4096),
1616 "r"(buf + j + k + 5120),
1617 "r"(buf + j + k + 6144),
1618 "r"(buf + j + k + 7168)
1619 : "%v0",
1620 "%v1",
1621 "%v2",
1622 "%v3",
1623 "%v4",
1624 "%v5",
1625 "%v6",
1626 "%v7",
1627 "%v8",
1628 "%v9",
1629 "%v10",
1630 "%v11",
1631 "%v12",
1632 "%v13",
1633 "%v14",
1634 "%v15",
1635 "%v16",
1636 "%v17",
1637 "%v18",
1638 "%v19",
1639 "%v20",
1640 "%v21",
1641 "%v22",
1642 "%v23",
1643 "%v24",
1644 "%v25",
1645 "%v26",
1646 "%v27",
1647 "%v28",
1648 "%v29",
1649 "%v30",
1650 "%v31",
1651 "memory");
1652 }
1653 }
1654 }
void helper_float_14_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 16383].
//
//   depth == 10: forwards to helper_float_10(buf).
//   depth == 14: runs the depth-10 case on each of the sixteen consecutive
//                1024-float blocks of buf, then merges the blocks with a
//                16-input sum/difference network, four floats per asm
//                statement.
//   any other depth: returns without touching buf.
void helper_float_14_recursive(float* buf, int depth) {
  enum { kBlock = 1024, kBlocks = 16 };

  if (depth == 10) {
    helper_float_10(buf);
    return;
  }
  if (depth != 14) {
    return;
  }

  for (int b = 0; b < kBlocks; ++b) {
    helper_float_14_recursive(buf + b * kBlock, 10);
  }

  for (int k = 0; k < kBlock; k += 4) {
    // Operand %i addresses the four floats at offset k inside block i.
    float* p = buf + k;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "LD1 {v4.4S}, [%4]\n"
        "LD1 {v5.4S}, [%5]\n"
        "LD1 {v6.4S}, [%6]\n"
        "LD1 {v7.4S}, [%7]\n"
        "LD1 {v8.4S}, [%8]\n"
        "LD1 {v9.4S}, [%9]\n"
        "LD1 {v10.4S}, [%10]\n"
        "LD1 {v11.4S}, [%11]\n"
        "LD1 {v12.4S}, [%12]\n"
        "LD1 {v13.4S}, [%13]\n"
        "LD1 {v14.4S}, [%14]\n"
        "LD1 {v15.4S}, [%15]\n"
        // Stage 1: sum/difference of neighbouring inputs, v0..v15 -> v16..v31.
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v20.4S, v4.4S, v5.4S\n"
        "FSUB v21.4S, v4.4S, v5.4S\n"
        "FADD v22.4S, v6.4S, v7.4S\n"
        "FSUB v23.4S, v6.4S, v7.4S\n"
        "FADD v24.4S, v8.4S, v9.4S\n"
        "FSUB v25.4S, v8.4S, v9.4S\n"
        "FADD v26.4S, v10.4S, v11.4S\n"
        "FSUB v27.4S, v10.4S, v11.4S\n"
        "FADD v28.4S, v12.4S, v13.4S\n"
        "FSUB v29.4S, v12.4S, v13.4S\n"
        "FADD v30.4S, v14.4S, v15.4S\n"
        "FSUB v31.4S, v14.4S, v15.4S\n"
        // Stage 2: pairs two registers apart, v16..v31 -> v0..v15.
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "FADD v4.4S, v20.4S, v22.4S\n"
        "FSUB v6.4S, v20.4S, v22.4S\n"
        "FADD v5.4S, v21.4S, v23.4S\n"
        "FSUB v7.4S, v21.4S, v23.4S\n"
        "FADD v8.4S, v24.4S, v26.4S\n"
        "FSUB v10.4S, v24.4S, v26.4S\n"
        "FADD v9.4S, v25.4S, v27.4S\n"
        "FSUB v11.4S, v25.4S, v27.4S\n"
        "FADD v12.4S, v28.4S, v30.4S\n"
        "FSUB v14.4S, v28.4S, v30.4S\n"
        "FADD v13.4S, v29.4S, v31.4S\n"
        "FSUB v15.4S, v29.4S, v31.4S\n"
        // Stage 3: pairs four registers apart, v0..v15 -> v16..v31.
        "FADD v16.4S, v0.4S, v4.4S\n"
        "FSUB v20.4S, v0.4S, v4.4S\n"
        "FADD v17.4S, v1.4S, v5.4S\n"
        "FSUB v21.4S, v1.4S, v5.4S\n"
        "FADD v18.4S, v2.4S, v6.4S\n"
        "FSUB v22.4S, v2.4S, v6.4S\n"
        "FADD v19.4S, v3.4S, v7.4S\n"
        "FSUB v23.4S, v3.4S, v7.4S\n"
        "FADD v24.4S, v8.4S, v12.4S\n"
        "FSUB v28.4S, v8.4S, v12.4S\n"
        "FADD v25.4S, v9.4S, v13.4S\n"
        "FSUB v29.4S, v9.4S, v13.4S\n"
        "FADD v26.4S, v10.4S, v14.4S\n"
        "FSUB v30.4S, v10.4S, v14.4S\n"
        "FADD v27.4S, v11.4S, v15.4S\n"
        "FSUB v31.4S, v11.4S, v15.4S\n"
        // Stage 4: pairs eight registers apart, v16..v31 -> v0..v15.
        "FADD v0.4S, v16.4S, v24.4S\n"
        "FSUB v8.4S, v16.4S, v24.4S\n"
        "FADD v1.4S, v17.4S, v25.4S\n"
        "FSUB v9.4S, v17.4S, v25.4S\n"
        "FADD v2.4S, v18.4S, v26.4S\n"
        "FSUB v10.4S, v18.4S, v26.4S\n"
        "FADD v3.4S, v19.4S, v27.4S\n"
        "FSUB v11.4S, v19.4S, v27.4S\n"
        "FADD v4.4S, v20.4S, v28.4S\n"
        "FSUB v12.4S, v20.4S, v28.4S\n"
        "FADD v5.4S, v21.4S, v29.4S\n"
        "FSUB v13.4S, v21.4S, v29.4S\n"
        "FADD v6.4S, v22.4S, v30.4S\n"
        "FSUB v14.4S, v22.4S, v30.4S\n"
        "FADD v7.4S, v23.4S, v31.4S\n"
        "FSUB v15.4S, v23.4S, v31.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        "ST1 {v4.4S}, [%4]\n"
        "ST1 {v5.4S}, [%5]\n"
        "ST1 {v6.4S}, [%6]\n"
        "ST1 {v7.4S}, [%7]\n"
        "ST1 {v8.4S}, [%8]\n"
        "ST1 {v9.4S}, [%9]\n"
        "ST1 {v10.4S}, [%10]\n"
        "ST1 {v11.4S}, [%11]\n"
        "ST1 {v12.4S}, [%12]\n"
        "ST1 {v13.4S}, [%13]\n"
        "ST1 {v14.4S}, [%14]\n"
        "ST1 {v15.4S}, [%15]\n"
        :
        : "r"(p + 0 * kBlock), "r"(p + 1 * kBlock), "r"(p + 2 * kBlock),
          "r"(p + 3 * kBlock), "r"(p + 4 * kBlock), "r"(p + 5 * kBlock),
          "r"(p + 6 * kBlock), "r"(p + 7 * kBlock), "r"(p + 8 * kBlock),
          "r"(p + 9 * kBlock), "r"(p + 10 * kBlock), "r"(p + 11 * kBlock),
          "r"(p + 12 * kBlock), "r"(p + 13 * kBlock), "r"(p + 14 * kBlock),
          "r"(p + 15 * kBlock)
        // Every vector register appears in the template, so all are listed.
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_14(float* buf);
// Entry point for depth 14: hands buf to helper_float_14_recursive at its
// top-level depth, where that function works on buf[0 .. 16383].
void helper_float_14(float* buf) {
  const int top_depth = 14;
  helper_float_14_recursive(buf, top_depth);
}
void helper_float_15_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 32767].
//
//   depth == 13: forwards to helper_float_13(buf).
//   depth == 15: runs the depth-13 case on each 8192-float quarter of buf,
//                then merges the quarters four floats at a time. With a, b,
//                c, d the vectors read from quarters 0..3, it writes back
//                (a+b)+(c+d), (a-b)+(c-d), (a+b)-(c+d), (a-b)-(c-d).
//   any other depth: returns without touching buf.
void helper_float_15_recursive(float* buf, int depth) {
  enum { kQuarter = 8192 };

  if (depth == 13) {
    helper_float_13(buf);
    return;
  }
  if (depth != 15) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_15_recursive(buf + q * kQuarter, 13);
  }

  for (int k = 0; k < kQuarter; k += 4) {
    float* q0 = buf + k;
    float* q1 = q0 + kQuarter;
    float* q2 = q1 + kQuarter;
    float* q3 = q2 + kQuarter;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(q0), "r"(q1), "r"(q2), "r"(q3)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v2", "%v3", "%v16", "%v17", "%v18", "%v19",
          "memory");
  }
}
void helper_float_15(float* buf);
// Entry point for depth 15: hands buf to helper_float_15_recursive at its
// top-level depth, where that function works on buf[0 .. 32767].
void helper_float_15(float* buf) {
  const int top_depth = 15;
  helper_float_15_recursive(buf, top_depth);
}
void helper_float_16_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 65535].
//
//   depth == 15: forwards to helper_float_15(buf).
//   depth == 16: runs the depth-15 case on each 32768-float half of buf, then
//                replaces every pair (buf[i], buf[i + 32768]) with
//                (sum, difference), four floats per asm statement.
//   any other depth: returns without touching buf.
void helper_float_16_recursive(float* buf, int depth) {
  enum { kHalf = 32768 };

  if (depth == 15) {
    helper_float_15(buf);
    return;
  }
  if (depth != 16) {
    return;
  }

  helper_float_16_recursive(buf, 15);
  helper_float_16_recursive(buf + kHalf, 15);

  for (int k = 0; k < kHalf; k += 4) {
    float* lo = buf + k;
    float* hi = lo + kHalf;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v16", "%v17", "memory");
  }
}
void helper_float_16(float* buf);
// Entry point for depth 16: hands buf to helper_float_16_recursive at its
// top-level depth, where that function works on buf[0 .. 65535].
void helper_float_16(float* buf) {
  const int top_depth = 16;
  helper_float_16_recursive(buf, top_depth);
}
void helper_float_17_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 131071].
//
//   depth == 15: forwards to helper_float_15(buf).
//   depth == 17: runs the depth-15 case on each 32768-float quarter of buf,
//                then merges the quarters four floats at a time. With a, b,
//                c, d the vectors read from quarters 0..3, it writes back
//                (a+b)+(c+d), (a-b)+(c-d), (a+b)-(c+d), (a-b)-(c-d).
//   any other depth: returns without touching buf.
void helper_float_17_recursive(float* buf, int depth) {
  enum { kQuarter = 32768 };

  if (depth == 15) {
    helper_float_15(buf);
    return;
  }
  if (depth != 17) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_17_recursive(buf + q * kQuarter, 15);
  }

  for (int k = 0; k < kQuarter; k += 4) {
    float* q0 = buf + k;
    float* q1 = q0 + kQuarter;
    float* q2 = q1 + kQuarter;
    float* q3 = q2 + kQuarter;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(q0), "r"(q1), "r"(q2), "r"(q3)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v2", "%v3", "%v16", "%v17", "%v18", "%v19",
          "memory");
  }
}
void helper_float_17(float* buf);
// Entry point for depth 17: hands buf to helper_float_17_recursive at its
// top-level depth, where that function works on buf[0 .. 131071].
void helper_float_17(float* buf) {
  const int top_depth = 17;
  helper_float_17_recursive(buf, top_depth);
}
void helper_float_18_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 262143].
//
//   depth == 17: forwards to helper_float_17(buf).
//   depth == 18: runs the depth-17 case on each 131072-float half of buf,
//                then replaces every pair (buf[i], buf[i + 131072]) with
//                (sum, difference), four floats per asm statement.
//   any other depth: returns without touching buf.
void helper_float_18_recursive(float* buf, int depth) {
  enum { kHalf = 131072 };

  if (depth == 17) {
    helper_float_17(buf);
    return;
  }
  if (depth != 18) {
    return;
  }

  helper_float_18_recursive(buf, 17);
  helper_float_18_recursive(buf + kHalf, 17);

  for (int k = 0; k < kHalf; k += 4) {
    float* lo = buf + k;
    float* hi = lo + kHalf;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v16", "%v17", "memory");
  }
}
void helper_float_18(float* buf);
// Entry point for depth 18: hands buf to helper_float_18_recursive at its
// top-level depth, where that function works on buf[0 .. 262143].
void helper_float_18(float* buf) {
  const int top_depth = 18;
  helper_float_18_recursive(buf, top_depth);
}
void helper_float_19_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 524287].
//
//   depth == 18: forwards to helper_float_18(buf).
//   depth == 19: runs the depth-18 case on each 262144-float half of buf,
//                then replaces every pair (buf[i], buf[i + 262144]) with
//                (sum, difference), four floats per asm statement.
//   any other depth: returns without touching buf.
void helper_float_19_recursive(float* buf, int depth) {
  enum { kHalf = 262144 };

  if (depth == 18) {
    helper_float_18(buf);
    return;
  }
  if (depth != 19) {
    return;
  }

  helper_float_19_recursive(buf, 18);
  helper_float_19_recursive(buf + kHalf, 18);

  for (int k = 0; k < kHalf; k += 4) {
    float* lo = buf + k;
    float* hi = lo + kHalf;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v16", "%v17", "memory");
  }
}
void helper_float_19(float* buf);
// Entry point for depth 19: hands buf to helper_float_19_recursive at its
// top-level depth, where that function works on buf[0 .. 524287].
void helper_float_19(float* buf) {
  const int top_depth = 19;
  helper_float_19_recursive(buf, top_depth);
}
void helper_float_20_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 1048575].
//
//   depth == 18: forwards to helper_float_18(buf).
//   depth == 20: runs the depth-18 case on each 262144-float quarter of buf,
//                then merges the quarters four floats at a time. With a, b,
//                c, d the vectors read from quarters 0..3, it writes back
//                (a+b)+(c+d), (a-b)+(c-d), (a+b)-(c+d), (a-b)-(c-d).
//   any other depth: returns without touching buf.
void helper_float_20_recursive(float* buf, int depth) {
  enum { kQuarter = 262144 };

  if (depth == 18) {
    helper_float_18(buf);
    return;
  }
  if (depth != 20) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_20_recursive(buf + q * kQuarter, 18);
  }

  for (int k = 0; k < kQuarter; k += 4) {
    float* q0 = buf + k;
    float* q1 = q0 + kQuarter;
    float* q2 = q1 + kQuarter;
    float* q3 = q2 + kQuarter;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(q0), "r"(q1), "r"(q2), "r"(q3)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v2", "%v3", "%v16", "%v17", "%v18", "%v19",
          "memory");
  }
}
void helper_float_20(float* buf);
// Entry point for depth 20: hands buf to helper_float_20_recursive at its
// top-level depth, where that function works on buf[0 .. 1048575].
void helper_float_20(float* buf) {
  const int top_depth = 20;
  helper_float_20_recursive(buf, top_depth);
}
void helper_float_21_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 2097151].
//
//   depth == 20: forwards to helper_float_20(buf).
//   depth == 21: runs the depth-20 case on each 1048576-float half of buf,
//                then replaces every pair (buf[i], buf[i + 1048576]) with
//                (sum, difference), four floats per asm statement.
//   any other depth: returns without touching buf.
void helper_float_21_recursive(float* buf, int depth) {
  enum { kHalf = 1048576 };

  if (depth == 20) {
    helper_float_20(buf);
    return;
  }
  if (depth != 21) {
    return;
  }

  helper_float_21_recursive(buf, 20);
  helper_float_21_recursive(buf + kHalf, 20);

  for (int k = 0; k < kHalf; k += 4) {
    float* lo = buf + k;
    float* hi = lo + kHalf;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v16", "%v17", "memory");
  }
}
void helper_float_21(float* buf);
// Entry point for depth 21: hands buf to helper_float_21_recursive at its
// top-level depth, where that function works on buf[0 .. 2097151].
void helper_float_21(float* buf) {
  const int top_depth = 21;
  helper_float_21_recursive(buf, top_depth);
}
void helper_float_22_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 4194303].
//
//   depth == 20: forwards to helper_float_20(buf).
//   depth == 22: runs the depth-20 case on each 1048576-float quarter of buf,
//                then merges the quarters four floats at a time. With a, b,
//                c, d the vectors read from quarters 0..3, it writes back
//                (a+b)+(c+d), (a-b)+(c-d), (a+b)-(c+d), (a-b)-(c-d).
//   any other depth: returns without touching buf.
void helper_float_22_recursive(float* buf, int depth) {
  enum { kQuarter = 1048576 };

  if (depth == 20) {
    helper_float_20(buf);
    return;
  }
  if (depth != 22) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_22_recursive(buf + q * kQuarter, 20);
  }

  for (int k = 0; k < kQuarter; k += 4) {
    float* q0 = buf + k;
    float* q1 = q0 + kQuarter;
    float* q2 = q1 + kQuarter;
    float* q3 = q2 + kQuarter;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(q0), "r"(q1), "r"(q2), "r"(q3)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v2", "%v3", "%v16", "%v17", "%v18", "%v19",
          "memory");
  }
}
void helper_float_22(float* buf);
// Entry point for depth 22: hands buf to helper_float_22_recursive at its
// top-level depth, where that function works on buf[0 .. 4194303].
void helper_float_22(float* buf) {
  const int top_depth = 22;
  helper_float_22_recursive(buf, top_depth);
}
void helper_float_23_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 8388607].
//
//   depth == 22: forwards to helper_float_22(buf).
//   depth == 23: runs the depth-22 case on each 4194304-float half of buf,
//                then replaces every pair (buf[i], buf[i + 4194304]) with
//                (sum, difference), four floats per asm statement.
//   any other depth: returns without touching buf.
void helper_float_23_recursive(float* buf, int depth) {
  enum { kHalf = 4194304 };

  if (depth == 22) {
    helper_float_22(buf);
    return;
  }
  if (depth != 23) {
    return;
  }

  helper_float_23_recursive(buf, 22);
  helper_float_23_recursive(buf + kHalf, 22);

  for (int k = 0; k < kHalf; k += 4) {
    float* lo = buf + k;
    float* hi = lo + kHalf;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v16", "%v17", "memory");
  }
}
void helper_float_23(float* buf);
// Entry point for depth 23: hands buf to helper_float_23_recursive at its
// top-level depth, where that function works on buf[0 .. 8388607].
void helper_float_23(float* buf) {
  const int top_depth = 23;
  helper_float_23_recursive(buf, top_depth);
}
void helper_float_24_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 16777215].
//
//   depth == 23: forwards to helper_float_23(buf).
//   depth == 24: runs the depth-23 case on each 8388608-float half of buf,
//                then replaces every pair (buf[i], buf[i + 8388608]) with
//                (sum, difference), four floats per asm statement.
//   any other depth: returns without touching buf.
void helper_float_24_recursive(float* buf, int depth) {
  enum { kHalf = 8388608 };

  if (depth == 23) {
    helper_float_23(buf);
    return;
  }
  if (depth != 24) {
    return;
  }

  helper_float_24_recursive(buf, 23);
  helper_float_24_recursive(buf + kHalf, 23);

  for (int k = 0; k < kHalf; k += 4) {
    float* lo = buf + k;
    float* hi = lo + kHalf;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v16", "%v17", "memory");
  }
}
void helper_float_24(float* buf);
// Entry point for depth 24: hands buf to helper_float_24_recursive at its
// top-level depth, where that function works on buf[0 .. 16777215].
void helper_float_24(float* buf) {
  const int top_depth = 24;
  helper_float_24_recursive(buf, top_depth);
}
void helper_float_25_recursive(float* buf, int depth);
// Depth-dispatched kernel over buf[0 .. 33554431].
//
//   depth == 23: forwards to helper_float_23(buf).
//   depth == 25: runs the depth-23 case on each 8388608-float quarter of buf,
//                then merges the quarters four floats at a time. With a, b,
//                c, d the vectors read from quarters 0..3, it writes back
//                (a+b)+(c+d), (a-b)+(c-d), (a+b)-(c+d), (a-b)-(c-d).
//   any other depth: returns without touching buf.
void helper_float_25_recursive(float* buf, int depth) {
  enum { kQuarter = 8388608 };

  if (depth == 23) {
    helper_float_23(buf);
    return;
  }
  if (depth != 25) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_25_recursive(buf + q * kQuarter, 23);
  }

  for (int k = 0; k < kQuarter; k += 4) {
    float* q0 = buf + k;
    float* q1 = q0 + kQuarter;
    float* q2 = q1 + kQuarter;
    float* q3 = q2 + kQuarter;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(q0), "r"(q1), "r"(q2), "r"(q3)
        // Only the registers the template writes are listed, so the compiler
        // does not have to preserve the untouched ones around this statement.
        : "%v0", "%v1", "%v2", "%v3", "%v16", "%v17", "%v18", "%v19",
          "memory");
  }
}
void helper_float_25(float* buf);
// Top-level entry for the depth-25 kernel: forwards buf to
// helper_float_25_recursive with the starting depth fixed at 25.
void helper_float_25(float* buf) {
  const int top_depth = 25;
  helper_float_25_recursive(buf, top_depth);
}
void helper_float_26_recursive(float* buf, int depth);
// Recursive step over a block of 2^26 floats starting at buf.
//   depth == 25: forwards to helper_float_25(buf).
//   depth == 26: processes each 2^25-float half at depth 25, then merges the
//                two halves in place, four lanes per iteration: the low half
//                receives (lo + hi) and the high half receives (lo - hi).
//   any other depth: returns without touching buf.
void helper_float_26_recursive(float* buf, int depth) {
  const int half = 1 << 25;  // 33554432 floats per half

  if (depth == 25) {
    helper_float_25(buf);
    return;
  }
  if (depth != 26) {
    return;
  }

  helper_float_26_recursive(buf, 25);
  helper_float_26_recursive(buf + half, 25);

  for (int i = 0; i < half; i += 4) {
    float* lo = buf + i;
    float* hi = lo + half;
    // The template only writes v0, v1, v16 and v17; the clobber list names
    // every vector register, which is a superset of those.
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_26(float* buf);
// Top-level entry for the depth-26 kernel: forwards buf to
// helper_float_26_recursive with the starting depth fixed at 26.
void helper_float_26(float* buf) {
  const int top_depth = 26;
  helper_float_26_recursive(buf, top_depth);
}
void helper_float_27_recursive(float* buf, int depth);
// Recursive step over a block of 2^27 floats starting at buf.
//   depth == 26: forwards to helper_float_26(buf).
//   depth == 27: processes each 2^26-float half at depth 26, then merges the
//                two halves in place, four lanes per iteration: the low half
//                receives (lo + hi) and the high half receives (lo - hi).
//   any other depth: returns without touching buf.
void helper_float_27_recursive(float* buf, int depth) {
  const int half = 1 << 26;  // 67108864 floats per half

  if (depth == 26) {
    helper_float_26(buf);
    return;
  }
  if (depth != 27) {
    return;
  }

  helper_float_27_recursive(buf, 26);
  helper_float_27_recursive(buf + half, 26);

  for (int i = 0; i < half; i += 4) {
    float* lo = buf + i;
    float* hi = lo + half;
    // The template only writes v0, v1, v16 and v17; the clobber list names
    // every vector register, which is a superset of those.
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_27(float* buf);
// Top-level entry for the depth-27 kernel: forwards buf to
// helper_float_27_recursive with the starting depth fixed at 27.
void helper_float_27(float* buf) {
  const int top_depth = 27;
  helper_float_27_recursive(buf, top_depth);
}
void helper_float_28_recursive(float* buf, int depth);
// Recursive step over a block of 2^28 floats starting at buf.
//   depth == 26: forwards to helper_float_26(buf).
//   depth == 28: processes each of the four 2^26-float quarters at depth 26,
//                then merges the quarters in place, four lanes per iteration.
//                With a, b, c, d the values loaded from quarters 0..3, the
//                stored results are
//                  quarter 0 = a + b + c + d
//                  quarter 1 = a - b + c - d
//                  quarter 2 = a + b - c - d
//                  quarter 3 = a - b - c + d
//   any other depth: returns without touching buf.
void helper_float_28_recursive(float* buf, int depth) {
  const int quarter = 1 << 26;  // 67108864 floats per quarter

  if (depth == 26) {
    helper_float_26(buf);
    return;
  }
  if (depth != 28) {
    return;
  }

  // Quarters are handled in ascending address order.
  for (int q = 0; q < 4; ++q) {
    helper_float_28_recursive(buf + q * quarter, 26);
  }

  for (int i = 0; i < quarter; i += 4) {
    float* q0 = buf + i;
    float* q1 = q0 + quarter;
    float* q2 = q1 + quarter;
    float* q3 = q2 + quarter;
    // The template only writes v0-v3 and v16-v19; the clobber list names
    // every vector register, which is a superset of those.
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(q0), "r"(q1), "r"(q2), "r"(q3)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_28(float* buf);
// Top-level entry for the depth-28 kernel: forwards buf to
// helper_float_28_recursive with the starting depth fixed at 28.
void helper_float_28(float* buf) {
  const int top_depth = 28;
  helper_float_28_recursive(buf, top_depth);
}
void helper_float_29_recursive(float* buf, int depth);
// Recursive step over a block of 2^29 floats starting at buf.
//   depth == 28: forwards to helper_float_28(buf).
//   depth == 29: processes each 2^28-float half at depth 28, then merges the
//                two halves in place, four lanes per iteration: the low half
//                receives (lo + hi) and the high half receives (lo - hi).
//   any other depth: returns without touching buf.
void helper_float_29_recursive(float* buf, int depth) {
  const int half = 1 << 28;  // 268435456 floats per half

  if (depth == 28) {
    helper_float_28(buf);
    return;
  }
  if (depth != 29) {
    return;
  }

  helper_float_29_recursive(buf, 28);
  helper_float_29_recursive(buf + half, 28);

  for (int i = 0; i < half; i += 4) {
    float* lo = buf + i;
    float* hi = lo + half;
    // The template only writes v0, v1, v16 and v17; the clobber list names
    // every vector register, which is a superset of those.
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_29(float* buf);
// Top-level entry for the depth-29 kernel: forwards buf to
// helper_float_29_recursive with the starting depth fixed at 29.
void helper_float_29(float* buf) {
  const int top_depth = 29;
  helper_float_29_recursive(buf, top_depth);
}
void helper_float_30_recursive(float* buf, int depth);
// Recursive step over a block of 2^30 floats starting at buf.
//   depth == 29: forwards to helper_float_29(buf).
//   depth == 30: processes each 2^29-float half at depth 29, then merges the
//                two halves in place, four lanes per iteration: the low half
//                receives (lo + hi) and the high half receives (lo - hi).
//   any other depth: returns without touching buf.
void helper_float_30_recursive(float* buf, int depth) {
  const int half = 1 << 29;  // 536870912 floats per half

  if (depth == 29) {
    helper_float_29(buf);
    return;
  }
  if (depth != 30) {
    return;
  }

  helper_float_30_recursive(buf, 29);
  helper_float_30_recursive(buf + half, 29);

  for (int i = 0; i < half; i += 4) {
    float* lo = buf + i;
    float* hi = lo + half;
    // The template only writes v0, v1, v16 and v17; the clobber list names
    // every vector register, which is a superset of those.
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_30(float* buf);
// Top-level entry for the depth-30 kernel: forwards buf to
// helper_float_30_recursive with the starting depth fixed at 30.
void helper_float_30(float* buf) {
  const int top_depth = 30;
  helper_float_30_recursive(buf, top_depth);
}
// Dispatches buf to the fixed-size kernel helper_float_<log_n>.
//
// buf:   data transformed in place by the selected kernel.
//        NOTE(review): presumably must hold 2^log_n floats - confirm
//        against the declaration in fht.h.
// log_n: selects the kernel; values 0 through 30 are supported.
//
// Returns 0 when log_n is supported (log_n == 0 does no work and leaves buf
// untouched) and 1 for any other value, in which case buf is not accessed.
int fht_float(float* buf, int log_n) {
  switch (log_n) {
    case 0: break;  // nothing to do
    case 1: helper_float_1(buf); break;
    case 2: helper_float_2(buf); break;
    case 3: helper_float_3(buf); break;
    case 4: helper_float_4(buf); break;
    case 5: helper_float_5(buf); break;
    case 6: helper_float_6(buf); break;
    case 7: helper_float_7(buf); break;
    case 8: helper_float_8(buf); break;
    case 9: helper_float_9(buf); break;
    case 10: helper_float_10(buf); break;
    case 11: helper_float_11(buf); break;
    case 12: helper_float_12(buf); break;
    case 13: helper_float_13(buf); break;
    case 14: helper_float_14(buf); break;
    case 15: helper_float_15(buf); break;
    case 16: helper_float_16(buf); break;
    case 17: helper_float_17(buf); break;
    case 18: helper_float_18(buf); break;
    case 19: helper_float_19(buf); break;
    case 20: helper_float_20(buf); break;
    case 21: helper_float_21(buf); break;
    case 22: helper_float_22(buf); break;
    case 23: helper_float_23(buf); break;
    case 24: helper_float_24(buf); break;
    case 25: helper_float_25(buf); break;
    case 26: helper_float_26(buf); break;
    case 27: helper_float_27(buf); break;
    case 28: helper_float_28(buf); break;
    case 29: helper_float_29(buf); break;
    case 30: helper_float_30(buf); break;
    default:
      // Unsupported size: report failure to the caller.
      return 1;
  }
  return 0;
}
3020