// xref: /aosp_15_r20/external/executorch/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
// @generated
2 #include "fht.h"
/*
 * helper_float_1 - in-place 2-point butterfly.
 *
 * Replaces (buf[0], buf[1]) with (buf[0] + buf[1], buf[0] - buf[1]).
 * No scaling is applied.
 *
 * buf: pointer to at least 2 floats; read and overwritten.
 */
static inline void helper_float_1(float* buf);
static inline void helper_float_1(float* buf) {
  /* Both loops run exactly once (j == 0, k == 0); the loop shape is kept
   * as emitted by the generator. */
  for (int j = 0; j < 2; j += 2) {
    for (int k = 0; k < 1; ++k) {
      float u = buf[j + k];
      float v = buf[j + k + 1];
      buf[j + k] = u + v;
      buf[j + k + 1] = u - v;
    }
  }
}
14 static inline void helper_float_2(float* buf);
helper_float_2(float * buf)15 static inline void helper_float_2(float* buf) {
16   for (int j = 0; j < 4; j += 4) {
17     __asm__ volatile(
18         "LD1 {v0.4S}, [%0]\n"
19         "TRN1 v16.4S, v0.4S, v0.4S\n"
20         "FNEG v17.4S, v0.4S\n"
21         "TRN2 v17.4S, v0.4S, v17.4S\n"
22         "FADD v0.4S, v16.4S, v17.4S\n"
23         "DUP v16.2D, v0.D[0]\n"
24         "FNEG v17.4S, v0.4S\n"
25         "INS v17.D[0], v0.D[1]\n"
26         "FADD v0.4S, v16.4S, v17.4S\n"
27         "ST1 {v0.4S}, [%0]\n" ::"r"(buf + j)
28         : "%v0",
29           "%v1",
30           "%v2",
31           "%v3",
32           "%v4",
33           "%v5",
34           "%v6",
35           "%v7",
36           "%v8",
37           "%v9",
38           "%v10",
39           "%v11",
40           "%v12",
41           "%v13",
42           "%v14",
43           "%v15",
44           "%v16",
45           "%v17",
46           "%v18",
47           "%v19",
48           "%v20",
49           "%v21",
50           "%v22",
51           "%v23",
52           "%v24",
53           "%v25",
54           "%v26",
55           "%v27",
56           "%v28",
57           "%v29",
58           "%v30",
59           "%v31",
60           "memory");
61   }
62 }
/*
 * helper_float_3_recursive - in-place transform of 8 floats, split by depth.
 *
 * depth == 2: applies helper_float_2 to buf[0..3] and returns.
 * depth == 3: applies the depth-2 step to buf[0..3] and buf[4..7], then
 *             combines the two halves element-wise:
 *             buf[i] <- buf[i] + buf[i+4], buf[i+4] <- buf[i] - buf[i+4].
 * Any other depth: returns without touching buf.
 *
 * buf:   pointer to at least 4 (depth 2) or 8 (depth 3) floats.
 * depth: 2 or 3.
 */
void helper_float_3_recursive(float* buf, int depth);
void helper_float_3_recursive(float* buf, int depth) {
  if (depth == 2) {
    helper_float_2(buf);
    return;
  }
  if (depth == 3) {
    helper_float_3_recursive(buf + 0, 2);
    helper_float_3_recursive(buf + 4, 2);
    /* Both loops run once (j == 0, k == 0). */
    for (int j = 0; j < 8; j += 8) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0),
            "r"(buf + j + k + 4)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
    }
    return;
  }
}
/*
 * helper_float_3 - entry point for the in-place 8-float transform.
 *
 * Calls helper_float_3_recursive(buf, 3).
 *
 * buf: pointer to at least 8 floats; read and overwritten.
 */
void helper_float_3(float* buf);
void helper_float_3(float* buf) {
  helper_float_3_recursive(buf, 3);
}
/*
 * helper_float_4_recursive - in-place transform of 16 floats, split by depth.
 *
 * depth == 3: applies helper_float_3 to buf[0..7] and returns.
 * depth == 4: applies the depth-3 step to buf[0..7] and buf[8..15], then
 *             combines the two halves element-wise:
 *             buf[i] <- buf[i] + buf[i+8], buf[i+8] <- buf[i] - buf[i+8].
 * Any other depth: returns without touching buf.
 *
 * buf:   pointer to at least 8 (depth 3) or 16 (depth 4) floats.
 * depth: 3 or 4.
 */
void helper_float_4_recursive(float* buf, int depth);
void helper_float_4_recursive(float* buf, int depth) {
  if (depth == 3) {
    helper_float_3(buf);
    return;
  }
  if (depth == 4) {
    helper_float_4_recursive(buf + 0, 3);
    helper_float_4_recursive(buf + 8, 3);
    /* Outer loop runs once (j == 0); inner loop handles k = 0 and k = 4. */
    for (int j = 0; j < 16; j += 16) {
      for (int k = 0; k < 8; k += 4) {
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0),
            "r"(buf + j + k + 8)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
    }
    return;
  }
}
/*
 * helper_float_4 - entry point for the in-place 16-float transform.
 *
 * Calls helper_float_4_recursive(buf, 4).
 *
 * buf: pointer to at least 16 floats; read and overwritten.
 */
void helper_float_4(float* buf);
void helper_float_4(float* buf) {
  helper_float_4_recursive(buf, 4);
}
/*
 * helper_float_5_recursive - in-place transform of 32 floats, split by depth.
 *
 * depth == 4: applies helper_float_4 to buf[0..15] and returns.
 * depth == 5: applies the depth-4 step to buf[0..15] and buf[16..31], then
 *             combines the two halves element-wise:
 *             buf[i] <- buf[i] + buf[i+16], buf[i+16] <- buf[i] - buf[i+16].
 * Any other depth: returns without touching buf.
 *
 * buf:   pointer to at least 16 (depth 4) or 32 (depth 5) floats.
 * depth: 4 or 5.
 */
void helper_float_5_recursive(float* buf, int depth);
void helper_float_5_recursive(float* buf, int depth) {
  if (depth == 4) {
    helper_float_4(buf);
    return;
  }
  if (depth == 5) {
    helper_float_5_recursive(buf + 0, 4);
    helper_float_5_recursive(buf + 16, 4);
    /* Outer loop runs once (j == 0); inner loop handles k = 0, 4, 8, 12. */
    for (int j = 0; j < 32; j += 32) {
      for (int k = 0; k < 16; k += 4) {
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0),
            "r"(buf + j + k + 16)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
    }
    return;
  }
}
/*
 * helper_float_5 - entry point for the in-place 32-float transform.
 *
 * Calls helper_float_5_recursive(buf, 5).
 *
 * buf: pointer to at least 32 floats; read and overwritten.
 */
void helper_float_5(float* buf);
void helper_float_5(float* buf) {
  helper_float_5_recursive(buf, 5);
}
/*
 * helper_float_6_recursive - in-place transform of 64 floats, split by depth.
 *
 * depth == 3: applies helper_float_3 to buf[0..7] and returns.
 * depth == 6: applies the depth-3 step to each of the eight 8-float
 *             sub-blocks, then combines the eight sub-blocks with three
 *             add/subtract stages across a stride of 8 floats.
 * Any other depth: returns without touching buf.
 *
 * buf:   pointer to at least 8 (depth 3) or 64 (depth 6) floats.
 * depth: 3 or 6.
 */
void helper_float_6_recursive(float* buf, int depth);
void helper_float_6_recursive(float* buf, int depth) {
  if (depth == 3) {
    helper_float_3(buf);
    return;
  }
  if (depth == 6) {
    helper_float_6_recursive(buf + 0, 3);
    helper_float_6_recursive(buf + 8, 3);
    helper_float_6_recursive(buf + 16, 3);
    helper_float_6_recursive(buf + 24, 3);
    helper_float_6_recursive(buf + 32, 3);
    helper_float_6_recursive(buf + 40, 3);
    helper_float_6_recursive(buf + 48, 3);
    helper_float_6_recursive(buf + 56, 3);
    /* Outer loop runs once (j == 0); inner loop handles k = 0 and k = 4. */
    for (int j = 0; j < 64; j += 64) {
      for (int k = 0; k < 8; k += 4) {
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "LD1 {v2.4S}, [%2]\n"
            "LD1 {v3.4S}, [%3]\n"
            "LD1 {v4.4S}, [%4]\n"
            "LD1 {v5.4S}, [%5]\n"
            "LD1 {v6.4S}, [%6]\n"
            "LD1 {v7.4S}, [%7]\n"
            /* Stage 1: neighbouring sub-blocks. */
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "FADD v18.4S, v2.4S, v3.4S\n"
            "FSUB v19.4S, v2.4S, v3.4S\n"
            "FADD v20.4S, v4.4S, v5.4S\n"
            "FSUB v21.4S, v4.4S, v5.4S\n"
            "FADD v22.4S, v6.4S, v7.4S\n"
            "FSUB v23.4S, v6.4S, v7.4S\n"
            /* Stage 2: sub-blocks two apart. */
            "FADD v0.4S, v16.4S, v18.4S\n"
            "FSUB v2.4S, v16.4S, v18.4S\n"
            "FADD v1.4S, v17.4S, v19.4S\n"
            "FSUB v3.4S, v17.4S, v19.4S\n"
            "FADD v4.4S, v20.4S, v22.4S\n"
            "FSUB v6.4S, v20.4S, v22.4S\n"
            "FADD v5.4S, v21.4S, v23.4S\n"
            "FSUB v7.4S, v21.4S, v23.4S\n"
            /* Stage 3: sub-blocks four apart. */
            "FADD v16.4S, v0.4S, v4.4S\n"
            "FSUB v20.4S, v0.4S, v4.4S\n"
            "FADD v17.4S, v1.4S, v5.4S\n"
            "FSUB v21.4S, v1.4S, v5.4S\n"
            "FADD v18.4S, v2.4S, v6.4S\n"
            "FSUB v22.4S, v2.4S, v6.4S\n"
            "FADD v19.4S, v3.4S, v7.4S\n"
            "FSUB v23.4S, v3.4S, v7.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n"
            "ST1 {v18.4S}, [%2]\n"
            "ST1 {v19.4S}, [%3]\n"
            "ST1 {v20.4S}, [%4]\n"
            "ST1 {v21.4S}, [%5]\n"
            "ST1 {v22.4S}, [%6]\n"
            "ST1 {v23.4S}, [%7]\n" ::"r"(buf + j + k + 0),
            "r"(buf + j + k + 8),
            "r"(buf + j + k + 16),
            "r"(buf + j + k + 24),
            "r"(buf + j + k + 32),
            "r"(buf + j + k + 40),
            "r"(buf + j + k + 48),
            "r"(buf + j + k + 56)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
    }
    return;
  }
}
/*
 * helper_float_6 - entry point for the in-place 64-float transform.
 *
 * Calls helper_float_6_recursive(buf, 6).
 *
 * buf: pointer to at least 64 floats; read and overwritten.
 */
void helper_float_6(float* buf);
void helper_float_6(float* buf) {
  helper_float_6_recursive(buf, 6);
}
/*
 * helper_float_7_recursive - in-place transform of 128 floats, split by depth.
 *
 * depth == 3: applies helper_float_3 to buf[0..7] and returns.
 * depth == 7: applies the depth-3 step to each of the sixteen 8-float
 *             sub-blocks, then combines the sixteen sub-blocks with four
 *             add/subtract stages across a stride of 8 floats.
 * Any other depth: returns without touching buf.
 *
 * buf:   pointer to at least 8 (depth 3) or 128 (depth 7) floats.
 * depth: 3 or 7.
 */
void helper_float_7_recursive(float* buf, int depth);
void helper_float_7_recursive(float* buf, int depth) {
  if (depth == 3) {
    helper_float_3(buf);
    return;
  }
  if (depth == 7) {
    helper_float_7_recursive(buf + 0, 3);
    helper_float_7_recursive(buf + 8, 3);
    helper_float_7_recursive(buf + 16, 3);
    helper_float_7_recursive(buf + 24, 3);
    helper_float_7_recursive(buf + 32, 3);
    helper_float_7_recursive(buf + 40, 3);
    helper_float_7_recursive(buf + 48, 3);
    helper_float_7_recursive(buf + 56, 3);
    helper_float_7_recursive(buf + 64, 3);
    helper_float_7_recursive(buf + 72, 3);
    helper_float_7_recursive(buf + 80, 3);
    helper_float_7_recursive(buf + 88, 3);
    helper_float_7_recursive(buf + 96, 3);
    helper_float_7_recursive(buf + 104, 3);
    helper_float_7_recursive(buf + 112, 3);
    helper_float_7_recursive(buf + 120, 3);
    /* Outer loop runs once (j == 0); inner loop handles k = 0 and k = 4. */
    for (int j = 0; j < 128; j += 128) {
      for (int k = 0; k < 8; k += 4) {
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "LD1 {v2.4S}, [%2]\n"
            "LD1 {v3.4S}, [%3]\n"
            "LD1 {v4.4S}, [%4]\n"
            "LD1 {v5.4S}, [%5]\n"
            "LD1 {v6.4S}, [%6]\n"
            "LD1 {v7.4S}, [%7]\n"
            "LD1 {v8.4S}, [%8]\n"
            "LD1 {v9.4S}, [%9]\n"
            "LD1 {v10.4S}, [%10]\n"
            "LD1 {v11.4S}, [%11]\n"
            "LD1 {v12.4S}, [%12]\n"
            "LD1 {v13.4S}, [%13]\n"
            "LD1 {v14.4S}, [%14]\n"
            "LD1 {v15.4S}, [%15]\n"
            /* Stage 1: neighbouring sub-blocks. */
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "FADD v18.4S, v2.4S, v3.4S\n"
            "FSUB v19.4S, v2.4S, v3.4S\n"
            "FADD v20.4S, v4.4S, v5.4S\n"
            "FSUB v21.4S, v4.4S, v5.4S\n"
            "FADD v22.4S, v6.4S, v7.4S\n"
            "FSUB v23.4S, v6.4S, v7.4S\n"
            "FADD v24.4S, v8.4S, v9.4S\n"
            "FSUB v25.4S, v8.4S, v9.4S\n"
            "FADD v26.4S, v10.4S, v11.4S\n"
            "FSUB v27.4S, v10.4S, v11.4S\n"
            "FADD v28.4S, v12.4S, v13.4S\n"
            "FSUB v29.4S, v12.4S, v13.4S\n"
            "FADD v30.4S, v14.4S, v15.4S\n"
            "FSUB v31.4S, v14.4S, v15.4S\n"
            /* Stage 2: sub-blocks two apart. */
            "FADD v0.4S, v16.4S, v18.4S\n"
            "FSUB v2.4S, v16.4S, v18.4S\n"
            "FADD v1.4S, v17.4S, v19.4S\n"
            "FSUB v3.4S, v17.4S, v19.4S\n"
            "FADD v4.4S, v20.4S, v22.4S\n"
            "FSUB v6.4S, v20.4S, v22.4S\n"
            "FADD v5.4S, v21.4S, v23.4S\n"
            "FSUB v7.4S, v21.4S, v23.4S\n"
            "FADD v8.4S, v24.4S, v26.4S\n"
            "FSUB v10.4S, v24.4S, v26.4S\n"
            "FADD v9.4S, v25.4S, v27.4S\n"
            "FSUB v11.4S, v25.4S, v27.4S\n"
            "FADD v12.4S, v28.4S, v30.4S\n"
            "FSUB v14.4S, v28.4S, v30.4S\n"
            "FADD v13.4S, v29.4S, v31.4S\n"
            "FSUB v15.4S, v29.4S, v31.4S\n"
            /* Stage 3: sub-blocks four apart. */
            "FADD v16.4S, v0.4S, v4.4S\n"
            "FSUB v20.4S, v0.4S, v4.4S\n"
            "FADD v17.4S, v1.4S, v5.4S\n"
            "FSUB v21.4S, v1.4S, v5.4S\n"
            "FADD v18.4S, v2.4S, v6.4S\n"
            "FSUB v22.4S, v2.4S, v6.4S\n"
            "FADD v19.4S, v3.4S, v7.4S\n"
            "FSUB v23.4S, v3.4S, v7.4S\n"
            "FADD v24.4S, v8.4S, v12.4S\n"
            "FSUB v28.4S, v8.4S, v12.4S\n"
            "FADD v25.4S, v9.4S, v13.4S\n"
            "FSUB v29.4S, v9.4S, v13.4S\n"
            "FADD v26.4S, v10.4S, v14.4S\n"
            "FSUB v30.4S, v10.4S, v14.4S\n"
            "FADD v27.4S, v11.4S, v15.4S\n"
            "FSUB v31.4S, v11.4S, v15.4S\n"
            /* Stage 4: sub-blocks eight apart. */
            "FADD v0.4S, v16.4S, v24.4S\n"
            "FSUB v8.4S, v16.4S, v24.4S\n"
            "FADD v1.4S, v17.4S, v25.4S\n"
            "FSUB v9.4S, v17.4S, v25.4S\n"
            "FADD v2.4S, v18.4S, v26.4S\n"
            "FSUB v10.4S, v18.4S, v26.4S\n"
            "FADD v3.4S, v19.4S, v27.4S\n"
            "FSUB v11.4S, v19.4S, v27.4S\n"
            "FADD v4.4S, v20.4S, v28.4S\n"
            "FSUB v12.4S, v20.4S, v28.4S\n"
            "FADD v5.4S, v21.4S, v29.4S\n"
            "FSUB v13.4S, v21.4S, v29.4S\n"
            "FADD v6.4S, v22.4S, v30.4S\n"
            "FSUB v14.4S, v22.4S, v30.4S\n"
            "FADD v7.4S, v23.4S, v31.4S\n"
            "FSUB v15.4S, v23.4S, v31.4S\n"
            "ST1 {v0.4S}, [%0]\n"
            "ST1 {v1.4S}, [%1]\n"
            "ST1 {v2.4S}, [%2]\n"
            "ST1 {v3.4S}, [%3]\n"
            "ST1 {v4.4S}, [%4]\n"
            "ST1 {v5.4S}, [%5]\n"
            "ST1 {v6.4S}, [%6]\n"
            "ST1 {v7.4S}, [%7]\n"
            "ST1 {v8.4S}, [%8]\n"
            "ST1 {v9.4S}, [%9]\n"
            "ST1 {v10.4S}, [%10]\n"
            "ST1 {v11.4S}, [%11]\n"
            "ST1 {v12.4S}, [%12]\n"
            "ST1 {v13.4S}, [%13]\n"
            "ST1 {v14.4S}, [%14]\n"
            "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
            "r"(buf + j + k + 8),
            "r"(buf + j + k + 16),
            "r"(buf + j + k + 24),
            "r"(buf + j + k + 32),
            "r"(buf + j + k + 40),
            "r"(buf + j + k + 48),
            "r"(buf + j + k + 56),
            "r"(buf + j + k + 64),
            "r"(buf + j + k + 72),
            "r"(buf + j + k + 80),
            "r"(buf + j + k + 88),
            "r"(buf + j + k + 96),
            "r"(buf + j + k + 104),
            "r"(buf + j + k + 112),
            "r"(buf + j + k + 120)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
    }
    return;
  }
}
/*
 * helper_float_7 - entry point for the in-place 128-float transform.
 *
 * Calls helper_float_7_recursive(buf, 7).
 *
 * buf: pointer to at least 128 floats; read and overwritten.
 */
void helper_float_7(float* buf);
void helper_float_7(float* buf) {
  helper_float_7_recursive(buf, 7);
}
532 static inline void helper_float_8(float* buf);
helper_float_8(float * buf)533 static inline void helper_float_8(float* buf) {
534   for (int j = 0; j < 256; j += 64) {
535     for (int k = 0; k < 4; k += 4) {
536       __asm__ volatile(
537           "LD1 {v0.4S}, [%0]\n"
538           "LD1 {v1.4S}, [%1]\n"
539           "LD1 {v2.4S}, [%2]\n"
540           "LD1 {v3.4S}, [%3]\n"
541           "LD1 {v4.4S}, [%4]\n"
542           "LD1 {v5.4S}, [%5]\n"
543           "LD1 {v6.4S}, [%6]\n"
544           "LD1 {v7.4S}, [%7]\n"
545           "LD1 {v8.4S}, [%8]\n"
546           "LD1 {v9.4S}, [%9]\n"
547           "LD1 {v10.4S}, [%10]\n"
548           "LD1 {v11.4S}, [%11]\n"
549           "LD1 {v12.4S}, [%12]\n"
550           "LD1 {v13.4S}, [%13]\n"
551           "LD1 {v14.4S}, [%14]\n"
552           "LD1 {v15.4S}, [%15]\n"
553           "TRN1 v16.4S, v0.4S, v0.4S\n"
554           "FNEG v17.4S, v0.4S\n"
555           "TRN2 v17.4S, v0.4S, v17.4S\n"
556           "FADD v0.4S, v16.4S, v17.4S\n"
557           "TRN1 v16.4S, v1.4S, v1.4S\n"
558           "FNEG v17.4S, v1.4S\n"
559           "TRN2 v17.4S, v1.4S, v17.4S\n"
560           "FADD v1.4S, v16.4S, v17.4S\n"
561           "TRN1 v16.4S, v2.4S, v2.4S\n"
562           "FNEG v17.4S, v2.4S\n"
563           "TRN2 v17.4S, v2.4S, v17.4S\n"
564           "FADD v2.4S, v16.4S, v17.4S\n"
565           "TRN1 v16.4S, v3.4S, v3.4S\n"
566           "FNEG v17.4S, v3.4S\n"
567           "TRN2 v17.4S, v3.4S, v17.4S\n"
568           "FADD v3.4S, v16.4S, v17.4S\n"
569           "TRN1 v16.4S, v4.4S, v4.4S\n"
570           "FNEG v17.4S, v4.4S\n"
571           "TRN2 v17.4S, v4.4S, v17.4S\n"
572           "FADD v4.4S, v16.4S, v17.4S\n"
573           "TRN1 v16.4S, v5.4S, v5.4S\n"
574           "FNEG v17.4S, v5.4S\n"
575           "TRN2 v17.4S, v5.4S, v17.4S\n"
576           "FADD v5.4S, v16.4S, v17.4S\n"
577           "TRN1 v16.4S, v6.4S, v6.4S\n"
578           "FNEG v17.4S, v6.4S\n"
579           "TRN2 v17.4S, v6.4S, v17.4S\n"
580           "FADD v6.4S, v16.4S, v17.4S\n"
581           "TRN1 v16.4S, v7.4S, v7.4S\n"
582           "FNEG v17.4S, v7.4S\n"
583           "TRN2 v17.4S, v7.4S, v17.4S\n"
584           "FADD v7.4S, v16.4S, v17.4S\n"
585           "TRN1 v16.4S, v8.4S, v8.4S\n"
586           "FNEG v17.4S, v8.4S\n"
587           "TRN2 v17.4S, v8.4S, v17.4S\n"
588           "FADD v8.4S, v16.4S, v17.4S\n"
589           "TRN1 v16.4S, v9.4S, v9.4S\n"
590           "FNEG v17.4S, v9.4S\n"
591           "TRN2 v17.4S, v9.4S, v17.4S\n"
592           "FADD v9.4S, v16.4S, v17.4S\n"
593           "TRN1 v16.4S, v10.4S, v10.4S\n"
594           "FNEG v17.4S, v10.4S\n"
595           "TRN2 v17.4S, v10.4S, v17.4S\n"
596           "FADD v10.4S, v16.4S, v17.4S\n"
597           "TRN1 v16.4S, v11.4S, v11.4S\n"
598           "FNEG v17.4S, v11.4S\n"
599           "TRN2 v17.4S, v11.4S, v17.4S\n"
600           "FADD v11.4S, v16.4S, v17.4S\n"
601           "TRN1 v16.4S, v12.4S, v12.4S\n"
602           "FNEG v17.4S, v12.4S\n"
603           "TRN2 v17.4S, v12.4S, v17.4S\n"
604           "FADD v12.4S, v16.4S, v17.4S\n"
605           "TRN1 v16.4S, v13.4S, v13.4S\n"
606           "FNEG v17.4S, v13.4S\n"
607           "TRN2 v17.4S, v13.4S, v17.4S\n"
608           "FADD v13.4S, v16.4S, v17.4S\n"
609           "TRN1 v16.4S, v14.4S, v14.4S\n"
610           "FNEG v17.4S, v14.4S\n"
611           "TRN2 v17.4S, v14.4S, v17.4S\n"
612           "FADD v14.4S, v16.4S, v17.4S\n"
613           "TRN1 v16.4S, v15.4S, v15.4S\n"
614           "FNEG v17.4S, v15.4S\n"
615           "TRN2 v17.4S, v15.4S, v17.4S\n"
616           "FADD v15.4S, v16.4S, v17.4S\n"
617           "DUP v16.2D, v0.D[0]\n"
618           "FNEG v17.4S, v0.4S\n"
619           "INS v17.D[0], v0.D[1]\n"
620           "FADD v0.4S, v16.4S, v17.4S\n"
621           "DUP v16.2D, v1.D[0]\n"
622           "FNEG v17.4S, v1.4S\n"
623           "INS v17.D[0], v1.D[1]\n"
624           "FADD v1.4S, v16.4S, v17.4S\n"
625           "DUP v16.2D, v2.D[0]\n"
626           "FNEG v17.4S, v2.4S\n"
627           "INS v17.D[0], v2.D[1]\n"
628           "FADD v2.4S, v16.4S, v17.4S\n"
629           "DUP v16.2D, v3.D[0]\n"
630           "FNEG v17.4S, v3.4S\n"
631           "INS v17.D[0], v3.D[1]\n"
632           "FADD v3.4S, v16.4S, v17.4S\n"
633           "DUP v16.2D, v4.D[0]\n"
634           "FNEG v17.4S, v4.4S\n"
635           "INS v17.D[0], v4.D[1]\n"
636           "FADD v4.4S, v16.4S, v17.4S\n"
637           "DUP v16.2D, v5.D[0]\n"
638           "FNEG v17.4S, v5.4S\n"
639           "INS v17.D[0], v5.D[1]\n"
640           "FADD v5.4S, v16.4S, v17.4S\n"
641           "DUP v16.2D, v6.D[0]\n"
642           "FNEG v17.4S, v6.4S\n"
643           "INS v17.D[0], v6.D[1]\n"
644           "FADD v6.4S, v16.4S, v17.4S\n"
645           "DUP v16.2D, v7.D[0]\n"
646           "FNEG v17.4S, v7.4S\n"
647           "INS v17.D[0], v7.D[1]\n"
648           "FADD v7.4S, v16.4S, v17.4S\n"
649           "DUP v16.2D, v8.D[0]\n"
650           "FNEG v17.4S, v8.4S\n"
651           "INS v17.D[0], v8.D[1]\n"
652           "FADD v8.4S, v16.4S, v17.4S\n"
653           "DUP v16.2D, v9.D[0]\n"
654           "FNEG v17.4S, v9.4S\n"
655           "INS v17.D[0], v9.D[1]\n"
656           "FADD v9.4S, v16.4S, v17.4S\n"
657           "DUP v16.2D, v10.D[0]\n"
658           "FNEG v17.4S, v10.4S\n"
659           "INS v17.D[0], v10.D[1]\n"
660           "FADD v10.4S, v16.4S, v17.4S\n"
661           "DUP v16.2D, v11.D[0]\n"
662           "FNEG v17.4S, v11.4S\n"
663           "INS v17.D[0], v11.D[1]\n"
664           "FADD v11.4S, v16.4S, v17.4S\n"
665           "DUP v16.2D, v12.D[0]\n"
666           "FNEG v17.4S, v12.4S\n"
667           "INS v17.D[0], v12.D[1]\n"
668           "FADD v12.4S, v16.4S, v17.4S\n"
669           "DUP v16.2D, v13.D[0]\n"
670           "FNEG v17.4S, v13.4S\n"
671           "INS v17.D[0], v13.D[1]\n"
672           "FADD v13.4S, v16.4S, v17.4S\n"
673           "DUP v16.2D, v14.D[0]\n"
674           "FNEG v17.4S, v14.4S\n"
675           "INS v17.D[0], v14.D[1]\n"
676           "FADD v14.4S, v16.4S, v17.4S\n"
677           "DUP v16.2D, v15.D[0]\n"
678           "FNEG v17.4S, v15.4S\n"
679           "INS v17.D[0], v15.D[1]\n"
680           "FADD v15.4S, v16.4S, v17.4S\n"
681           "FADD v16.4S, v0.4S, v1.4S\n"
682           "FSUB v17.4S, v0.4S, v1.4S\n"
683           "FADD v18.4S, v2.4S, v3.4S\n"
684           "FSUB v19.4S, v2.4S, v3.4S\n"
685           "FADD v20.4S, v4.4S, v5.4S\n"
686           "FSUB v21.4S, v4.4S, v5.4S\n"
687           "FADD v22.4S, v6.4S, v7.4S\n"
688           "FSUB v23.4S, v6.4S, v7.4S\n"
689           "FADD v24.4S, v8.4S, v9.4S\n"
690           "FSUB v25.4S, v8.4S, v9.4S\n"
691           "FADD v26.4S, v10.4S, v11.4S\n"
692           "FSUB v27.4S, v10.4S, v11.4S\n"
693           "FADD v28.4S, v12.4S, v13.4S\n"
694           "FSUB v29.4S, v12.4S, v13.4S\n"
695           "FADD v30.4S, v14.4S, v15.4S\n"
696           "FSUB v31.4S, v14.4S, v15.4S\n"
697           "FADD v0.4S, v16.4S, v18.4S\n"
698           "FSUB v2.4S, v16.4S, v18.4S\n"
699           "FADD v1.4S, v17.4S, v19.4S\n"
700           "FSUB v3.4S, v17.4S, v19.4S\n"
701           "FADD v4.4S, v20.4S, v22.4S\n"
702           "FSUB v6.4S, v20.4S, v22.4S\n"
703           "FADD v5.4S, v21.4S, v23.4S\n"
704           "FSUB v7.4S, v21.4S, v23.4S\n"
705           "FADD v8.4S, v24.4S, v26.4S\n"
706           "FSUB v10.4S, v24.4S, v26.4S\n"
707           "FADD v9.4S, v25.4S, v27.4S\n"
708           "FSUB v11.4S, v25.4S, v27.4S\n"
709           "FADD v12.4S, v28.4S, v30.4S\n"
710           "FSUB v14.4S, v28.4S, v30.4S\n"
711           "FADD v13.4S, v29.4S, v31.4S\n"
712           "FSUB v15.4S, v29.4S, v31.4S\n"
713           "FADD v16.4S, v0.4S, v4.4S\n"
714           "FSUB v20.4S, v0.4S, v4.4S\n"
715           "FADD v17.4S, v1.4S, v5.4S\n"
716           "FSUB v21.4S, v1.4S, v5.4S\n"
717           "FADD v18.4S, v2.4S, v6.4S\n"
718           "FSUB v22.4S, v2.4S, v6.4S\n"
719           "FADD v19.4S, v3.4S, v7.4S\n"
720           "FSUB v23.4S, v3.4S, v7.4S\n"
721           "FADD v24.4S, v8.4S, v12.4S\n"
722           "FSUB v28.4S, v8.4S, v12.4S\n"
723           "FADD v25.4S, v9.4S, v13.4S\n"
724           "FSUB v29.4S, v9.4S, v13.4S\n"
725           "FADD v26.4S, v10.4S, v14.4S\n"
726           "FSUB v30.4S, v10.4S, v14.4S\n"
727           "FADD v27.4S, v11.4S, v15.4S\n"
728           "FSUB v31.4S, v11.4S, v15.4S\n"
729           "FADD v0.4S, v16.4S, v24.4S\n"
730           "FSUB v8.4S, v16.4S, v24.4S\n"
731           "FADD v1.4S, v17.4S, v25.4S\n"
732           "FSUB v9.4S, v17.4S, v25.4S\n"
733           "FADD v2.4S, v18.4S, v26.4S\n"
734           "FSUB v10.4S, v18.4S, v26.4S\n"
735           "FADD v3.4S, v19.4S, v27.4S\n"
736           "FSUB v11.4S, v19.4S, v27.4S\n"
737           "FADD v4.4S, v20.4S, v28.4S\n"
738           "FSUB v12.4S, v20.4S, v28.4S\n"
739           "FADD v5.4S, v21.4S, v29.4S\n"
740           "FSUB v13.4S, v21.4S, v29.4S\n"
741           "FADD v6.4S, v22.4S, v30.4S\n"
742           "FSUB v14.4S, v22.4S, v30.4S\n"
743           "FADD v7.4S, v23.4S, v31.4S\n"
744           "FSUB v15.4S, v23.4S, v31.4S\n"
745           "ST1 {v0.4S}, [%0]\n"
746           "ST1 {v1.4S}, [%1]\n"
747           "ST1 {v2.4S}, [%2]\n"
748           "ST1 {v3.4S}, [%3]\n"
749           "ST1 {v4.4S}, [%4]\n"
750           "ST1 {v5.4S}, [%5]\n"
751           "ST1 {v6.4S}, [%6]\n"
752           "ST1 {v7.4S}, [%7]\n"
753           "ST1 {v8.4S}, [%8]\n"
754           "ST1 {v9.4S}, [%9]\n"
755           "ST1 {v10.4S}, [%10]\n"
756           "ST1 {v11.4S}, [%11]\n"
757           "ST1 {v12.4S}, [%12]\n"
758           "ST1 {v13.4S}, [%13]\n"
759           "ST1 {v14.4S}, [%14]\n"
760           "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
761           "r"(buf + j + k + 4),
762           "r"(buf + j + k + 8),
763           "r"(buf + j + k + 12),
764           "r"(buf + j + k + 16),
765           "r"(buf + j + k + 20),
766           "r"(buf + j + k + 24),
767           "r"(buf + j + k + 28),
768           "r"(buf + j + k + 32),
769           "r"(buf + j + k + 36),
770           "r"(buf + j + k + 40),
771           "r"(buf + j + k + 44),
772           "r"(buf + j + k + 48),
773           "r"(buf + j + k + 52),
774           "r"(buf + j + k + 56),
775           "r"(buf + j + k + 60)
776           : "%v0",
777             "%v1",
778             "%v2",
779             "%v3",
780             "%v4",
781             "%v5",
782             "%v6",
783             "%v7",
784             "%v8",
785             "%v9",
786             "%v10",
787             "%v11",
788             "%v12",
789             "%v13",
790             "%v14",
791             "%v15",
792             "%v16",
793             "%v17",
794             "%v18",
795             "%v19",
796             "%v20",
797             "%v21",
798             "%v22",
799             "%v23",
800             "%v24",
801             "%v25",
802             "%v26",
803             "%v27",
804             "%v28",
805             "%v29",
806             "%v30",
807             "%v31",
808             "memory");
809     }
810   }
811   for (int j = 0; j < 256; j += 256) {
812     for (int k = 0; k < 64; k += 4) {
813       __asm__ volatile(
814           "LD1 {v0.4S}, [%0]\n"
815           "LD1 {v1.4S}, [%1]\n"
816           "LD1 {v2.4S}, [%2]\n"
817           "LD1 {v3.4S}, [%3]\n"
818           "FADD v16.4S, v0.4S, v1.4S\n"
819           "FSUB v17.4S, v0.4S, v1.4S\n"
820           "FADD v18.4S, v2.4S, v3.4S\n"
821           "FSUB v19.4S, v2.4S, v3.4S\n"
822           "FADD v0.4S, v16.4S, v18.4S\n"
823           "FSUB v2.4S, v16.4S, v18.4S\n"
824           "FADD v1.4S, v17.4S, v19.4S\n"
825           "FSUB v3.4S, v17.4S, v19.4S\n"
826           "ST1 {v0.4S}, [%0]\n"
827           "ST1 {v1.4S}, [%1]\n"
828           "ST1 {v2.4S}, [%2]\n"
829           "ST1 {v3.4S}, [%3]\n" ::"r"(buf + j + k + 0),
830           "r"(buf + j + k + 64),
831           "r"(buf + j + k + 128),
832           "r"(buf + j + k + 192)
833           : "%v0",
834             "%v1",
835             "%v2",
836             "%v3",
837             "%v4",
838             "%v5",
839             "%v6",
840             "%v7",
841             "%v8",
842             "%v9",
843             "%v10",
844             "%v11",
845             "%v12",
846             "%v13",
847             "%v14",
848             "%v15",
849             "%v16",
850             "%v17",
851             "%v18",
852             "%v19",
853             "%v20",
854             "%v21",
855             "%v22",
856             "%v23",
857             "%v24",
858             "%v25",
859             "%v26",
860             "%v27",
861             "%v28",
862             "%v29",
863             "%v30",
864             "%v31",
865             "memory");
866     }
867   }
868 }
/*
 * helper_float_9_recursive - in-place transform of 512 floats, split by depth.
 *
 * depth == 8: applies helper_float_8 to buf[0..255] and returns.
 * depth == 9: applies the depth-8 step to buf[0..255] and buf[256..511],
 *             then combines the two halves element-wise:
 *             buf[i] <- buf[i] + buf[i+256], buf[i+256] <- buf[i] - buf[i+256].
 * Any other depth: returns without touching buf.
 *
 * buf:   pointer to at least 256 (depth 8) or 512 (depth 9) floats.
 * depth: 8 or 9.
 */
void helper_float_9_recursive(float* buf, int depth);
void helper_float_9_recursive(float* buf, int depth) {
  if (depth == 8) {
    helper_float_8(buf);
    return;
  }
  if (depth == 9) {
    helper_float_9_recursive(buf + 0, 8);
    helper_float_9_recursive(buf + 256, 8);
    /* Outer loop runs once (j == 0); inner loop walks k = 0, 4, ..., 252. */
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile(
            "LD1 {v0.4S}, [%0]\n"
            "LD1 {v1.4S}, [%1]\n"
            "FADD v16.4S, v0.4S, v1.4S\n"
            "FSUB v17.4S, v0.4S, v1.4S\n"
            "ST1 {v16.4S}, [%0]\n"
            "ST1 {v17.4S}, [%1]\n" ::"r"(buf + j + k + 0),
            "r"(buf + j + k + 256)
            : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
              "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
              "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
              "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
              "memory");
      }
    }
    return;
  }
}
// Entry point for the depth-9 transform: forwards `buf` to
// helper_float_9_recursive() with depth 9. That routine reads and writes
// buf[0..511] in place.
void helper_float_9(float* buf);
void helper_float_9(float* buf) {
  enum { kTopDepth = 9 };
  helper_float_9_recursive(buf, kTopDepth);
}
// Depth-dispatched step over the floats starting at `buf`.
//   depth == 8  : calls helper_float_8(buf) and returns.
//   depth == 10 : runs the depth-8 step on each of the four 256-float
//                 quarters of buf[0..1023] (in ascending order), then
//                 combines the quarters four floats at a time. With a, b, c,
//                 d the values loaded from quarters 0..3, the stores are
//                   quarter 0 <- (a + b) + (c + d)
//                   quarter 1 <- (a - b) + (c - d)
//                   quarter 2 <- (a + b) - (c + d)
//                   quarter 3 <- (a - b) - (c - d)
//   otherwise   : does nothing.
// The combine step is AArch64 NEON inline assembly; the clobber list names
// every vector register even though only v0-v3 and v16-v19 are written.
void helper_float_10_recursive(float* buf, int depth);
void helper_float_10_recursive(float* buf, int depth) {
  if (depth == 8) {
    helper_float_8(buf);
    return;
  }
  if (depth != 10) {
    return;
  }

  for (int quarter = 0; quarter < 4; ++quarter) {
    helper_float_10_recursive(buf + 256 * quarter, 8);
  }

  for (int off = 0; off < 256; off += 4) {
    float* q0 = buf + off;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        : /* no outputs: results are stored through the input pointers */
        : "r"(q0), "r"(q0 + 256), "r"(q0 + 512), "r"(q0 + 768)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
// Entry point for the depth-10 transform: forwards `buf` to
// helper_float_10_recursive() with depth 10. That routine reads and writes
// buf[0..1023] in place.
void helper_float_10(float* buf);
void helper_float_10(float* buf) {
  enum { kTopDepth = 10 };
  helper_float_10_recursive(buf, kTopDepth);
}
// Depth-dispatched step over the floats starting at `buf`.
//   depth == 10 : calls helper_float_10(buf) and returns.
//   depth == 11 : runs the depth-10 step on buf[0..1023] and on
//                 buf[1024..2047], then walks both halves four floats at a
//                 time, writing lower + upper back to the lower half and
//                 lower - upper back to the upper half (in place).
//   otherwise   : does nothing.
// The combine step is AArch64 NEON inline assembly. Only v0, v1, v16 and v17
// are written by it, but the clobber list names every vector register.
void helper_float_11_recursive(float* buf, int depth);
void helper_float_11_recursive(float* buf, int depth) {
  switch (depth) {
    case 10:
      helper_float_10(buf);
      return;
    case 11:
      break;
    default:
      return;
  }

  // Transform each 1024-float half independently before combining them.
  helper_float_11_recursive(buf, 10);
  helper_float_11_recursive(buf + 1024, 10);

  for (int off = 0; off < 1024; off += 4) {
    float* lower = buf + off;
    float* upper = lower + 1024;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        : /* no outputs: results are stored through the input pointers */
        : "r"(lower), "r"(upper)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
// Entry point for the depth-11 transform: forwards `buf` to
// helper_float_11_recursive() with depth 11. That routine reads and writes
// buf[0..2047] in place.
void helper_float_11(float* buf);
void helper_float_11(float* buf) {
  enum { kTopDepth = 11 };
  helper_float_11_recursive(buf, kTopDepth);
}
// Depth-dispatched step over the floats starting at `buf`.
//   depth == 10 : calls helper_float_10(buf) and returns.
//   depth == 12 : runs the depth-10 step on each of the four 1024-float
//                 quarters of buf[0..4095] (in ascending order), then
//                 combines the quarters four floats at a time. With a, b, c,
//                 d the values loaded from quarters 0..3, the stores are
//                   quarter 0 <- (a + b) + (c + d)
//                   quarter 1 <- (a - b) + (c - d)
//                   quarter 2 <- (a + b) - (c + d)
//                   quarter 3 <- (a - b) - (c - d)
//   otherwise   : does nothing.
// The combine step is AArch64 NEON inline assembly; the clobber list names
// every vector register even though only v0-v3 and v16-v19 are written.
void helper_float_12_recursive(float* buf, int depth);
void helper_float_12_recursive(float* buf, int depth) {
  if (depth == 10) {
    helper_float_10(buf);
    return;
  }
  if (depth != 12) {
    return;
  }

  for (int quarter = 0; quarter < 4; ++quarter) {
    helper_float_12_recursive(buf + 1024 * quarter, 10);
  }

  for (int off = 0; off < 1024; off += 4) {
    float* q0 = buf + off;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        : /* no outputs: results are stored through the input pointers */
        : "r"(q0), "r"(q0 + 1024), "r"(q0 + 2048), "r"(q0 + 3072)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
// Entry point for the depth-12 transform: forwards `buf` to
// helper_float_12_recursive() with depth 12. That routine reads and writes
// buf[0..4095] in place.
void helper_float_12(float* buf);
void helper_float_12(float* buf) {
  enum { kTopDepth = 12 };
  helper_float_12_recursive(buf, kTopDepth);
}
1141 static inline void helper_float_13(float* buf);
helper_float_13(float * buf)1142 static inline void helper_float_13(float* buf) {
1143   for (int j = 0; j < 8192; j += 64) {
1144     for (int k = 0; k < 4; k += 4) {
1145       __asm__ volatile(
1146           "LD1 {v0.4S}, [%0]\n"
1147           "LD1 {v1.4S}, [%1]\n"
1148           "LD1 {v2.4S}, [%2]\n"
1149           "LD1 {v3.4S}, [%3]\n"
1150           "LD1 {v4.4S}, [%4]\n"
1151           "LD1 {v5.4S}, [%5]\n"
1152           "LD1 {v6.4S}, [%6]\n"
1153           "LD1 {v7.4S}, [%7]\n"
1154           "LD1 {v8.4S}, [%8]\n"
1155           "LD1 {v9.4S}, [%9]\n"
1156           "LD1 {v10.4S}, [%10]\n"
1157           "LD1 {v11.4S}, [%11]\n"
1158           "LD1 {v12.4S}, [%12]\n"
1159           "LD1 {v13.4S}, [%13]\n"
1160           "LD1 {v14.4S}, [%14]\n"
1161           "LD1 {v15.4S}, [%15]\n"
1162           "TRN1 v16.4S, v0.4S, v0.4S\n"
1163           "FNEG v17.4S, v0.4S\n"
1164           "TRN2 v17.4S, v0.4S, v17.4S\n"
1165           "FADD v0.4S, v16.4S, v17.4S\n"
1166           "TRN1 v16.4S, v1.4S, v1.4S\n"
1167           "FNEG v17.4S, v1.4S\n"
1168           "TRN2 v17.4S, v1.4S, v17.4S\n"
1169           "FADD v1.4S, v16.4S, v17.4S\n"
1170           "TRN1 v16.4S, v2.4S, v2.4S\n"
1171           "FNEG v17.4S, v2.4S\n"
1172           "TRN2 v17.4S, v2.4S, v17.4S\n"
1173           "FADD v2.4S, v16.4S, v17.4S\n"
1174           "TRN1 v16.4S, v3.4S, v3.4S\n"
1175           "FNEG v17.4S, v3.4S\n"
1176           "TRN2 v17.4S, v3.4S, v17.4S\n"
1177           "FADD v3.4S, v16.4S, v17.4S\n"
1178           "TRN1 v16.4S, v4.4S, v4.4S\n"
1179           "FNEG v17.4S, v4.4S\n"
1180           "TRN2 v17.4S, v4.4S, v17.4S\n"
1181           "FADD v4.4S, v16.4S, v17.4S\n"
1182           "TRN1 v16.4S, v5.4S, v5.4S\n"
1183           "FNEG v17.4S, v5.4S\n"
1184           "TRN2 v17.4S, v5.4S, v17.4S\n"
1185           "FADD v5.4S, v16.4S, v17.4S\n"
1186           "TRN1 v16.4S, v6.4S, v6.4S\n"
1187           "FNEG v17.4S, v6.4S\n"
1188           "TRN2 v17.4S, v6.4S, v17.4S\n"
1189           "FADD v6.4S, v16.4S, v17.4S\n"
1190           "TRN1 v16.4S, v7.4S, v7.4S\n"
1191           "FNEG v17.4S, v7.4S\n"
1192           "TRN2 v17.4S, v7.4S, v17.4S\n"
1193           "FADD v7.4S, v16.4S, v17.4S\n"
1194           "TRN1 v16.4S, v8.4S, v8.4S\n"
1195           "FNEG v17.4S, v8.4S\n"
1196           "TRN2 v17.4S, v8.4S, v17.4S\n"
1197           "FADD v8.4S, v16.4S, v17.4S\n"
1198           "TRN1 v16.4S, v9.4S, v9.4S\n"
1199           "FNEG v17.4S, v9.4S\n"
1200           "TRN2 v17.4S, v9.4S, v17.4S\n"
1201           "FADD v9.4S, v16.4S, v17.4S\n"
1202           "TRN1 v16.4S, v10.4S, v10.4S\n"
1203           "FNEG v17.4S, v10.4S\n"
1204           "TRN2 v17.4S, v10.4S, v17.4S\n"
1205           "FADD v10.4S, v16.4S, v17.4S\n"
1206           "TRN1 v16.4S, v11.4S, v11.4S\n"
1207           "FNEG v17.4S, v11.4S\n"
1208           "TRN2 v17.4S, v11.4S, v17.4S\n"
1209           "FADD v11.4S, v16.4S, v17.4S\n"
1210           "TRN1 v16.4S, v12.4S, v12.4S\n"
1211           "FNEG v17.4S, v12.4S\n"
1212           "TRN2 v17.4S, v12.4S, v17.4S\n"
1213           "FADD v12.4S, v16.4S, v17.4S\n"
1214           "TRN1 v16.4S, v13.4S, v13.4S\n"
1215           "FNEG v17.4S, v13.4S\n"
1216           "TRN2 v17.4S, v13.4S, v17.4S\n"
1217           "FADD v13.4S, v16.4S, v17.4S\n"
1218           "TRN1 v16.4S, v14.4S, v14.4S\n"
1219           "FNEG v17.4S, v14.4S\n"
1220           "TRN2 v17.4S, v14.4S, v17.4S\n"
1221           "FADD v14.4S, v16.4S, v17.4S\n"
1222           "TRN1 v16.4S, v15.4S, v15.4S\n"
1223           "FNEG v17.4S, v15.4S\n"
1224           "TRN2 v17.4S, v15.4S, v17.4S\n"
1225           "FADD v15.4S, v16.4S, v17.4S\n"
1226           "DUP v16.2D, v0.D[0]\n"
1227           "FNEG v17.4S, v0.4S\n"
1228           "INS v17.D[0], v0.D[1]\n"
1229           "FADD v0.4S, v16.4S, v17.4S\n"
1230           "DUP v16.2D, v1.D[0]\n"
1231           "FNEG v17.4S, v1.4S\n"
1232           "INS v17.D[0], v1.D[1]\n"
1233           "FADD v1.4S, v16.4S, v17.4S\n"
1234           "DUP v16.2D, v2.D[0]\n"
1235           "FNEG v17.4S, v2.4S\n"
1236           "INS v17.D[0], v2.D[1]\n"
1237           "FADD v2.4S, v16.4S, v17.4S\n"
1238           "DUP v16.2D, v3.D[0]\n"
1239           "FNEG v17.4S, v3.4S\n"
1240           "INS v17.D[0], v3.D[1]\n"
1241           "FADD v3.4S, v16.4S, v17.4S\n"
1242           "DUP v16.2D, v4.D[0]\n"
1243           "FNEG v17.4S, v4.4S\n"
1244           "INS v17.D[0], v4.D[1]\n"
1245           "FADD v4.4S, v16.4S, v17.4S\n"
1246           "DUP v16.2D, v5.D[0]\n"
1247           "FNEG v17.4S, v5.4S\n"
1248           "INS v17.D[0], v5.D[1]\n"
1249           "FADD v5.4S, v16.4S, v17.4S\n"
1250           "DUP v16.2D, v6.D[0]\n"
1251           "FNEG v17.4S, v6.4S\n"
1252           "INS v17.D[0], v6.D[1]\n"
1253           "FADD v6.4S, v16.4S, v17.4S\n"
1254           "DUP v16.2D, v7.D[0]\n"
1255           "FNEG v17.4S, v7.4S\n"
1256           "INS v17.D[0], v7.D[1]\n"
1257           "FADD v7.4S, v16.4S, v17.4S\n"
1258           "DUP v16.2D, v8.D[0]\n"
1259           "FNEG v17.4S, v8.4S\n"
1260           "INS v17.D[0], v8.D[1]\n"
1261           "FADD v8.4S, v16.4S, v17.4S\n"
1262           "DUP v16.2D, v9.D[0]\n"
1263           "FNEG v17.4S, v9.4S\n"
1264           "INS v17.D[0], v9.D[1]\n"
1265           "FADD v9.4S, v16.4S, v17.4S\n"
1266           "DUP v16.2D, v10.D[0]\n"
1267           "FNEG v17.4S, v10.4S\n"
1268           "INS v17.D[0], v10.D[1]\n"
1269           "FADD v10.4S, v16.4S, v17.4S\n"
1270           "DUP v16.2D, v11.D[0]\n"
1271           "FNEG v17.4S, v11.4S\n"
1272           "INS v17.D[0], v11.D[1]\n"
1273           "FADD v11.4S, v16.4S, v17.4S\n"
1274           "DUP v16.2D, v12.D[0]\n"
1275           "FNEG v17.4S, v12.4S\n"
1276           "INS v17.D[0], v12.D[1]\n"
1277           "FADD v12.4S, v16.4S, v17.4S\n"
1278           "DUP v16.2D, v13.D[0]\n"
1279           "FNEG v17.4S, v13.4S\n"
1280           "INS v17.D[0], v13.D[1]\n"
1281           "FADD v13.4S, v16.4S, v17.4S\n"
1282           "DUP v16.2D, v14.D[0]\n"
1283           "FNEG v17.4S, v14.4S\n"
1284           "INS v17.D[0], v14.D[1]\n"
1285           "FADD v14.4S, v16.4S, v17.4S\n"
1286           "DUP v16.2D, v15.D[0]\n"
1287           "FNEG v17.4S, v15.4S\n"
1288           "INS v17.D[0], v15.D[1]\n"
1289           "FADD v15.4S, v16.4S, v17.4S\n"
1290           "FADD v16.4S, v0.4S, v1.4S\n"
1291           "FSUB v17.4S, v0.4S, v1.4S\n"
1292           "FADD v18.4S, v2.4S, v3.4S\n"
1293           "FSUB v19.4S, v2.4S, v3.4S\n"
1294           "FADD v20.4S, v4.4S, v5.4S\n"
1295           "FSUB v21.4S, v4.4S, v5.4S\n"
1296           "FADD v22.4S, v6.4S, v7.4S\n"
1297           "FSUB v23.4S, v6.4S, v7.4S\n"
1298           "FADD v24.4S, v8.4S, v9.4S\n"
1299           "FSUB v25.4S, v8.4S, v9.4S\n"
1300           "FADD v26.4S, v10.4S, v11.4S\n"
1301           "FSUB v27.4S, v10.4S, v11.4S\n"
1302           "FADD v28.4S, v12.4S, v13.4S\n"
1303           "FSUB v29.4S, v12.4S, v13.4S\n"
1304           "FADD v30.4S, v14.4S, v15.4S\n"
1305           "FSUB v31.4S, v14.4S, v15.4S\n"
1306           "FADD v0.4S, v16.4S, v18.4S\n"
1307           "FSUB v2.4S, v16.4S, v18.4S\n"
1308           "FADD v1.4S, v17.4S, v19.4S\n"
1309           "FSUB v3.4S, v17.4S, v19.4S\n"
1310           "FADD v4.4S, v20.4S, v22.4S\n"
1311           "FSUB v6.4S, v20.4S, v22.4S\n"
1312           "FADD v5.4S, v21.4S, v23.4S\n"
1313           "FSUB v7.4S, v21.4S, v23.4S\n"
1314           "FADD v8.4S, v24.4S, v26.4S\n"
1315           "FSUB v10.4S, v24.4S, v26.4S\n"
1316           "FADD v9.4S, v25.4S, v27.4S\n"
1317           "FSUB v11.4S, v25.4S, v27.4S\n"
1318           "FADD v12.4S, v28.4S, v30.4S\n"
1319           "FSUB v14.4S, v28.4S, v30.4S\n"
1320           "FADD v13.4S, v29.4S, v31.4S\n"
1321           "FSUB v15.4S, v29.4S, v31.4S\n"
1322           "FADD v16.4S, v0.4S, v4.4S\n"
1323           "FSUB v20.4S, v0.4S, v4.4S\n"
1324           "FADD v17.4S, v1.4S, v5.4S\n"
1325           "FSUB v21.4S, v1.4S, v5.4S\n"
1326           "FADD v18.4S, v2.4S, v6.4S\n"
1327           "FSUB v22.4S, v2.4S, v6.4S\n"
1328           "FADD v19.4S, v3.4S, v7.4S\n"
1329           "FSUB v23.4S, v3.4S, v7.4S\n"
1330           "FADD v24.4S, v8.4S, v12.4S\n"
1331           "FSUB v28.4S, v8.4S, v12.4S\n"
1332           "FADD v25.4S, v9.4S, v13.4S\n"
1333           "FSUB v29.4S, v9.4S, v13.4S\n"
1334           "FADD v26.4S, v10.4S, v14.4S\n"
1335           "FSUB v30.4S, v10.4S, v14.4S\n"
1336           "FADD v27.4S, v11.4S, v15.4S\n"
1337           "FSUB v31.4S, v11.4S, v15.4S\n"
1338           "FADD v0.4S, v16.4S, v24.4S\n"
1339           "FSUB v8.4S, v16.4S, v24.4S\n"
1340           "FADD v1.4S, v17.4S, v25.4S\n"
1341           "FSUB v9.4S, v17.4S, v25.4S\n"
1342           "FADD v2.4S, v18.4S, v26.4S\n"
1343           "FSUB v10.4S, v18.4S, v26.4S\n"
1344           "FADD v3.4S, v19.4S, v27.4S\n"
1345           "FSUB v11.4S, v19.4S, v27.4S\n"
1346           "FADD v4.4S, v20.4S, v28.4S\n"
1347           "FSUB v12.4S, v20.4S, v28.4S\n"
1348           "FADD v5.4S, v21.4S, v29.4S\n"
1349           "FSUB v13.4S, v21.4S, v29.4S\n"
1350           "FADD v6.4S, v22.4S, v30.4S\n"
1351           "FSUB v14.4S, v22.4S, v30.4S\n"
1352           "FADD v7.4S, v23.4S, v31.4S\n"
1353           "FSUB v15.4S, v23.4S, v31.4S\n"
1354           "ST1 {v0.4S}, [%0]\n"
1355           "ST1 {v1.4S}, [%1]\n"
1356           "ST1 {v2.4S}, [%2]\n"
1357           "ST1 {v3.4S}, [%3]\n"
1358           "ST1 {v4.4S}, [%4]\n"
1359           "ST1 {v5.4S}, [%5]\n"
1360           "ST1 {v6.4S}, [%6]\n"
1361           "ST1 {v7.4S}, [%7]\n"
1362           "ST1 {v8.4S}, [%8]\n"
1363           "ST1 {v9.4S}, [%9]\n"
1364           "ST1 {v10.4S}, [%10]\n"
1365           "ST1 {v11.4S}, [%11]\n"
1366           "ST1 {v12.4S}, [%12]\n"
1367           "ST1 {v13.4S}, [%13]\n"
1368           "ST1 {v14.4S}, [%14]\n"
1369           "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
1370           "r"(buf + j + k + 4),
1371           "r"(buf + j + k + 8),
1372           "r"(buf + j + k + 12),
1373           "r"(buf + j + k + 16),
1374           "r"(buf + j + k + 20),
1375           "r"(buf + j + k + 24),
1376           "r"(buf + j + k + 28),
1377           "r"(buf + j + k + 32),
1378           "r"(buf + j + k + 36),
1379           "r"(buf + j + k + 40),
1380           "r"(buf + j + k + 44),
1381           "r"(buf + j + k + 48),
1382           "r"(buf + j + k + 52),
1383           "r"(buf + j + k + 56),
1384           "r"(buf + j + k + 60)
1385           : "%v0",
1386             "%v1",
1387             "%v2",
1388             "%v3",
1389             "%v4",
1390             "%v5",
1391             "%v6",
1392             "%v7",
1393             "%v8",
1394             "%v9",
1395             "%v10",
1396             "%v11",
1397             "%v12",
1398             "%v13",
1399             "%v14",
1400             "%v15",
1401             "%v16",
1402             "%v17",
1403             "%v18",
1404             "%v19",
1405             "%v20",
1406             "%v21",
1407             "%v22",
1408             "%v23",
1409             "%v24",
1410             "%v25",
1411             "%v26",
1412             "%v27",
1413             "%v28",
1414             "%v29",
1415             "%v30",
1416             "%v31",
1417             "memory");
1418     }
1419   }
1420   for (int j = 0; j < 8192; j += 1024) {
1421     for (int k = 0; k < 64; k += 4) {
1422       __asm__ volatile(
1423           "LD1 {v0.4S}, [%0]\n"
1424           "LD1 {v1.4S}, [%1]\n"
1425           "LD1 {v2.4S}, [%2]\n"
1426           "LD1 {v3.4S}, [%3]\n"
1427           "LD1 {v4.4S}, [%4]\n"
1428           "LD1 {v5.4S}, [%5]\n"
1429           "LD1 {v6.4S}, [%6]\n"
1430           "LD1 {v7.4S}, [%7]\n"
1431           "LD1 {v8.4S}, [%8]\n"
1432           "LD1 {v9.4S}, [%9]\n"
1433           "LD1 {v10.4S}, [%10]\n"
1434           "LD1 {v11.4S}, [%11]\n"
1435           "LD1 {v12.4S}, [%12]\n"
1436           "LD1 {v13.4S}, [%13]\n"
1437           "LD1 {v14.4S}, [%14]\n"
1438           "LD1 {v15.4S}, [%15]\n"
1439           "FADD v16.4S, v0.4S, v1.4S\n"
1440           "FSUB v17.4S, v0.4S, v1.4S\n"
1441           "FADD v18.4S, v2.4S, v3.4S\n"
1442           "FSUB v19.4S, v2.4S, v3.4S\n"
1443           "FADD v20.4S, v4.4S, v5.4S\n"
1444           "FSUB v21.4S, v4.4S, v5.4S\n"
1445           "FADD v22.4S, v6.4S, v7.4S\n"
1446           "FSUB v23.4S, v6.4S, v7.4S\n"
1447           "FADD v24.4S, v8.4S, v9.4S\n"
1448           "FSUB v25.4S, v8.4S, v9.4S\n"
1449           "FADD v26.4S, v10.4S, v11.4S\n"
1450           "FSUB v27.4S, v10.4S, v11.4S\n"
1451           "FADD v28.4S, v12.4S, v13.4S\n"
1452           "FSUB v29.4S, v12.4S, v13.4S\n"
1453           "FADD v30.4S, v14.4S, v15.4S\n"
1454           "FSUB v31.4S, v14.4S, v15.4S\n"
1455           "FADD v0.4S, v16.4S, v18.4S\n"
1456           "FSUB v2.4S, v16.4S, v18.4S\n"
1457           "FADD v1.4S, v17.4S, v19.4S\n"
1458           "FSUB v3.4S, v17.4S, v19.4S\n"
1459           "FADD v4.4S, v20.4S, v22.4S\n"
1460           "FSUB v6.4S, v20.4S, v22.4S\n"
1461           "FADD v5.4S, v21.4S, v23.4S\n"
1462           "FSUB v7.4S, v21.4S, v23.4S\n"
1463           "FADD v8.4S, v24.4S, v26.4S\n"
1464           "FSUB v10.4S, v24.4S, v26.4S\n"
1465           "FADD v9.4S, v25.4S, v27.4S\n"
1466           "FSUB v11.4S, v25.4S, v27.4S\n"
1467           "FADD v12.4S, v28.4S, v30.4S\n"
1468           "FSUB v14.4S, v28.4S, v30.4S\n"
1469           "FADD v13.4S, v29.4S, v31.4S\n"
1470           "FSUB v15.4S, v29.4S, v31.4S\n"
1471           "FADD v16.4S, v0.4S, v4.4S\n"
1472           "FSUB v20.4S, v0.4S, v4.4S\n"
1473           "FADD v17.4S, v1.4S, v5.4S\n"
1474           "FSUB v21.4S, v1.4S, v5.4S\n"
1475           "FADD v18.4S, v2.4S, v6.4S\n"
1476           "FSUB v22.4S, v2.4S, v6.4S\n"
1477           "FADD v19.4S, v3.4S, v7.4S\n"
1478           "FSUB v23.4S, v3.4S, v7.4S\n"
1479           "FADD v24.4S, v8.4S, v12.4S\n"
1480           "FSUB v28.4S, v8.4S, v12.4S\n"
1481           "FADD v25.4S, v9.4S, v13.4S\n"
1482           "FSUB v29.4S, v9.4S, v13.4S\n"
1483           "FADD v26.4S, v10.4S, v14.4S\n"
1484           "FSUB v30.4S, v10.4S, v14.4S\n"
1485           "FADD v27.4S, v11.4S, v15.4S\n"
1486           "FSUB v31.4S, v11.4S, v15.4S\n"
1487           "FADD v0.4S, v16.4S, v24.4S\n"
1488           "FSUB v8.4S, v16.4S, v24.4S\n"
1489           "FADD v1.4S, v17.4S, v25.4S\n"
1490           "FSUB v9.4S, v17.4S, v25.4S\n"
1491           "FADD v2.4S, v18.4S, v26.4S\n"
1492           "FSUB v10.4S, v18.4S, v26.4S\n"
1493           "FADD v3.4S, v19.4S, v27.4S\n"
1494           "FSUB v11.4S, v19.4S, v27.4S\n"
1495           "FADD v4.4S, v20.4S, v28.4S\n"
1496           "FSUB v12.4S, v20.4S, v28.4S\n"
1497           "FADD v5.4S, v21.4S, v29.4S\n"
1498           "FSUB v13.4S, v21.4S, v29.4S\n"
1499           "FADD v6.4S, v22.4S, v30.4S\n"
1500           "FSUB v14.4S, v22.4S, v30.4S\n"
1501           "FADD v7.4S, v23.4S, v31.4S\n"
1502           "FSUB v15.4S, v23.4S, v31.4S\n"
1503           "ST1 {v0.4S}, [%0]\n"
1504           "ST1 {v1.4S}, [%1]\n"
1505           "ST1 {v2.4S}, [%2]\n"
1506           "ST1 {v3.4S}, [%3]\n"
1507           "ST1 {v4.4S}, [%4]\n"
1508           "ST1 {v5.4S}, [%5]\n"
1509           "ST1 {v6.4S}, [%6]\n"
1510           "ST1 {v7.4S}, [%7]\n"
1511           "ST1 {v8.4S}, [%8]\n"
1512           "ST1 {v9.4S}, [%9]\n"
1513           "ST1 {v10.4S}, [%10]\n"
1514           "ST1 {v11.4S}, [%11]\n"
1515           "ST1 {v12.4S}, [%12]\n"
1516           "ST1 {v13.4S}, [%13]\n"
1517           "ST1 {v14.4S}, [%14]\n"
1518           "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
1519           "r"(buf + j + k + 64),
1520           "r"(buf + j + k + 128),
1521           "r"(buf + j + k + 192),
1522           "r"(buf + j + k + 256),
1523           "r"(buf + j + k + 320),
1524           "r"(buf + j + k + 384),
1525           "r"(buf + j + k + 448),
1526           "r"(buf + j + k + 512),
1527           "r"(buf + j + k + 576),
1528           "r"(buf + j + k + 640),
1529           "r"(buf + j + k + 704),
1530           "r"(buf + j + k + 768),
1531           "r"(buf + j + k + 832),
1532           "r"(buf + j + k + 896),
1533           "r"(buf + j + k + 960)
1534           : "%v0",
1535             "%v1",
1536             "%v2",
1537             "%v3",
1538             "%v4",
1539             "%v5",
1540             "%v6",
1541             "%v7",
1542             "%v8",
1543             "%v9",
1544             "%v10",
1545             "%v11",
1546             "%v12",
1547             "%v13",
1548             "%v14",
1549             "%v15",
1550             "%v16",
1551             "%v17",
1552             "%v18",
1553             "%v19",
1554             "%v20",
1555             "%v21",
1556             "%v22",
1557             "%v23",
1558             "%v24",
1559             "%v25",
1560             "%v26",
1561             "%v27",
1562             "%v28",
1563             "%v29",
1564             "%v30",
1565             "%v31",
1566             "memory");
1567     }
1568   }
1569   for (int j = 0; j < 8192; j += 8192) {
1570     for (int k = 0; k < 1024; k += 4) {
1571       __asm__ volatile(
1572           "LD1 {v0.4S}, [%0]\n"
1573           "LD1 {v1.4S}, [%1]\n"
1574           "LD1 {v2.4S}, [%2]\n"
1575           "LD1 {v3.4S}, [%3]\n"
1576           "LD1 {v4.4S}, [%4]\n"
1577           "LD1 {v5.4S}, [%5]\n"
1578           "LD1 {v6.4S}, [%6]\n"
1579           "LD1 {v7.4S}, [%7]\n"
1580           "FADD v16.4S, v0.4S, v1.4S\n"
1581           "FSUB v17.4S, v0.4S, v1.4S\n"
1582           "FADD v18.4S, v2.4S, v3.4S\n"
1583           "FSUB v19.4S, v2.4S, v3.4S\n"
1584           "FADD v20.4S, v4.4S, v5.4S\n"
1585           "FSUB v21.4S, v4.4S, v5.4S\n"
1586           "FADD v22.4S, v6.4S, v7.4S\n"
1587           "FSUB v23.4S, v6.4S, v7.4S\n"
1588           "FADD v0.4S, v16.4S, v18.4S\n"
1589           "FSUB v2.4S, v16.4S, v18.4S\n"
1590           "FADD v1.4S, v17.4S, v19.4S\n"
1591           "FSUB v3.4S, v17.4S, v19.4S\n"
1592           "FADD v4.4S, v20.4S, v22.4S\n"
1593           "FSUB v6.4S, v20.4S, v22.4S\n"
1594           "FADD v5.4S, v21.4S, v23.4S\n"
1595           "FSUB v7.4S, v21.4S, v23.4S\n"
1596           "FADD v16.4S, v0.4S, v4.4S\n"
1597           "FSUB v20.4S, v0.4S, v4.4S\n"
1598           "FADD v17.4S, v1.4S, v5.4S\n"
1599           "FSUB v21.4S, v1.4S, v5.4S\n"
1600           "FADD v18.4S, v2.4S, v6.4S\n"
1601           "FSUB v22.4S, v2.4S, v6.4S\n"
1602           "FADD v19.4S, v3.4S, v7.4S\n"
1603           "FSUB v23.4S, v3.4S, v7.4S\n"
1604           "ST1 {v16.4S}, [%0]\n"
1605           "ST1 {v17.4S}, [%1]\n"
1606           "ST1 {v18.4S}, [%2]\n"
1607           "ST1 {v19.4S}, [%3]\n"
1608           "ST1 {v20.4S}, [%4]\n"
1609           "ST1 {v21.4S}, [%5]\n"
1610           "ST1 {v22.4S}, [%6]\n"
1611           "ST1 {v23.4S}, [%7]\n" ::"r"(buf + j + k + 0),
1612           "r"(buf + j + k + 1024),
1613           "r"(buf + j + k + 2048),
1614           "r"(buf + j + k + 3072),
1615           "r"(buf + j + k + 4096),
1616           "r"(buf + j + k + 5120),
1617           "r"(buf + j + k + 6144),
1618           "r"(buf + j + k + 7168)
1619           : "%v0",
1620             "%v1",
1621             "%v2",
1622             "%v3",
1623             "%v4",
1624             "%v5",
1625             "%v6",
1626             "%v7",
1627             "%v8",
1628             "%v9",
1629             "%v10",
1630             "%v11",
1631             "%v12",
1632             "%v13",
1633             "%v14",
1634             "%v15",
1635             "%v16",
1636             "%v17",
1637             "%v18",
1638             "%v19",
1639             "%v20",
1640             "%v21",
1641             "%v22",
1642             "%v23",
1643             "%v24",
1644             "%v25",
1645             "%v26",
1646             "%v27",
1647             "%v28",
1648             "%v29",
1649             "%v30",
1650             "%v31",
1651             "memory");
1652     }
1653   }
1654 }
1655 void helper_float_14_recursive(float* buf, int depth);
helper_float_14_recursive(float * buf,int depth)1656 void helper_float_14_recursive(float* buf, int depth) {
1657   if (depth == 10) {
1658     helper_float_10(buf);
1659     return;
1660   }
1661   if (depth == 14) {
1662     helper_float_14_recursive(buf + 0, 10);
1663     helper_float_14_recursive(buf + 1024, 10);
1664     helper_float_14_recursive(buf + 2048, 10);
1665     helper_float_14_recursive(buf + 3072, 10);
1666     helper_float_14_recursive(buf + 4096, 10);
1667     helper_float_14_recursive(buf + 5120, 10);
1668     helper_float_14_recursive(buf + 6144, 10);
1669     helper_float_14_recursive(buf + 7168, 10);
1670     helper_float_14_recursive(buf + 8192, 10);
1671     helper_float_14_recursive(buf + 9216, 10);
1672     helper_float_14_recursive(buf + 10240, 10);
1673     helper_float_14_recursive(buf + 11264, 10);
1674     helper_float_14_recursive(buf + 12288, 10);
1675     helper_float_14_recursive(buf + 13312, 10);
1676     helper_float_14_recursive(buf + 14336, 10);
1677     helper_float_14_recursive(buf + 15360, 10);
1678     for (int j = 0; j < 16384; j += 16384) {
1679       for (int k = 0; k < 1024; k += 4) {
1680         __asm__ volatile(
1681             "LD1 {v0.4S}, [%0]\n"
1682             "LD1 {v1.4S}, [%1]\n"
1683             "LD1 {v2.4S}, [%2]\n"
1684             "LD1 {v3.4S}, [%3]\n"
1685             "LD1 {v4.4S}, [%4]\n"
1686             "LD1 {v5.4S}, [%5]\n"
1687             "LD1 {v6.4S}, [%6]\n"
1688             "LD1 {v7.4S}, [%7]\n"
1689             "LD1 {v8.4S}, [%8]\n"
1690             "LD1 {v9.4S}, [%9]\n"
1691             "LD1 {v10.4S}, [%10]\n"
1692             "LD1 {v11.4S}, [%11]\n"
1693             "LD1 {v12.4S}, [%12]\n"
1694             "LD1 {v13.4S}, [%13]\n"
1695             "LD1 {v14.4S}, [%14]\n"
1696             "LD1 {v15.4S}, [%15]\n"
1697             "FADD v16.4S, v0.4S, v1.4S\n"
1698             "FSUB v17.4S, v0.4S, v1.4S\n"
1699             "FADD v18.4S, v2.4S, v3.4S\n"
1700             "FSUB v19.4S, v2.4S, v3.4S\n"
1701             "FADD v20.4S, v4.4S, v5.4S\n"
1702             "FSUB v21.4S, v4.4S, v5.4S\n"
1703             "FADD v22.4S, v6.4S, v7.4S\n"
1704             "FSUB v23.4S, v6.4S, v7.4S\n"
1705             "FADD v24.4S, v8.4S, v9.4S\n"
1706             "FSUB v25.4S, v8.4S, v9.4S\n"
1707             "FADD v26.4S, v10.4S, v11.4S\n"
1708             "FSUB v27.4S, v10.4S, v11.4S\n"
1709             "FADD v28.4S, v12.4S, v13.4S\n"
1710             "FSUB v29.4S, v12.4S, v13.4S\n"
1711             "FADD v30.4S, v14.4S, v15.4S\n"
1712             "FSUB v31.4S, v14.4S, v15.4S\n"
1713             "FADD v0.4S, v16.4S, v18.4S\n"
1714             "FSUB v2.4S, v16.4S, v18.4S\n"
1715             "FADD v1.4S, v17.4S, v19.4S\n"
1716             "FSUB v3.4S, v17.4S, v19.4S\n"
1717             "FADD v4.4S, v20.4S, v22.4S\n"
1718             "FSUB v6.4S, v20.4S, v22.4S\n"
1719             "FADD v5.4S, v21.4S, v23.4S\n"
1720             "FSUB v7.4S, v21.4S, v23.4S\n"
1721             "FADD v8.4S, v24.4S, v26.4S\n"
1722             "FSUB v10.4S, v24.4S, v26.4S\n"
1723             "FADD v9.4S, v25.4S, v27.4S\n"
1724             "FSUB v11.4S, v25.4S, v27.4S\n"
1725             "FADD v12.4S, v28.4S, v30.4S\n"
1726             "FSUB v14.4S, v28.4S, v30.4S\n"
1727             "FADD v13.4S, v29.4S, v31.4S\n"
1728             "FSUB v15.4S, v29.4S, v31.4S\n"
1729             "FADD v16.4S, v0.4S, v4.4S\n"
1730             "FSUB v20.4S, v0.4S, v4.4S\n"
1731             "FADD v17.4S, v1.4S, v5.4S\n"
1732             "FSUB v21.4S, v1.4S, v5.4S\n"
1733             "FADD v18.4S, v2.4S, v6.4S\n"
1734             "FSUB v22.4S, v2.4S, v6.4S\n"
1735             "FADD v19.4S, v3.4S, v7.4S\n"
1736             "FSUB v23.4S, v3.4S, v7.4S\n"
1737             "FADD v24.4S, v8.4S, v12.4S\n"
1738             "FSUB v28.4S, v8.4S, v12.4S\n"
1739             "FADD v25.4S, v9.4S, v13.4S\n"
1740             "FSUB v29.4S, v9.4S, v13.4S\n"
1741             "FADD v26.4S, v10.4S, v14.4S\n"
1742             "FSUB v30.4S, v10.4S, v14.4S\n"
1743             "FADD v27.4S, v11.4S, v15.4S\n"
1744             "FSUB v31.4S, v11.4S, v15.4S\n"
1745             "FADD v0.4S, v16.4S, v24.4S\n"
1746             "FSUB v8.4S, v16.4S, v24.4S\n"
1747             "FADD v1.4S, v17.4S, v25.4S\n"
1748             "FSUB v9.4S, v17.4S, v25.4S\n"
1749             "FADD v2.4S, v18.4S, v26.4S\n"
1750             "FSUB v10.4S, v18.4S, v26.4S\n"
1751             "FADD v3.4S, v19.4S, v27.4S\n"
1752             "FSUB v11.4S, v19.4S, v27.4S\n"
1753             "FADD v4.4S, v20.4S, v28.4S\n"
1754             "FSUB v12.4S, v20.4S, v28.4S\n"
1755             "FADD v5.4S, v21.4S, v29.4S\n"
1756             "FSUB v13.4S, v21.4S, v29.4S\n"
1757             "FADD v6.4S, v22.4S, v30.4S\n"
1758             "FSUB v14.4S, v22.4S, v30.4S\n"
1759             "FADD v7.4S, v23.4S, v31.4S\n"
1760             "FSUB v15.4S, v23.4S, v31.4S\n"
1761             "ST1 {v0.4S}, [%0]\n"
1762             "ST1 {v1.4S}, [%1]\n"
1763             "ST1 {v2.4S}, [%2]\n"
1764             "ST1 {v3.4S}, [%3]\n"
1765             "ST1 {v4.4S}, [%4]\n"
1766             "ST1 {v5.4S}, [%5]\n"
1767             "ST1 {v6.4S}, [%6]\n"
1768             "ST1 {v7.4S}, [%7]\n"
1769             "ST1 {v8.4S}, [%8]\n"
1770             "ST1 {v9.4S}, [%9]\n"
1771             "ST1 {v10.4S}, [%10]\n"
1772             "ST1 {v11.4S}, [%11]\n"
1773             "ST1 {v12.4S}, [%12]\n"
1774             "ST1 {v13.4S}, [%13]\n"
1775             "ST1 {v14.4S}, [%14]\n"
1776             "ST1 {v15.4S}, [%15]\n" ::"r"(buf + j + k + 0),
1777             "r"(buf + j + k + 1024),
1778             "r"(buf + j + k + 2048),
1779             "r"(buf + j + k + 3072),
1780             "r"(buf + j + k + 4096),
1781             "r"(buf + j + k + 5120),
1782             "r"(buf + j + k + 6144),
1783             "r"(buf + j + k + 7168),
1784             "r"(buf + j + k + 8192),
1785             "r"(buf + j + k + 9216),
1786             "r"(buf + j + k + 10240),
1787             "r"(buf + j + k + 11264),
1788             "r"(buf + j + k + 12288),
1789             "r"(buf + j + k + 13312),
1790             "r"(buf + j + k + 14336),
1791             "r"(buf + j + k + 15360)
1792             : "%v0",
1793               "%v1",
1794               "%v2",
1795               "%v3",
1796               "%v4",
1797               "%v5",
1798               "%v6",
1799               "%v7",
1800               "%v8",
1801               "%v9",
1802               "%v10",
1803               "%v11",
1804               "%v12",
1805               "%v13",
1806               "%v14",
1807               "%v15",
1808               "%v16",
1809               "%v17",
1810               "%v18",
1811               "%v19",
1812               "%v20",
1813               "%v21",
1814               "%v22",
1815               "%v23",
1816               "%v24",
1817               "%v25",
1818               "%v26",
1819               "%v27",
1820               "%v28",
1821               "%v29",
1822               "%v30",
1823               "%v31",
1824               "memory");
1825       }
1826     }
1827     return;
1828   }
1829 }
void helper_float_14(float* buf);
/*
 * Convenience entry point: starts helper_float_14_recursive on buf at its
 * top-level depth (14). buf is modified in place by the callee.
 */
void helper_float_14(float* buf) {
  const int top_depth = 14;
  helper_float_14_recursive(buf, top_depth);
}
void helper_float_15_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 32767].
 *
 * depth == 13: forwards to helper_float_13(buf) and returns.
 * depth == 15: runs the depth-13 path on each of the four consecutive
 *              8192-float quarters, then combines the quarters four floats
 *              at a time. With a, b, c, d the vectors loaded from quarters
 *              0..3, the values written back to quarters 0..3 are
 *              a+b+c+d, a-b+c-d, a+b-c-d and a-b-c+d.
 * Any other depth returns without touching buf.
 */
void helper_float_15_recursive(float* buf, int depth) {
  enum { QUARTER = 8192 };

  if (depth == 13) {
    helper_float_13(buf);
    return;
  }
  if (depth != 15) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_15_recursive(buf + q * QUARTER, 13);
  }

  /* AArch64 NEON: one 4-float vector from every quarter per iteration. */
  for (int off = 0; off < QUARTER; off += 4) {
    float* p0 = buf + off;
    float* p1 = p0 + QUARTER;
    float* p2 = p1 + QUARTER;
    float* p3 = p2 + QUARTER;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(p0), "r"(p1), "r"(p2), "r"(p3)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_15(float* buf);
/*
 * Convenience entry point: starts helper_float_15_recursive on buf at its
 * top-level depth (15). buf is modified in place by the callee.
 */
void helper_float_15(float* buf) {
  const int top_depth = 15;
  helper_float_15_recursive(buf, top_depth);
}
void helper_float_16_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 65535].
 *
 * depth == 15: forwards to helper_float_15(buf) and returns.
 * depth == 16: runs the depth-15 path on each 32768-float half, then walks
 *              both halves four floats at a time, writing (lo + hi) back to
 *              the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_16_recursive(float* buf, int depth) {
  enum { HALF = 32768 };

  if (depth == 15) {
    helper_float_15(buf);
    return;
  }
  if (depth != 16) {
    return;
  }

  helper_float_16_recursive(buf, 15);
  helper_float_16_recursive(buf + HALF, 15);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_16(float* buf);
/*
 * Convenience entry point: starts helper_float_16_recursive on buf at its
 * top-level depth (16). buf is modified in place by the callee.
 */
void helper_float_16(float* buf) {
  const int top_depth = 16;
  helper_float_16_recursive(buf, top_depth);
}
void helper_float_17_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 131071].
 *
 * depth == 15: forwards to helper_float_15(buf) and returns.
 * depth == 17: runs the depth-15 path on each of the four consecutive
 *              32768-float quarters, then combines the quarters four floats
 *              at a time. With a, b, c, d the vectors loaded from quarters
 *              0..3, the values written back to quarters 0..3 are
 *              a+b+c+d, a-b+c-d, a+b-c-d and a-b-c+d.
 * Any other depth returns without touching buf.
 */
void helper_float_17_recursive(float* buf, int depth) {
  enum { QUARTER = 32768 };

  if (depth == 15) {
    helper_float_15(buf);
    return;
  }
  if (depth != 17) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_17_recursive(buf + q * QUARTER, 15);
  }

  /* AArch64 NEON: one 4-float vector from every quarter per iteration. */
  for (int off = 0; off < QUARTER; off += 4) {
    float* p0 = buf + off;
    float* p1 = p0 + QUARTER;
    float* p2 = p1 + QUARTER;
    float* p3 = p2 + QUARTER;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(p0), "r"(p1), "r"(p2), "r"(p3)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_17(float* buf);
/*
 * Convenience entry point: starts helper_float_17_recursive on buf at its
 * top-level depth (17). buf is modified in place by the callee.
 */
void helper_float_17(float* buf) {
  const int top_depth = 17;
  helper_float_17_recursive(buf, top_depth);
}
void helper_float_18_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 262143].
 *
 * depth == 17: forwards to helper_float_17(buf) and returns.
 * depth == 18: runs the depth-17 path on each 131072-float half, then walks
 *              both halves four floats at a time, writing (lo + hi) back to
 *              the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_18_recursive(float* buf, int depth) {
  enum { HALF = 131072 };

  if (depth == 17) {
    helper_float_17(buf);
    return;
  }
  if (depth != 18) {
    return;
  }

  helper_float_18_recursive(buf, 17);
  helper_float_18_recursive(buf + HALF, 17);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_18(float* buf);
/*
 * Convenience entry point: starts helper_float_18_recursive on buf at its
 * top-level depth (18). buf is modified in place by the callee.
 */
void helper_float_18(float* buf) {
  const int top_depth = 18;
  helper_float_18_recursive(buf, top_depth);
}
void helper_float_19_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 524287].
 *
 * depth == 18: forwards to helper_float_18(buf) and returns.
 * depth == 19: runs the depth-18 path on each 262144-float half, then walks
 *              both halves four floats at a time, writing (lo + hi) back to
 *              the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_19_recursive(float* buf, int depth) {
  enum { HALF = 262144 };

  if (depth == 18) {
    helper_float_18(buf);
    return;
  }
  if (depth != 19) {
    return;
  }

  helper_float_19_recursive(buf, 18);
  helper_float_19_recursive(buf + HALF, 18);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_19(float* buf);
/*
 * Convenience entry point: starts helper_float_19_recursive on buf at its
 * top-level depth (19). buf is modified in place by the callee.
 */
void helper_float_19(float* buf) {
  const int top_depth = 19;
  helper_float_19_recursive(buf, top_depth);
}
void helper_float_20_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 1048575].
 *
 * depth == 18: forwards to helper_float_18(buf) and returns.
 * depth == 20: runs the depth-18 path on each of the four consecutive
 *              262144-float quarters, then combines the quarters four
 *              floats at a time. With a, b, c, d the vectors loaded from
 *              quarters 0..3, the values written back to quarters 0..3 are
 *              a+b+c+d, a-b+c-d, a+b-c-d and a-b-c+d.
 * Any other depth returns without touching buf.
 */
void helper_float_20_recursive(float* buf, int depth) {
  enum { QUARTER = 262144 };

  if (depth == 18) {
    helper_float_18(buf);
    return;
  }
  if (depth != 20) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_20_recursive(buf + q * QUARTER, 18);
  }

  /* AArch64 NEON: one 4-float vector from every quarter per iteration. */
  for (int off = 0; off < QUARTER; off += 4) {
    float* p0 = buf + off;
    float* p1 = p0 + QUARTER;
    float* p2 = p1 + QUARTER;
    float* p3 = p2 + QUARTER;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(p0), "r"(p1), "r"(p2), "r"(p3)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_20(float* buf);
/*
 * Convenience entry point: starts helper_float_20_recursive on buf at its
 * top-level depth (20). buf is modified in place by the callee.
 */
void helper_float_20(float* buf) {
  const int top_depth = 20;
  helper_float_20_recursive(buf, top_depth);
}
void helper_float_21_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 2097151].
 *
 * depth == 20: forwards to helper_float_20(buf) and returns.
 * depth == 21: runs the depth-20 path on each 1048576-float half, then
 *              walks both halves four floats at a time, writing (lo + hi)
 *              back to the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_21_recursive(float* buf, int depth) {
  enum { HALF = 1048576 };

  if (depth == 20) {
    helper_float_20(buf);
    return;
  }
  if (depth != 21) {
    return;
  }

  helper_float_21_recursive(buf, 20);
  helper_float_21_recursive(buf + HALF, 20);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_21(float* buf);
/*
 * Convenience entry point: starts helper_float_21_recursive on buf at its
 * top-level depth (21). buf is modified in place by the callee.
 */
void helper_float_21(float* buf) {
  const int top_depth = 21;
  helper_float_21_recursive(buf, top_depth);
}
void helper_float_22_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 4194303].
 *
 * depth == 20: forwards to helper_float_20(buf) and returns.
 * depth == 22: runs the depth-20 path on each of the four consecutive
 *              1048576-float quarters, then combines the quarters four
 *              floats at a time. With a, b, c, d the vectors loaded from
 *              quarters 0..3, the values written back to quarters 0..3 are
 *              a+b+c+d, a-b+c-d, a+b-c-d and a-b-c+d.
 * Any other depth returns without touching buf.
 */
void helper_float_22_recursive(float* buf, int depth) {
  enum { QUARTER = 1048576 };

  if (depth == 20) {
    helper_float_20(buf);
    return;
  }
  if (depth != 22) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_22_recursive(buf + q * QUARTER, 20);
  }

  /* AArch64 NEON: one 4-float vector from every quarter per iteration. */
  for (int off = 0; off < QUARTER; off += 4) {
    float* p0 = buf + off;
    float* p1 = p0 + QUARTER;
    float* p2 = p1 + QUARTER;
    float* p3 = p2 + QUARTER;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(p0), "r"(p1), "r"(p2), "r"(p3)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_22(float* buf);
/*
 * Convenience entry point: starts helper_float_22_recursive on buf at its
 * top-level depth (22). buf is modified in place by the callee.
 */
void helper_float_22(float* buf) {
  const int top_depth = 22;
  helper_float_22_recursive(buf, top_depth);
}
void helper_float_23_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 8388607].
 *
 * depth == 22: forwards to helper_float_22(buf) and returns.
 * depth == 23: runs the depth-22 path on each 4194304-float half, then
 *              walks both halves four floats at a time, writing (lo + hi)
 *              back to the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_23_recursive(float* buf, int depth) {
  enum { HALF = 4194304 };

  if (depth == 22) {
    helper_float_22(buf);
    return;
  }
  if (depth != 23) {
    return;
  }

  helper_float_23_recursive(buf, 22);
  helper_float_23_recursive(buf + HALF, 22);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_23(float* buf);
/*
 * Convenience entry point: starts helper_float_23_recursive on buf at its
 * top-level depth (23). buf is modified in place by the callee.
 */
void helper_float_23(float* buf) {
  const int top_depth = 23;
  helper_float_23_recursive(buf, top_depth);
}
void helper_float_24_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 16777215].
 *
 * depth == 23: forwards to helper_float_23(buf) and returns.
 * depth == 24: runs the depth-23 path on each 8388608-float half, then
 *              walks both halves four floats at a time, writing (lo + hi)
 *              back to the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_24_recursive(float* buf, int depth) {
  enum { HALF = 8388608 };

  if (depth == 23) {
    helper_float_23(buf);
    return;
  }
  if (depth != 24) {
    return;
  }

  helper_float_24_recursive(buf, 23);
  helper_float_24_recursive(buf + HALF, 23);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_24(float* buf);
/*
 * Convenience entry point: starts helper_float_24_recursive on buf at its
 * top-level depth (24). buf is modified in place by the callee.
 */
void helper_float_24(float* buf) {
  const int top_depth = 24;
  helper_float_24_recursive(buf, top_depth);
}
void helper_float_25_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 33554431].
 *
 * depth == 23: forwards to helper_float_23(buf) and returns.
 * depth == 25: runs the depth-23 path on each of the four consecutive
 *              8388608-float quarters, then combines the quarters four
 *              floats at a time. With a, b, c, d the vectors loaded from
 *              quarters 0..3, the values written back to quarters 0..3 are
 *              a+b+c+d, a-b+c-d, a+b-c-d and a-b-c+d.
 * Any other depth returns without touching buf.
 */
void helper_float_25_recursive(float* buf, int depth) {
  enum { QUARTER = 8388608 };

  if (depth == 23) {
    helper_float_23(buf);
    return;
  }
  if (depth != 25) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_25_recursive(buf + q * QUARTER, 23);
  }

  /* AArch64 NEON: one 4-float vector from every quarter per iteration. */
  for (int off = 0; off < QUARTER; off += 4) {
    float* p0 = buf + off;
    float* p1 = p0 + QUARTER;
    float* p2 = p1 + QUARTER;
    float* p3 = p2 + QUARTER;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(p0), "r"(p1), "r"(p2), "r"(p3)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_25(float* buf);
/*
 * Convenience entry point: starts helper_float_25_recursive on buf at its
 * top-level depth (25). buf is modified in place by the callee.
 */
void helper_float_25(float* buf) {
  const int top_depth = 25;
  helper_float_25_recursive(buf, top_depth);
}
void helper_float_26_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 67108863].
 *
 * depth == 25: forwards to helper_float_25(buf) and returns.
 * depth == 26: runs the depth-25 path on each 33554432-float half, then
 *              walks both halves four floats at a time, writing (lo + hi)
 *              back to the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_26_recursive(float* buf, int depth) {
  enum { HALF = 33554432 };

  if (depth == 25) {
    helper_float_25(buf);
    return;
  }
  if (depth != 26) {
    return;
  }

  helper_float_26_recursive(buf, 25);
  helper_float_26_recursive(buf + HALF, 25);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_26(float* buf);
/*
 * Convenience entry point: starts helper_float_26_recursive on buf at its
 * top-level depth (26). buf is modified in place by the callee.
 */
void helper_float_26(float* buf) {
  const int top_depth = 26;
  helper_float_26_recursive(buf, top_depth);
}
void helper_float_27_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 134217727].
 *
 * depth == 26: forwards to helper_float_26(buf) and returns.
 * depth == 27: runs the depth-26 path on each 67108864-float half, then
 *              walks both halves four floats at a time, writing (lo + hi)
 *              back to the lower half and (lo - hi) back to the upper half.
 * Any other depth returns without touching buf.
 */
void helper_float_27_recursive(float* buf, int depth) {
  enum { HALF = 67108864 };

  if (depth == 26) {
    helper_float_26(buf);
    return;
  }
  if (depth != 27) {
    return;
  }

  helper_float_27_recursive(buf, 26);
  helper_float_27_recursive(buf + HALF, 26);

  /* AArch64 NEON: one 4-float vector from each half per iteration. */
  for (int off = 0; off < HALF; off += 4) {
    float* lo = buf + off;
    float* hi = lo + HALF;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
void helper_float_27(float* buf);
/*
 * Convenience entry point: starts helper_float_27_recursive on buf at its
 * top-level depth (27). buf is modified in place by the callee.
 */
void helper_float_27(float* buf) {
  const int top_depth = 27;
  helper_float_27_recursive(buf, top_depth);
}
void helper_float_28_recursive(float* buf, int depth);
/*
 * In-place transform step over buf[0 .. 268435455].
 *
 * depth == 26: forwards to helper_float_26(buf) and returns.
 * depth == 28: runs the depth-26 path on each of the four consecutive
 *              67108864-float quarters, then combines the quarters four
 *              floats at a time. With a, b, c, d the vectors loaded from
 *              quarters 0..3, the values written back to quarters 0..3 are
 *              a+b+c+d, a-b+c-d, a+b-c-d and a-b-c+d.
 * Any other depth returns without touching buf.
 */
void helper_float_28_recursive(float* buf, int depth) {
  enum { QUARTER = 67108864 };

  if (depth == 26) {
    helper_float_26(buf);
    return;
  }
  if (depth != 28) {
    return;
  }

  for (int q = 0; q < 4; ++q) {
    helper_float_28_recursive(buf + q * QUARTER, 26);
  }

  /* AArch64 NEON: one 4-float vector from every quarter per iteration. */
  for (int off = 0; off < QUARTER; off += 4) {
    float* p0 = buf + off;
    float* p1 = p0 + QUARTER;
    float* p2 = p1 + QUARTER;
    float* p3 = p2 + QUARTER;
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "LD1 {v2.4S}, [%2]\n"
        "LD1 {v3.4S}, [%3]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "FADD v18.4S, v2.4S, v3.4S\n"
        "FSUB v19.4S, v2.4S, v3.4S\n"
        "FADD v0.4S, v16.4S, v18.4S\n"
        "FSUB v2.4S, v16.4S, v18.4S\n"
        "FADD v1.4S, v17.4S, v19.4S\n"
        "FSUB v3.4S, v17.4S, v19.4S\n"
        "ST1 {v0.4S}, [%0]\n"
        "ST1 {v1.4S}, [%1]\n"
        "ST1 {v2.4S}, [%2]\n"
        "ST1 {v3.4S}, [%3]\n"
        :
        : "r"(p0), "r"(p1), "r"(p2), "r"(p3)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
// Entry point for the depth-28 transform: kicks off
// helper_float_28_recursive at its top level (depth 28), which processes
// buf[0 .. 2^28).
void helper_float_28(float* buf);
void helper_float_28(float* buf) {
  enum { kTopDepth = 28 };
  helper_float_28_recursive(buf, kTopDepth);
}
// Recursive driver working on buf[0 .. 2^29).
//
//   depth == 28: hands the buffer to helper_float_28 and returns.
//   depth == 29: runs itself at depth 28 on the lower and upper halves
//                (2^28 floats each), then merges them with a 2-way butterfly.
//   any other depth: does nothing.
void helper_float_29_recursive(float* buf, int depth);
void helper_float_29_recursive(float* buf, int depth) {
  enum { kHalf = 268435456 };  // 2^28 floats, half of the buffer

  if (depth == 28) {
    helper_float_28(buf);
    return;
  }
  if (depth != 29) {
    return;
  }

  helper_float_29_recursive(buf, 28);
  helper_float_29_recursive(buf + kHalf, 28);

  // Merge pass, four floats per half per iteration:
  //   [lo] = lo + hi,  [hi] = lo - hi
  for (int offset = 0; offset < kHalf; offset += 4) {
    float* lo = buf + offset;
    float* hi = lo + kHalf;
    // The pointers are plain inputs; the "memory" clobber is what tells the
    // compiler that the stores modify the buffer.
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
// Entry point for the depth-29 transform: kicks off
// helper_float_29_recursive at its top level (depth 29), which processes
// buf[0 .. 2^29).
void helper_float_29(float* buf);
void helper_float_29(float* buf) {
  enum { kTopDepth = 29 };
  helper_float_29_recursive(buf, kTopDepth);
}
// Recursive driver working on buf[0 .. 2^30).
//
//   depth == 29: hands the buffer to helper_float_29 and returns.
//   depth == 30: runs itself at depth 29 on the lower and upper halves
//                (2^29 floats each), then merges them with a 2-way butterfly.
//   any other depth: does nothing.
void helper_float_30_recursive(float* buf, int depth);
void helper_float_30_recursive(float* buf, int depth) {
  enum { kHalf = 536870912 };  // 2^29 floats, half of the buffer

  if (depth == 29) {
    helper_float_29(buf);
    return;
  }
  if (depth != 30) {
    return;
  }

  helper_float_30_recursive(buf, 29);
  helper_float_30_recursive(buf + kHalf, 29);

  // Merge pass, four floats per half per iteration:
  //   [lo] = lo + hi,  [hi] = lo - hi
  for (int offset = 0; offset < kHalf; offset += 4) {
    float* lo = buf + offset;
    float* hi = lo + kHalf;
    // The pointers are plain inputs; the "memory" clobber is what tells the
    // compiler that the stores modify the buffer.
    __asm__ volatile(
        "LD1 {v0.4S}, [%0]\n"
        "LD1 {v1.4S}, [%1]\n"
        "FADD v16.4S, v0.4S, v1.4S\n"
        "FSUB v17.4S, v0.4S, v1.4S\n"
        "ST1 {v16.4S}, [%0]\n"
        "ST1 {v17.4S}, [%1]\n"
        :
        : "r"(lo), "r"(hi)
        : "%v0", "%v1", "%v2", "%v3", "%v4", "%v5", "%v6", "%v7",
          "%v8", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14", "%v15",
          "%v16", "%v17", "%v18", "%v19", "%v20", "%v21", "%v22", "%v23",
          "%v24", "%v25", "%v26", "%v27", "%v28", "%v29", "%v30", "%v31",
          "memory");
  }
}
// Entry point for the depth-30 transform: kicks off
// helper_float_30_recursive at its top level (depth 30), which processes
// buf[0 .. 2^30).
void helper_float_30(float* buf);
void helper_float_30(float* buf) {
  enum { kTopDepth = 30 };
  helper_float_30_recursive(buf, kTopDepth);
}
// Dispatches an in-place float transform to the helper specialised for the
// requested size.
//
//   buf:   buffer handed unchanged to the selected helper.
//          NOTE(review): presumably 2^log_n floats -- confirm against fht.h.
//   log_n: selects helper_float_<log_n>; 0 selects no helper at all.
//
// Returns 0 when log_n is in [0, 30] and 1 for any other value, in which
// case buf is not touched.
int fht_float(float* buf, int log_n) {
  switch (log_n) {
    case 0:
      // Nothing to call for this size.
      break;
    case 1:
      helper_float_1(buf);
      break;
    case 2:
      helper_float_2(buf);
      break;
    case 3:
      helper_float_3(buf);
      break;
    case 4:
      helper_float_4(buf);
      break;
    case 5:
      helper_float_5(buf);
      break;
    case 6:
      helper_float_6(buf);
      break;
    case 7:
      helper_float_7(buf);
      break;
    case 8:
      helper_float_8(buf);
      break;
    case 9:
      helper_float_9(buf);
      break;
    case 10:
      helper_float_10(buf);
      break;
    case 11:
      helper_float_11(buf);
      break;
    case 12:
      helper_float_12(buf);
      break;
    case 13:
      helper_float_13(buf);
      break;
    case 14:
      helper_float_14(buf);
      break;
    case 15:
      helper_float_15(buf);
      break;
    case 16:
      helper_float_16(buf);
      break;
    case 17:
      helper_float_17(buf);
      break;
    case 18:
      helper_float_18(buf);
      break;
    case 19:
      helper_float_19(buf);
      break;
    case 20:
      helper_float_20(buf);
      break;
    case 21:
      helper_float_21(buf);
      break;
    case 22:
      helper_float_22(buf);
      break;
    case 23:
      helper_float_23(buf);
      break;
    case 24:
      helper_float_24(buf);
      break;
    case 25:
      helper_float_25(buf);
      break;
    case 26:
      helper_float_26(buf);
      break;
    case 27:
      helper_float_27(buf);
      break;
    case 28:
      helper_float_28(buf);
      break;
    case 29:
      helper_float_29(buf);
      break;
    case 30:
      helper_float_30(buf);
      break;
    default:
      // Unsupported size (negative or above 30).
      return 1;
  }
  return 0;
}
3020