xref: /aosp_15_r20/external/libxaac/decoder/armv8/ixheaacd_post_twiddle_overlap.s (revision 15dc779a375ca8b5125643b829a8aa4b70d7f451)
// push_v_regs: spill SIMD and general-purpose registers to the stack.
//
// Stores q8-q15 as four 32-byte pairs, then x8-x30 as twelve 16-byte pairs,
// each with pre-indexed writeback, so sp ends up 4*32 + 12*16 = 320 bytes
// lower. Every step is a multiple of 16, so a 16-byte aligned sp stays
// aligned throughout.
//
// The store order fixes the stack layout; pop_v_regs reloads in exactly the
// reverse order, so the two macros must be changed together.
//
// NOTE(review): AAPCS64 lists only x19-x28 (plus x29/x30) and the low 64 bits
// of v8-v15 as callee-saved; this macro saves a superset (full q registers and
// the volatile x8-x17). x18 is platform-reserved on some ABIs and is saved
// here as part of the x18/x19 pair - confirm that is acceptable for all
// targets.
1.macro push_v_regs
2    stp       q8, q9, [sp, #-32]!
3    stp       q10, q11, [sp, #-32]!
4    stp       q12, q13, [sp, #-32]!
5    stp       q14, q15, [sp, #-32]!
// Disabled alternative that would store the same v8-v15 with ST1.
// NOTE(review): presumably disabled because ST1 has no pre-indexed
// addressing form in A64 - confirm against the ISA reference.
6//st1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp, #-64]!
7//st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp, #-64]!
8    stp       X8, X9, [sp, #-16]!
9    stp       X10, X11, [sp, #-16]!
10    stp       X12, X13, [sp, #-16]!
11    stp       X14, X15, [sp, #-16]!
12  stp       X16, X17, [sp, #-16]!
13  stp       X18, X19, [sp, #-16]!
14  stp       X20, X21, [sp, #-16]!
15  stp       X22, X23, [sp, #-16]!
16  stp       X24, X25, [sp, #-16]!
17  stp       X26, X27, [sp, #-16]!
18  stp       X28, X29, [sp, #-16]!
// x29 is written a second time here (it was already stored with x28 above);
// the extra slot keeps the push a full 16-byte pair alongside x30.
19  stp       X30, X29, [sp, #-16]!
20.endm
21
// pop_v_regs: reload the registers spilled by push_v_regs.
//
// Loads x30..x8 as twelve 16-byte pairs and then q15..q8 as four 32-byte
// pairs, each with post-indexed writeback, i.e. the exact reverse of the
// store order in push_v_regs. sp is raised by 12*16 + 4*32 = 320 bytes.
//
// Must only be expanded with sp pointing at the frame push_v_regs left
// behind; any change to the order here has to be mirrored in push_v_regs.
22.macro pop_v_regs
// x29 is loaded twice (here and on the next line); the second load, from the
// slot stored together with x28, is the value that remains in x29.
23    ldp       X30, X29, [sp], #16
24    ldp       X28, X29, [sp], #16
25    ldp       X26, X27, [sp], #16
26    ldp       X24, X25, [sp], #16
27    ldp       X22, X23, [sp], #16
28    ldp       X20, X21, [sp], #16
// NOTE(review): this also rewrites x18, which is platform-reserved on some
// ABIs; it receives the value saved by push_v_regs - confirm that is
// acceptable for all targets.
29    ldp       X18, X19, [sp], #16
30    ldp       X16, X17, [sp], #16
31    ldp       X14, X15, [sp], #16
32    ldp       X12, X13, [sp], #16
33    ldp       X10, X11, [sp], #16
34    ldp       X8, X9, [sp], #16
// Disabled LD1 counterpart of the commented-out ST1 stores in push_v_regs.
35//ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
36//ld1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp], #64
37    ldp       q14, q15, [sp], #32
38    ldp       q12, q13, [sp], #32
39    ldp       q10, q11, [sp], #32
40    ldp       q8, q9, [sp], #32
41.endm
42
43
44.text
45.p2align 2
46.global ixheaacd_post_twid_overlap_add_armv8
47
48ixheaacd_post_twid_overlap_add_armv8:
49
50    // STMFD sp!, {x4-x12}
51  push_v_regs
52  //stp x19, x20,[sp,#-16]!
53    //VPUSH           {d8 - d15}
54
55    //LDR w4,  [sp, #100]
56  //sxtw x4,w4
57    //LDR w5,  [sp, #104]
58  //sxtw x5,w5
59    //LDR w6,  [sp, #108]
60  //sxtw x6,w6
61  MOV        x16, x5
62  MOV        x17, x7
63    LSL             x9, x3, #2
64    ASR             x9, x9, #1
65    ADD             x6, x6, x9
66    SUB             x6, x6, #4
67
68    MOV             w8, #7500
69    sxtw            x8, w8
70    ADD             x2, x2, x8
71
72
73
74    movi  v18.4h, #50
75  sub x20, x5, #15
76  neg x9, x20
77    movi  v20.4s, #0x00, LSL #8
78    dup  v16.4s,w5
79    SUB             x5, x5, #16
80    //STR w5,  [sp, #116]
81  MOV        w25, w5
82  sxtw x25,w25
83    MOV             x8, #1
84    LSL             x8, x8, x9
85    //STR w8,  [sp, #120]
86  MOV        w26, w8
87
88  //sxtw x8,w8
89
90
91ARM_PROLOGUE:
92
93
94    LDR w8,  [x1], #4
95  sxtw x8,w8
96    LDR w9,  [x1], #4
97  sxtw x9,w9
98
99    LDR w10,  [x2], #4
100  sxtw x10,w10
101
102  AND        w19,w10,0xFFFF
103  sxth      x19,w19
104  ASR        w10,w10,#16
105//    SMULWT          x11, x8, x10
106//
107//  SMULWB          x12, x9, x10
108//    SMULWB          x5, x8, x10
109//    SMLAWT          x7, x9, x10, x5
110
111  SMULL          x11, w8, w10
112  ASR        x11,x11,#16
113  SMULL          x12, w9, w19
114  ASR        x12,x12,#16
115    SMULL          x5, w8, w19
116  ASR        x5,x5,#16
117    SMULL      x7, w9, w10
118  ASR        x7, x7, #16
119  ADD        x7, x7, x5
120
121  SUB             x8, x12, x11
122    MVN             x5, x7
123    ADD             x5, x5, #1
124
125
126    MOV             x9, #50
127    MOV             x12, #-50
128    AND        w19,w9,0xFFFF
129  sxth      x19,w19
130  SMULL          x10, w5, w19
131    ASR        x10,x10,#16
132  AND        w19,w12,0xFFFF
133  sxth      x19,w19
134  SMULL          x11, w8, w19
135  ASR        x11,x11,#16
136
137  ADD             x8, x8, x10
138    ADD             x5, x5, x11
139
140    //LDR w11,  [sp, #104]
141  MOV    w11, w16
142  sxth x11,w11
143    LDR w10,  [x6], #-32
144  sxtw x10,w10
145
146  AND        w19,w10,0xFFFF
147  sxth      x19,w19
148  ASR        w20,w10,#16
149
150    //SMULWB          x7, x8, x10
151    SMULL      x7, w8, w19
152  ASR        x7, x7, #16
153  MVN             x8, x8
154    ADD             x8, x8, #1
155    //SMULWT          x12, x8, x10
156  SMULL      x12, w8, w20
157  ASR        x12, x12, #16
158
159    CMP             x11, #0
160    BLT             NEXT
161
162    SUB            x9, x11, #16
163  negs      x9,x9
164
165
166
167
168   // LDR w8,  [sp, #120]
169  //sxtw x8,w8
170  MOV      v1.s[0], w26
171  MOV      v2.s[0], w5
172
173    //sQADD            w5, w5, w8
174    //ASR             w5, w5, w9
175
176  SQADD      v2.2s, v2.2s, v1.2s
177  MOV        w5, v2.s[0]
178  ASR             w5, w5, w9
179
180    SUB            x9, x11, #31
181  negs      x9,x9
182  ASR        x20, x7, x9
183    //MOV            x8, x20
184  ADDS      x8, x20, #0
185  BGE        NEXT2
186    CMN           x8, #1
187NEXT2:
188    MOV             x20, #0x80000000
189   csel x7, x20, x7,LT
190    MOV             x20, #0x7fffffff
191   csel x7, x20, x7,GT
192    LSL x20, x7,  x11
193  csel x7,x20,x7,EQ
194
195    SUB            x9, x11, #31
196  negs      x9,x9
197  ASR        x20, x12, x9
198    //MOV            x8, x20
199  ADDS      x8, x20, #0
200  BGE        NEXT3
201  CMN           x8, #1
202NEXT3:
203    MOV             x20, #0x80000000
204   csel x12, x20, x12,LT
205    MOV             x20, #0x7fffffff
206   csel x12, x20, x12,GT
207    LSL x20, x12,  x11
208  csel x12,x20,x12,EQ
209
210    B               NEXT1
211NEXT:
212    MVN             w11, w11
213    ADD             w11, w11, #1
214    ASR             w5, w5, w11
215    MOV             w8, #0x8000
216
217  MOV      v1.s[0], w8
218  MOV      v2.s[0], w5
219
220    //QADD            x5, x5, x8
221
222  SQADD      v2.2s, v2.2s, v1.2s
223  MOV        w5, v2.s[0]
224
225  ASR             w5, w5, #16
226    ASR             w7, w7, w11
227    ASR             w12, w12, w11
228
229NEXT1:
230    LDR w9,  [x4]
231  sxtw x9,w9
232    MOV             w8, #0x8000
233  //sxtw x8,w8
234
235    STR w5,  [x4], #4
236  sxtw x5,w5
237
238
239  ROR        w20, w10, #16
240    //UXTH            x5, x10, ROR #16
241    UXTH      w5, w20
242  UXTH            w10, w10
243
244
245    dup  v0.2s,w9
246    dup  v2.2s,w10
247    dup  v3.2s,w5
248    //VZIP.32         D2, D3
249    ZIP1    v28.2s, v2.2s, v3.2s
250  ZIP2    v3.2s, v2.2s, v3.2s
251  MOV      v2.8b, v28.8b
252  sMULL v0.2d, v2.2s, v0.2s
253    Sqxtn v8.2s,  v0.2d
254
255
256    dup  v0.2s,w12
257    dup  v1.2s,w7
258
259    //VZIP.32         D0, D1
260
261    ZIP1    v28.2s, v0.2s, v1.2s
262  ZIP2    v1.2s, v0.2s, v1.2s
263  MOV      v0.8b, v28.8b
264
265    SQSUB  v8.2s,  v0.2s ,  v8.2s
266
267
268    //sQshL v8.2s, v8.2s,#2
269    dup  v0.2s,w8
270    //SQADD  v8.2s,  v8.2s ,  v0.2s
271    //sshR v8.2s, v8.2s,#16
272
273
274
275    MOV x7,  x17
276  //sxtw x7,w7
277    LSL             x10, x7, #2
278
279    ASR             x5, x3, #1
280    //SMULBB          x5, x10, x5
281    AND        w5,w5,0xFFFF
282    sxth      x5,w5
283  AND        w19,w10,0xFFFF
284  sxth      x19,w19
285  SMULL      x5, w19, w5
286
287    ADD             x5, x5, x0
288    SUB             x0, x5, x10
289    MVN             x9, x10
290    ADD             x9, x9, #1
291
292    ST1 {V8.S}[1],[x0], x9
293    ST1 {V8.S}[0],[x5], x10
294
295
296    MOV             x8, x1
297    LSL             x12, x3, #2
298
299    ADD             x1, x1, x12
300
301    SUB             x1, x1, #40
302
303    MOV             x12, #-32
304
305
306
307PROLOGUE_NEON:
308
309    ASR             x3, x3, #2
310    SUB             x3, x3, #4
311    ASR             x3, x3, #2
312    SUB             x3, x3, #2
313
314    LD2 { v0.4s, v1.4s}, [x1]
315  MOV        v2.16b, v1.16b
316  ADD       x1, x1, x12
317
318    //VUZP.16         D0, D1
319    UZP1      v28.8h, v0.8h, v0.8h
320  UZP2      v29.8h, v0.8h, v0.8h
321  MOV        v0.d[0], v28.d[0]
322  MOV        v0.d[1], v29.d[0]
323
324  //VUZP.16         D2, D3
325
326  UZP1      v28.8h, v2.8h, v2.8h
327  UZP2      v29.8h, v2.8h, v2.8h
328  MOV        v2.d[0], v28.d[0]
329  MOV        v2.d[1], v29.d[0]
330
331
332    //rev64  v0.8h,  v0.8h
333  rev64  v0.8h,  v0.8h
334  MOV        v1.d[0], v0.d[1]
335    rev64  v2.8h,  v2.8h
336  MOV        v3.d[0], v2.d[1]
337    LD2 {v8.4h, v9.4h}, [x2]
338  ADD      x2, x2, #16
339
340    LD2 { v4.4s, v5.4s}, [x8]
341  MOV      v6.16b, v5.16b
342  ADD x8, x8,#32
343    uMULL v30.4s, v0.4h, v9.4h
344
345//    VUZP.16         D4, D5
346
347  UZP1      v28.8h, v4.8h, v4.8h
348  UZP2      v29.8h, v4.8h, v4.8h
349  MOV        v4.d[0], v28.d[0]
350  MOV        v5.d[0], v29.d[0]
351
352    uMULL v28.4s, v2.4h, v8.4h
353
354//    VUZP.16         D6, D7
355  UZP1      v26.8h, v6.8h, v6.8h
356  UZP2      v27.8h, v6.8h, v6.8h
357  MOV        v6.d[0], v26.d[0]
358  MOV        v7.d[0], v27.d[0]
359
360  uMULL v26.4s, v0.4h, v8.4h
361
362
363    uMULL v24.4s, v2.4h, v9.4h
364
365    LD2 { v10.4s, v11.4s}, [x6]
366    MOV      v12.16b, v11.16b
367  ADD  x6, x6, x12
368    ushR v30.4s, v30.4s,#16
369
370    //VUZP.16         D10, D11
371
372  UZP1      v22.8h, v10.8h, v10.8h
373  UZP2      v23.8h, v10.8h, v10.8h
374  MOV        v10.d[0], v22.d[0]
375  MOV        v10.d[1], v23.d[0]
376
377  ushR v28.4s, v28.4s,#16
378
379    //VUZP.16         D12, D13
380
381  UZP1      v22.8h, v12.8h, v12.8h
382  UZP2      v23.8h, v12.8h, v12.8h
383  MOV        v12.d[0], v22.d[0]
384  MOV        v12.d[1], v23.d[0]
385
386  sMLAL v30.4s, v1.4h, v9.4h
387
388    rev64  v10.8h,  v10.8h
389  MOV    v11.d[0], v10.d[1]
390    sMLAL v28.4s, v3.4h, v8.4h
391
392    rev64  v12.8h,  v12.8h
393  MOV    v13.d[0], v12.d[1]
394    ushR v26.4s, v26.4s,#16
395
396
397    ushR v24.4s, v24.4s,#16
398
399    sMLAL v26.4s, v1.4h, v8.4h
400    sMLAL v24.4s, v3.4h, v9.4h
401
402
403
404    ADD  v30.4s,  v30.4s ,  v28.4s
405    NEG  v30.4s, v30.4s
406
407    uMULL v22.4s, v4.4h, v8.4h
408
409    SUB  v28.4s,  v24.4s ,  v26.4s
410
411
412    mov  v26.16b, v30.16b
413    mov  v24.16b, v28.16b
414
415//    VUZP.16         D24, D25
416
417  UZP1      v19.8h, v24.8h, v24.8h
418  UZP2      v21.8h, v24.8h, v24.8h
419  MOV        v24.d[0], v19.d[0]
420  MOV        v25.d[0], v21.d[0]
421
422
423//    VUZP.16         D26, D27
424
425  UZP1      v19.8h, v26.8h, v26.8h
426  UZP2      v21.8h, v26.8h, v26.8h
427  MOV        v26.d[0], v19.d[0]
428  MOV        v27.d[0], v21.d[0]
429
430    uMULL v2.4s, v24.4h, v18.4h
431
432    uMULL v0.4s, v26.4h, v18.4h
433
434    ushR v22.4s, v22.4s,#16
435    sMLAL v22.4s, v5.4h, v8.4h
436
437    ushR v2.4s, v2.4s,#16
438    ushR v0.4s, v0.4s,#16
439    sMLAL v2.4s, v25.4h, v18.4h
440    sMLAL v0.4s, v27.4h, v18.4h
441
442    uMULL v24.4s, v4.4h, v9.4h
443    uMULL v26.4s, v6.4h, v8.4h
444
445    NEG  v2.4s, v2.4s
446    ADD  v28.4s,  v28.4s ,  v0.4s
447    ADD  v30.4s,  v30.4s ,  v2.4s
448
449    uMULL v0.4s, v6.4h, v9.4h
450    sshR v24.4s, v24.4s,#16
451    sMLAL v24.4s, v5.4h, v9.4h
452    sshR v26.4s, v26.4s,#16
453    sshR v0.4s, v0.4s,#16
454    sMLAL v26.4s, v7.4h, v8.4h
455    sMLAL v0.4s, v7.4h, v9.4h
456
457
458
459
460    ADD  v22.4s,  v22.4s ,  v0.4s
461    NEG  v22.4s, v22.4s
462    SUB  v24.4s,  v26.4s ,  v24.4s
463
464
465
466    //LDR w11,  [sp, #120]
467  //sxtw x11,w11
468  MOV      w11, w26
469    dup  v14.4s,w11
470    SQADD  v28.4s,  v28.4s ,  v14.4s
471    //LDR w11,  [sp, #116]
472  MOV    w11, w25
473  //sxtw x11,w11
474    dup  v0.4s,w11
475    sQshL v28.4s, v28.4s, v0.4s
476
477    mov  v0.16b, v22.16b
478    mov  v14.16b, v24.16b
479
480
481//    VUZP.16         D24, D25
482
483  UZP1      v19.8h, v24.8h, v24.8h
484  UZP2      v21.8h, v24.8h, v24.8h
485  MOV        v24.d[0], v19.d[0]
486  MOV        v25.d[0], v21.d[0]
487
488
489//    VUZP.16         D22, D23
490
491  UZP1      v19.8h, v22.8h, v22.8h
492  UZP2      v21.8h, v22.8h, v22.8h
493  MOV        v22.d[0], v19.d[0]
494  MOV        v23.d[0], v21.d[0]
495
496  uMULL v8.4s, v24.4h, v18.4h
497    uMULL v26.4s, v22.4h, v18.4h
498
499    NEG  v2.4s, v30.4s
500//    VUZP.16         D30, D31
501
502  UZP1      v19.8h, v30.8h, v30.8h
503  UZP2      v21.8h, v30.8h, v30.8h
504  MOV        v30.d[0], v19.d[0]
505  MOV        v30.d[1], v21.d[0]
506
507//    VUZP.16         D2, D3
508
509  UZP1      v19.8h, v2.8h, v2.8h
510  UZP2      v21.8h, v2.8h, v2.8h
511  MOV        v2.d[0], v19.d[0]
512  MOV        v3.d[0], v21.d[0]
513
514    uMULL v4.4s, v30.4h, v12.4h
515
516    uMULL v6.4s, v2.4h, v13.4h
517
518    ushR v8.4s, v8.4s,#16
519    ushR v26.4s, v26.4s,#16
520
521    sMLAL v8.4s, v25.4h, v18.4h
522    sMLAL v26.4s, v23.4h, v18.4h
523
524    ushR v4.4s, v4.4s,#16
525    ushR v6.4s, v6.4s,#16
526
527  MOV  v19.d[0], v30.d[1]
528
529    sMLAL v4.4s, v19.4h, v12.4h
530    sMLAL v6.4s, v3.4h, v13.4h
531
532    NEG  v8.4s, v8.4s
533    ADD  v14.4s,  v14.4s ,  v26.4s
534    ADD  v0.4s,  v0.4s ,  v8.4s
535
536    //LDR w11,  [sp, #120]
537  //sxtw x11,w11
538  MOV      w11, w26
539    dup  v8.4s,w11
540    SQADD  v0.4s,  v0.4s ,  v8.4s
541    //LDR w11,  [sp, #116]
542  //sxtw x11,w11
543    MOV    w11, w25
544  dup  v26.4s,w11
545    sQshL v0.4s, v0.4s, v26.4s
546
547    mov  v26.16b, v28.16b
548
549    LD2 { v28.4s, v29.4s}, [x4]
550    MOV      v30.16b, v29.16b
551    MOV      v29.d[0], v28.d[1]
552 //   VZIP.32         Q13, Q0
553
554    ZIP1    v19.4s, v26.4s, v0.4s
555  ZIP2    v0.4s, v26.4s, v0.4s
556  MOV      v26.16b, v19.16b
557
558    ST1 { v26.4s}, [x4],#16
559    ST1 { v0.4s}, [x4],#16
560
561    movi  v1.2s, #0
562    //VADDL.S16       Q0, D13, D1
563
564  SADDL       v0.4s, v13.4h, v1.4h
565  MOV      v1.d[0], v0.d[1]
566    sMULL v26.2d, v28.2s, v0.2s
567    Sqxtn v8.2s,  v26.2d
568    sMULL v26.2d, v29.2s, v1.2s
569    Sqxtn v9.2s,  v26.2d
570  MOV    v8.d[1], v9.d[0]
571    movi  v1.2s, #0
572//    VADDL.S16       Q0, D12, D1
573  SADDL       v0.4s, v12.4h, v1.4h
574  MOV      v1.d[0], v0.d[1]
575    sMULL v24.2d, v28.2s, v0.2s
576    Sqxtn v26.2s,  v24.2d
577    sMULL v24.2d, v29.2s, v1.2s
578    Sqxtn v27.2s,  v24.2d
579  MOV    v26.d[1], v27.d[0]
580
581    sQshL v4.4s, v4.4s, v16.4s
582    sQshL v6.4s, v6.4s, v16.4s
583
584    SQSUB  v4.4s,  v4.4s ,  v8.4s
585    SQSUB  v6.4s,  v6.4s ,  v26.4s
586
587    NEG  v26.4s, v14.4s
588//    VUZP.16         D14, D15
589
590
591  UZP1      v19.8h, v14.8h, v14.8h
592  UZP2      v21.8h, v14.8h, v14.8h
593  MOV        v14.d[0], v19.d[0]
594  MOV        v15.d[0], v21.d[0]
595
596//    VUZP.16         D26, D27
597
598
599  UZP1      v19.8h, v26.8h, v26.8h
600  UZP2      v21.8h, v26.8h, v26.8h
601  MOV        v26.d[0], v19.d[0]
602  MOV        v27.d[0], v21.d[0]
603
604
605    movi  v1.2s, #0
606//    VADDL.S16       Q0, D10, D1
607   SADDL       v0.4s, v10.4h, v1.4h
608  MOV      v1.d[0], v0.d[0]
609  sMULL v22.2d, v30.2s, v0.2s
610    Sqxtn v24.2s,  v22.2d
611    sMULL2 v22.2d, v30.4s, v0.4s
612    Sqxtn v25.2s,  v22.2d
613  MOV    v24.d[1], v25.d[0]
614    movi  v1.2s, #0
615//    VADDL.S16       Q0, D11, D1
616  SADDL       v0.4s, v11.4h, v1.4h
617  MOV      v1.d[0], v0.d[1]
618
619  sMULL v8.2d, v30.2s, v0.2s
620    Sqxtn v22.2s,  v8.2d
621    sMULL2 v8.2d, v30.4s, v0.4s
622    Sqxtn v23.2s,  v8.2d
623  MOV    v22.d[1], v23.d[0]
624    uMULL v8.4s, v26.4h, v11.4h
625    uMULL v30.4s, v14.4h, v10.4h
626
627    LD2 { v0.4s, v1.4s}, [x1]
628  MOV  v2.16b, v1.16b
629  ADD x1, x1, x12
630
631//    VUZP.16         D0, D1
632
633  UZP1      v19.8h, v0.8h, v0.8h
634  UZP2      v21.8h, v0.8h, v0.8h
635  MOV        v0.d[0], v19.d[0]
636  MOV        v0.d[1], v21.d[0]
637
638//    VUZP.16         D2, D3
639
640  UZP1      v19.8h, v2.8h, v2.8h
641  UZP2      v21.8h, v2.8h, v2.8h
642  MOV        v2.d[0], v19.d[0]
643  MOV        v2.d[1], v21.d[0]
644
645    ushR v8.4s, v8.4s,#16
646
647    rev64  v0.8h,  v0.8h
648  MOV    v1.d[0], v0.d[1]
649    ushR v30.4s, v30.4s,#16
650
651    rev64  v2.8h,  v2.8h
652  MOV    v3.d[0], v2.d[1]
653    sMLAL v8.4s, v27.4h, v11.4h
654
655    sMLAL v30.4s, v15.4h, v10.4h
656
657    LD2 { v10.4s, v11.4s}, [x6]
658  ADD  x6, x6, x12
659  MOV  v12.16b, v11.16b
660
661  UZP1      v19.8h, v10.8h, v10.8h
662  UZP2      v21.8h, v10.8h, v10.8h
663  MOV        v10.d[0], v19.d[0]
664  MOV        v10.d[1], v21.d[0]
665
666
667 UZP1   v19.8h, v12.8h, v12.8h
668 UZP2   v21.8h, v12.8h, v12.8h
669 MOV    v12.d[0], v19.d[0]
670 MOV    v12.d[1], v21.d[0]
671  MOV V14.16B , V4.16B
672
673    rev64  v10.8h,  v10.8h
674  MOV    v11.d[0], v10.d[1]
675
676
677    rev64  v12.8h,  v12.8h
678  MOV   v13.d[0], v12.d[1]
679
680    sQshL v8.4s, v8.4s, v16.4s
681
682    MOV V31.16B, V6.16B
683    LD2 { v4.4s, v5.4s}, [x8]
684  ADD  x8, x8,#32
685
686  MOV  v6.16b, v5.16b
687    sQshL v30.4s, v30.4s, v16.4s
688
689//    VUZP.16         D4, D5
690
691  UZP1      v19.8h, v4.8h, v4.8h
692  UZP2      v21.8h, v4.8h, v4.8h
693  MOV        v4.d[0], v19.d[0]
694  MOV        v5.d[0], v21.d[0]
695
696    SQSUB  v8.4s,  v8.4s ,  v24.4s
697
698//    VUZP.16         D6, D7
699
700    UZP1      v19.8h, v6.8h, v6.8h
701  UZP2      v21.8h, v6.8h, v6.8h
702  MOV        v6.d[0], v19.d[0]
703  MOV        v7.d[0], v21.d[0]
704
705  SQSUB  v22.4s,  v30.4s ,  v22.4s
706
707
708     MOV V30.16B, V8.16B
709
710    LD2 {v8.4h, v9.4h}, [x2]
711  ADD  x2, x2, #16
712
713
714CORE_LOOP:
715    ST1 {V14.S}[0], [x0]
716  ADD  x0, x0, x9
717    ST1 {V22.S}[0], [x0]
718  ADD  x0, x0,  x9
719
720
721    ST1 {V14.S}[1], [x0]
722  ADD  x0, x0, x9
723
724
725    ST1 {V22.S}[1], [x0]
726  ADD  x0, x0,  x9
727
728
729    ST1 {V14.S}[2], [x0]
730    ADD  x0, x0,  x9
731
732
733    ST1 {V22.S}[2], [x0]
734    ADD  x0, x0,  x9
735
736
737    ST1 {V14.S}[3], [x0]
738    ADD  x0, x0,  x9
739
740
741    ST1 {V22.S}[3], [x0]
742    ADD  x0, x0,  x9
743
744
745    ST1 {V31.S}[0], [x5]
746    ADD  x5, x5,  x10
747
748
749    ST1 {V30.S}[0], [x5]
750  ADD  x5, x5,  x10
751
752
753    ST1 {V31.S}[1], [x5]
754  ADD  x5, x5,  x10
755
756
757    ST1 {V30.S}[1], [x5]
758  ADD  x5, x5,  x10
759
760
761    ST1 {V31.S}[2], [x5]
762  ADD  x5, x5,  x10
763
764
765    ST1 {V30.S}[2], [x5]
766  ADD  x5, x5,  x10
767
768
769    ST1 {V31.S}[3], [x5]
770  ADD  x5, x5,  x10
771
772    ST1 {V30.S}[3], [x5]
773  ADD  x5, x5,  x10
774
775
776    uMULL v30.4s, v0.4h, v9.4h
777    uMULL v28.4s, v2.4h, v8.4h
778    uMULL v26.4s, v0.4h, v8.4h
779    uMULL v24.4s, v2.4h, v9.4h
780    ushR v30.4s, v30.4s,#16
781    ushR v28.4s, v28.4s,#16
782    sMLAL v30.4s, v1.4h, v9.4h
783    sMLAL v28.4s, v3.4h, v8.4h
784    ushR v26.4s, v26.4s,#16
785    ushR v24.4s, v24.4s,#16
786    sMLAL v26.4s, v1.4h, v8.4h
787    sMLAL v24.4s, v3.4h, v9.4h
788    ADD  v30.4s,  v30.4s ,  v28.4s
789    NEG  v30.4s, v30.4s
790    SUB  v28.4s,  v24.4s ,  v26.4s
791
792    mov  v26.16b, v30.16b
793    uMULL v22.4s, v4.4h, v8.4h
794
795    mov  v24.16b, v28.16b
796
797//    VUZP.16         D24, D25
798
799  UZP1      v19.8h, v24.8h, v24.8h
800  UZP2      v21.8h, v24.8h, v24.8h
801  MOV        v24.d[0], v19.d[0]
802  MOV        v25.d[0], v21.d[0]
803
804
805//    VUZP.16         D26, D27
806
807  UZP1      v19.8h, v26.8h, v26.8h
808  UZP2      v21.8h, v26.8h, v26.8h
809  MOV        v26.d[0], v19.d[0]
810  MOV        v27.d[0], v21.d[0]
811
812  uMULL v2.4s, v24.4h, v18.4h
813    uMULL v0.4s, v26.4h, v18.4h
814
815    ushR v22.4s, v22.4s,#16
816    sMLAL v22.4s, v5.4h, v8.4h
817
818    ushR v2.4s, v2.4s,#16
819    ushR v0.4s, v0.4s,#16
820    sMLAL v2.4s, v25.4h, v18.4h
821    sMLAL v0.4s, v27.4h, v18.4h
822
823    uMULL v24.4s, v4.4h, v9.4h
824    uMULL v26.4s, v6.4h, v8.4h
825
826    NEG  v2.4s, v2.4s
827    ADD  v28.4s,  v28.4s ,  v0.4s
828    ADD  v30.4s,  v30.4s ,  v2.4s
829
830    uMULL v0.4s, v6.4h, v9.4h
831    sshR v24.4s, v24.4s,#16
832    sMLAL v24.4s, v5.4h, v9.4h
833    sshR v26.4s, v26.4s,#16
834    sshR v0.4s, v0.4s,#16
835    sMLAL v26.4s, v7.4h, v8.4h
836    sMLAL v0.4s, v7.4h, v9.4h
837
838
839
840    ADD  v22.4s,  v22.4s ,  v0.4s
841
842    NEG  v22.4s, v22.4s
843    SUB  v24.4s,  v26.4s ,  v24.4s
844
845
846    //LDR w11,  [sp, #120]
847  //sxtw x11,w11
848  MOV      w11, w26
849    dup  v14.4s,w11
850    SQADD  v28.4s,  v28.4s ,  v14.4s
851    //LDR w11,  [sp, #116]
852  //sxtw x11,w11
853  MOV    w11, w25
854    dup  v0.4s,w11
855    sQshL v28.4s, v28.4s, v0.4s
856
857
858    mov  v0.16b, v22.16b
859    mov  v14.16b, v24.16b
860
861//    VUZP.16         D24, D25
862
863  UZP1      v19.8h, v24.8h, v24.8h
864  UZP2      v21.8h, v24.8h, v24.8h
865  MOV        v24.d[0], v19.d[0]
866  MOV        v25.d[0], v21.d[0]
867
868
869//    VUZP.16         D22, D23
870
871  UZP1      v19.8h, v22.8h, v22.8h
872  UZP2      v21.8h, v22.8h, v22.8h
873  MOV        v22.d[0], v19.d[0]
874  MOV        v23.d[0], v21.d[0]
875
876  uMULL v8.4s, v24.4h, v18.4h
877    uMULL v26.4s, v22.4h, v18.4h
878
879    NEG  v2.4s, v30.4s
880
881//    VUZP.16         D30, D31
882
883  UZP1      v19.8h, v30.8h, v30.8h
884  UZP2      v21.8h, v30.8h, v30.8h
885  MOV        v30.d[0], v19.d[0]
886  MOV        v30.d[1], v21.d[0]
887
888
889//    VUZP.16         D2, D3
890
891  UZP1      v19.8h, v2.8h, v2.8h
892  UZP2      v21.8h, v2.8h, v2.8h
893  MOV        v2.d[0], v19.d[0]
894  MOV        v3.d[0], v21.d[0]
895
896  uMULL v4.4s, v30.4h, v12.4h
897    uMULL v6.4s, v2.4h, v13.4h
898
899    ushR v8.4s, v8.4s,#16
900    ushR v26.4s, v26.4s,#16
901
902    sMLAL v8.4s, v25.4h, v18.4h
903    sMLAL v26.4s, v23.4h, v18.4h
904
905    ushR v4.4s, v4.4s,#16
906    ushR v6.4s, v6.4s,#16
907
908  MOV  v19.d[0], v30.d[1]
909
910    sMLAL v4.4s, v19.4h, v12.4h
911    sMLAL v6.4s, v3.4h, v13.4h
912
913    NEG  v8.4s, v8.4s
914    ADD  v14.4s,  v14.4s ,  v26.4s
915    ADD  v0.4s,  v0.4s ,  v8.4s
916
917
918
919    //LDR w11,  [sp, #120]
920  //sxtw x11,w11
921  MOV      w11, w26
922    dup  v8.4s,w11
923    SQADD  v0.4s,  v0.4s ,  v8.4s
924    //LDR w11,  [sp, #116]
925  //sxtw x11,w11
926    MOV    w11, w25
927  dup  v26.4s,w11
928    sQshL v0.4s, v0.4s, v26.4s
929    mov  v26.16b, v28.16b
930
931    LD2 { v28.4s, v29.4s}, [x4]
932  MOV  v30.16b, v29.16b
933  MOV  v29.d[0], v28.d[1]
934//    VZIP.32         Q13, Q0
935
936    ZIP1    v19.4s, v26.4s, v0.4s
937  ZIP2    v0.4s, v26.4s, v0.4s
938  MOV      v26.16b, v19.16b
939
940    ST1 { v26.4s}, [x4]
941  ADD x4, x4,#16
942    ST1 { v0.4s}, [x4]
943  ADD  x4, x4,#16
944
945    movi  v1.2s, #0
946//    VADDL.S16       Q0, D13, D1
947  SADDL       v0.4s, v13.4h, v1.4h
948  MOV      v1.d[0], v0.d[1]
949
950    sMULL v26.2d, v28.2s, v0.2s
951    Sqxtn v8.2s,  v26.2d
952    sMULL v26.2d, v29.2s, v1.2s
953    Sqxtn v9.2s,  v26.2d
954  MOV    v8.d[1], v9.d[0]
955    movi  v1.2s, #0
956    //VADDL.S16       Q0, D12, D1
957  SADDL       v0.4s, v12.4h, v1.4h
958  MOV      v1.d[0], v0.d[1]
959
960    sMULL v24.2d, v28.2s, v0.2s
961    Sqxtn v26.2s,  v24.2d
962    sMULL v24.2d, v29.2s, v1.2s
963    Sqxtn v27.2s,  v24.2d
964  MOV    v26.d[1], v27.d[0]
965    sQshL v4.4s, v4.4s, v16.4s
966    sQshL v6.4s, v6.4s, v16.4s
967
968
969
970    SQSUB  v4.4s,  v4.4s ,  v8.4s
971    SQSUB  v6.4s,  v6.4s ,  v26.4s
972
973    NEG  v26.4s, v14.4s
974//    VUZP.16         D26, D27
975  UZP1      v19.8h, v26.8h, v26.8h
976  UZP2      v21.8h, v26.8h, v26.8h
977  MOV        v26.d[0], v19.d[0]
978  MOV        v27.d[0], v21.d[0]
979
980    movi  v1.2s, #0
981   //VADDL.S16       Q0, D10, D1
982      SADDL       v0.4s, v10.4h, v1.4h
983  MOV      v1.d[0], v0.d[1]
984
985  sMULL v22.2d, v30.2s, v0.2s
986    Sqxtn v24.2s,  v22.2d
987    sMULL2 v22.2d, v30.4s, v0.4s
988    Sqxtn v25.2s,  v22.2d
989  MOV    v24.d[1], v25.d[0]
990    movi  v1.2s, #0
991    //VADDL.S16       Q0, D11, D1
992  SADDL       v0.4s, v11.4h, v1.4h
993
994    sMULL v8.2d, v30.2s, v0.2s
995    Sqxtn v22.2s,  v8.2d
996    sMULL2 v8.2d, v30.4s, v0.4s
997    Sqxtn v23.2s,  v8.2d
998  MOV    v22.d[1], v23.d[0]
999
1000//    VUZP.16         D14, D15
1001
1002   UZP1      v19.8h, v14.8h, v14.8h
1003  UZP2      v21.8h, v14.8h, v14.8h
1004  MOV        v14.d[0], v19.d[0]
1005  MOV        v15.d[0], v21.d[0]
1006
1007  uMULL v8.4s, v26.4h, v11.4h
1008    uMULL v30.4s, v14.4h, v10.4h
1009
1010
1011    LD2 { v0.4s, v1.4s}, [x1]
1012  MOV v2.16b, v1.16b
1013  ADD  X1, X1, x12
1014
1015//    VUZP.16         D0, D1
1016  UZP1      v19.8h, v0.8h, v0.8h
1017  UZP2      v21.8h, v0.8h, v0.8h
1018  MOV        v0.d[0], v19.d[0]
1019  MOV        v0.d[1], v21.d[0]
1020
1021//    VUZP.16         D2, D3
1022
1023   UZP1      v19.8h, v2.8h, v2.8h
1024  UZP2      v21.8h, v2.8h, v2.8h
1025  MOV        v2.d[0], v19.d[0]
1026  MOV        v2.d[1], v21.d[0]
1027
1028  ushR v8.4s, v8.4s,#16
1029
1030    rev64  v0.8h,  v0.8h
1031  MOV  v1.d[0], v0.d[1]
1032    ushR v30.4s, v30.4s,#16
1033
1034    rev64  v2.8h,  v2.8h
1035  MOV  v3.d[0], v2.d[1]
1036    sMLAL v8.4s, v27.4h, v11.4h
1037
1038    sMLAL v30.4s, v15.4h, v10.4h
1039
1040    LD2 { v10.4s, v11.4s}, [x6]
1041  add  X6, x6, x12
1042  MOV  v12.16b, v11.16b
1043
1044
1045    //VUZP.16         D10, D11
1046
1047     UZP1      v19.8h, v10.8h, v10.8h
1048  UZP2      v21.8h, v10.8h, v10.8h
1049  MOV        v10.d[0], v19.d[0]
1050  MOV        v10.d[1], v21.d[0]
1051
1052
1053
1054//    VUZP.16         D12, D13
1055
1056   UZP1      v19.8h, v12.8h, v12.8h
1057  UZP2      v21.8h, v12.8h, v12.8h
1058        MOV        v12.d[0], v19.d[0]
1059  MOV        v12.d[1], v21.d[0]
1060
1061
1062
1063      MOV  V14.16B, V4.16B
1064
1065    rev64  v10.8h,  v10.8h
1066  MOV        v11.d[0], v10.d[1]
1067
1068
1069    rev64  v12.8h,  v12.8h
1070  MOV        v13.d[0], v12.d[1]
1071
1072    sQshL v8.4s, v8.4s, v16.4s
1073
1074    LD2 { v4.4s, v5.4s}, [x8]
1075  ADD  x8, x8, #32
1076
1077    MOV V31.16B, V6.16B
1078    MOV  v6.16b, v5.16b
1079
1080  sQshL v30.4s, v30.4s, v16.4s
1081
1082
1083   UZP1      v19.8h, v4.8h, v4.8h
1084  UZP2      v21.8h, v4.8h, v4.8h
1085  MOV        v4.d[0], v19.d[0]
1086  MOV        v5.d[0], v21.d[0]
1087
1088
1089  SQSUB  v8.4s,  v8.4s ,  v24.4s
1090
1091//    VUZP.16         D6, D7
1092
1093     UZP1      v19.8h, v6.8h, v6.8h
1094  UZP2      v21.8h, v6.8h, v6.8h
1095  MOV        v6.d[0], v19.d[0]
1096  MOV        v7.d[0], v21.d[0]
1097
1098  SQSUB  v22.4s,  v30.4s ,  v22.4s
1099
1100    MOV V30.16B , V8.16B
1101
1102    LD2 {v8.4h, v9.4h}, [x2]
1103  ADD x2, x2,#16
1104
1105
1106
1107
1108    SUBS            x3, x3, #1
1109    BNE             CORE_LOOP
1110
1111
1112
1113
1114
1115EPILOGUE:
1116
1117    ST1 {V14.S}[0],[x0]
1118  ADD  x0, x0, x9
1119
1120
1121    ST1 {V22.S}[0],[x0]
1122  ADD  x0, x0, x9
1123
1124
1125    ST1 {V14.S}[1],[x0]
1126  ADD  x0, x0, x9
1127
1128
1129    ST1 {V22.S}[1],[x0]
1130  ADD  x0, x0, x9
1131
1132
1133    ST1 {V14.S}[2],[x0]
1134  ADD  x0, x0, x9
1135
1136
1137    ST1 {V22.S}[2],[x0]
1138  ADD  x0, x0, x9
1139
1140
1141    ST1 {V14.S}[3],[x0]
1142  ADD  x0, x0, x9
1143
1144
1145    ST1 {V22.S}[3],[x0]
1146  ADD  x0, x0, x9
1147
1148
1149    ST1 {V31.S}[0],[x5]
1150  ADD  x5, x5, x10
1151
1152
1153    ST1 {V30.S}[0],[x5]
1154  ADD  x5, x5, x10
1155
1156
1157    ST1 {V31.S}[1],[x5]
1158  ADD  x5, x5, x10
1159
1160
1161    ST1 {V30.S}[1],[x5]
1162  ADD  x5, x5, x10
1163
1164
1165    ST1 {V31.S}[2],[x5]
1166  ADD  x5, x5, x10
1167
1168
1169    ST1 {V30.S}[2],[x5]
1170  ADD  x5, x5, x10
1171
1172
1173    ST1 {V31.S}[3],[x5]
1174  ADD  x5, x5, x10
1175
1176
1177    ST1 {V30.S}[3],[x5]
1178  ADD  x5, x5, x10
1179
1180
1181    uMULL v30.4s, v0.4h, v9.4h
1182    uMULL v28.4s, v2.4h, v8.4h
1183    uMULL v26.4s, v0.4h, v8.4h
1184    uMULL v24.4s, v2.4h, v9.4h
1185    ushR v30.4s, v30.4s,#16
1186    ushR v28.4s, v28.4s,#16
1187    sMLAL v30.4s, v1.4h, v9.4h
1188    sMLAL v28.4s, v3.4h, v8.4h
1189    ushR v26.4s, v26.4s,#16
1190    ushR v24.4s, v24.4s,#16
1191    sMLAL v26.4s, v1.4h, v8.4h
1192    sMLAL v24.4s, v3.4h, v9.4h
1193    ADD  v30.4s,  v30.4s ,  v28.4s
1194    NEG  v30.4s, v30.4s
1195    SUB  v28.4s,  v24.4s ,  v26.4s
1196
1197
1198    uMULL v22.4s, v4.4h, v8.4h
1199    mov  v26.16b, v30.16b
1200    mov  v24.16b, v28.16b
1201
1202    mov  v26.16b, v30.16b
1203    mov  v24.16b, v28.16b
1204
1205    //VUZP.16         D26, D27
1206
1207  UZP1      v19.8h, v26.8h, v26.8h
1208  UZP2      v21.8h, v26.8h, v26.8h
1209  MOV        v26.d[0], v19.d[0]
1210  MOV        v27.d[0], v21.d[0]
1211
1212//    VUZP.16         D24, D25
1213
1214     UZP1      v19.8h, v24.8h, v24.8h
1215  UZP2      v21.8h, v24.8h, v24.8h
1216  MOV        v24.d[0], v19.d[0]
1217  MOV        v25.d[0], v21.d[0]
1218
1219    uMULL v2.4s, v24.4h, v18.4h
1220    uMULL v0.4s, v26.4h, v18.4h
1221
1222    ushR v22.4s, v22.4s,#16
1223    sMLAL v22.4s, v5.4h, v8.4h
1224
1225    ushR v2.4s, v2.4s,#16
1226    ushR v0.4s, v0.4s,#16
1227    sMLAL v2.4s, v25.4h, v18.4h
1228    sMLAL v0.4s, v27.4h, v18.4h
1229
1230    uMULL v24.4s, v4.4h, v9.4h
1231    uMULL v26.4s, v6.4h, v8.4h
1232
1233    NEG  v2.4s, v2.4s
1234    ADD  v28.4s,  v28.4s ,  v0.4s
1235    ADD  v30.4s,  v30.4s ,  v2.4s
1236
1237    uMULL v0.4s, v6.4h, v9.4h
1238    sshR v24.4s, v24.4s,#16
1239    sMLAL v24.4s, v5.4h, v9.4h
1240    sshR v26.4s, v26.4s,#16
1241    sshR v0.4s, v0.4s,#16
1242    sMLAL v26.4s, v7.4h, v8.4h
1243    sMLAL v0.4s, v7.4h, v9.4h
1244
1245
1246
1247
1248
1249    ADD  v22.4s,  v22.4s ,  v0.4s
1250    NEG  v22.4s, v22.4s
1251    SUB  v24.4s,  v26.4s ,  v24.4s
1252
1253
1254
1255
1256    //LDR w11,  [sp, #120]
1257  //sxtw x11,w11
1258  MOV      w11, w26
1259    dup  v14.4s,w11
1260    SQADD  v28.4s,  v28.4s ,  v14.4s
1261    //LDR w11,  [sp, #116]
1262  //sxtw x11,w11
1263    MOV    w11, w25
1264  dup  v0.4s,w11
1265    sQshL v28.4s, v28.4s, v0.4s
1266
1267
1268    mov  v0.16b, v22.16b
1269    mov  v14.16b, v24.16b
1270
1271
1272//    VUZP.16         D22, D23
1273
1274  UZP1      v19.8h, v22.8h, v22.8h
1275  UZP2      v21.8h, v22.8h, v22.8h
1276  MOV        v22.d[0], v19.d[0]
1277  MOV        v23.d[0], v21.d[0]
1278
1279//    VUZP.16         D24, D25
1280
1281     UZP1      v19.8h, v24.8h, v24.8h
1282  UZP2      v21.8h, v24.8h, v24.8h
1283  MOV        v24.d[0], v19.d[0]
1284  MOV        v25.d[0], v21.d[0]
1285
1286    uMULL v8.4s, v24.4h, v18.4h
1287    uMULL v26.4s, v22.4h, v18.4h
1288
1289    NEG  v2.4s, v30.4s
1290
1291//    VUZP.16         D30, D31
1292
1293  UZP1      v19.8h, v30.8h, v30.8h
1294  UZP2      v21.8h, v30.8h, v30.8h
1295  MOV        v30.d[0], v19.d[0]
1296  MOV        v30.d[1], v21.d[0]
1297
1298//    VUZP.16         D2, D3
1299
1300   UZP1      v19.8h, v2.8h, v2.8h
1301  UZP2      v21.8h, v2.8h, v2.8h
1302  MOV        v2.d[0], v19.d[0]
1303  MOV        v3.d[0], v21.d[0]
1304
1305    uMULL v4.4s, v30.4h, v12.4h
1306    uMULL v6.4s, v2.4h, v13.4h
1307
1308    ushR v8.4s, v8.4s,#16
1309    ushR v26.4s, v26.4s,#16
1310
1311    sMLAL v8.4s, v25.4h, v18.4h
1312    sMLAL v26.4s, v23.4h, v18.4h
1313
1314    ushR v4.4s, v4.4s,#16
1315    ushR v6.4s, v6.4s,#16
1316
1317  MOV  v19.d[0], v30.d[1]
1318
1319    sMLAL v4.4s, v19.4h, v12.4h
1320    sMLAL v6.4s, v3.4h, v13.4h
1321
1322    NEG  v8.4s, v8.4s
1323    ADD  v14.4s,  v14.4s ,  v26.4s
1324    ADD  v0.4s,  v0.4s ,  v8.4s
1325
1326    //LDR w11,  [sp, #120]
1327  //sxtw x11,w11
1328  MOV      w11, w26
1329    dup  v8.4s,w11
1330    SQADD  v0.4s,  v0.4s ,  v8.4s
1331    //LDR w11,  [sp, #116]
1332  //sxtw x11,w11
1333    MOV    w11, w25
1334  dup  v26.4s,w11
1335    sQshL v0.4s, v0.4s, v26.4s
1336
1337
1338    mov  v26.16b, v28.16b
1339
1340    LD2 { v28.4s, v29.4s}, [x4]
1341  MOV  v30.16b, v29.16b
1342  MOV  v29.d[0], v28.d[1]
1343//    VZIP.32         Q13, Q0
1344
1345    ZIP1    v19.4s, v26.4s, v0.4s
1346  ZIP2    v0.4s, v26.4s, v0.4s
1347  MOV      v26.16b, v19.16b
1348
1349  ST1 { v26.4s}, [x4],#16
1350    ST1 { v0.4s}, [x4],#16
1351
1352    movi  v1.2s, #0
1353//    VADDL.S16       Q0, D13, D1
1354  SADDL       v0.4s, v13.4h, v1.4h
1355  MOV      v1.d[0], v0.d[1]
1356
1357    sMULL v26.2d, v28.2s, v0.2s
1358    Sqxtn v8.2s,  v26.2d
1359    sMULL v26.2d, v29.2s, v1.2s
1360    Sqxtn v9.2s,  v26.2d
1361  MOV    v8.d[1], v9.d[0]
1362    movi  v1.2s, #0
1363//    VADDL.S16       Q0, D12, D1
1364  SADDL       v0.4s, v12.4h, v1.4h
1365  MOV      v1.d[0], v0.d[1]
1366
1367    sMULL v24.2d, v28.2s, v0.2s
1368    Sqxtn v26.2s,  v24.2d
1369    sMULL v24.2d, v29.2s, v1.2s
1370    Sqxtn v27.2s,  v24.2d
1371  MOV    v26.d[1], v27.d[0]
1372
1373    sQshL v4.4s, v4.4s, v16.4s
1374    sQshL v6.4s, v6.4s, v16.4s
1375
1376    SQSUB  v4.4s,  v4.4s ,  v8.4s
1377    SQSUB  v6.4s,  v6.4s ,  v26.4s
1378
1379    NEG  v26.4s, v14.4s
1380//    VUZP.16         D14, D15
1381
1382  UZP1      v19.8h, v14.8h, v14.8h
1383  UZP2      v21.8h, v14.8h, v14.8h
1384  MOV        v14.d[0], v19.d[0]
1385  MOV        v15.d[0], v21.d[0]
1386
1387
1388 //   VUZP.16         D26, D27
1389
1390  UZP1      v19.8h, v26.8h, v26.8h
1391  UZP2      v21.8h, v26.8h, v26.8h
1392  MOV        v26.d[0], v19.d[0]
1393  MOV        v27.d[0], v21.d[0]
1394
1395
1396    movi  v1.2s, #0
1397    //VADDL.S16       Q0, D10, D1
1398  SADDL       v0.4s, v10.4h, v1.4h
1399  MOV      v1.d[0], v0.d[1]
1400
1401    sMULL v22.2d, v30.2s, v0.2s
1402    Sqxtn v24.2s,  v22.2d
1403    sMULL2 v22.2d, v30.4s, v0.4s
1404    Sqxtn v25.2s,  v22.2d
1405  MOV    v24.d[1], v25.d[0]
1406    movi  v1.2s, #0
1407    //VADDL.S16       Q0, D11, D1
1408  SADDL       v0.4s, v11.4h, v1.4h
1409  MOV      v1.d[0], v0.d[1]
1410
1411    sMULL v8.2d, v30.2s, v0.2s
1412    Sqxtn v22.2s,  v8.2d
1413    sMULL2 v8.2d, v30.4s, v0.4s
1414    Sqxtn v23.2s,  v8.2d
1415  MOV    v22.d[1], v23.d[0]
1416
1417    uMULL v8.4s, v26.4h, v11.4h
1418    uMULL v30.4s, v14.4h, v10.4h
1419
1420    ushR v8.4s, v8.4s,#16
1421
1422    ushR v30.4s, v30.4s,#16
1423
1424    sMLAL v8.4s, v27.4h, v11.4h
1425
1426    sMLAL v30.4s, v15.4h, v10.4h
1427
1428
1429    MOV V14.16B, V4.16B
1430
1431
1432    sQshL v8.4s, v8.4s, v16.4s
1433
1434    sQshL v30.4s, v30.4s, v16.4s
1435
1436    SQSUB  v8.4s,  v8.4s ,  v24.4s
1437
1438    SQSUB  v22.4s,  v30.4s ,  v22.4s
1439
1440    MOV V30.16B , V8.16B
1441
1442
1443
1444
1445    ST1 {V14.S}[0],[x0]
1446  ADD  x0, x0, x9
1447    ST1 {V22.S}[0],[x0]
1448       ADD  x0, x0, x9
1449    ST1 {V14.S}[1],[x0]
1450  ADD  x0, x0, x9
1451    ST1 {V22.S}[1],[x0]
1452  ADD  x0, x0, x9
1453    ST1 {V14.S}[2],[x0]
1454  ADD  x0, x0, x9
1455    ST1 {V22.S}[2],[x0]
1456  ADD  x0, x0, x9
1457    ST1 {V14.S}[3],[x0]
1458  ADD  x0, x0, x9
1459    ST1 {V22.S}[3],[x0]
1460  ADD  x0, x0, x9
1461    ST1 {V6.S}[0],[x5]
1462  ADD  x5, x5, x10
1463    ST1 {V30.S}[0],[x5]
1464  ADD  x5, x5, x10
1465    ST1 {V6.S}[1],[x5]
1466  ADD  x5, x5, x10
1467    ST1 {V30.S}[1],[x5]
1468  ADD  x5, x5, x10
1469    ST1 {V6.S}[2],[x5]
1470  ADD  x5, x5, x10
1471    ST1 {V30.S}[2],[x5]
1472  ADD  x5, x5, x10
1473    ST1 {V6.S}[3],[x5]
1474  ADD  x5, x5, x10
1475    ST1 {V30.S}[3],[x5]
1476  ADD  x5, x5, x10
1477
// ---------------------------------------------------------------------
// ARM_EPILOGUE / ARM_LOOP  (both labels mark the same address)
//
// Final straight-line pass of ixheaacd_post_twid_overlap_add_armv8; it
// ends with pop_v_regs + ret.  It repeats the multiply/accumulate
// sequence of the code directly above, but fetches part of its input
// with single-lane loads and writes only seven words to each output
// (in-place buffer at x4, strided outputs at x0 and x5) instead of eight.
//
// Notation used below:
//   P(x, c) = ((lo16(x) * c) >> 16) + hi16(x) * c
// i.e. the uMULL / shift-right-16 / sMLAL triple applied to a 32-bit
// value split into halves and a 16-bit factor c.
//
// Values consumed here but set before this section:
//   x9        byte stride for x0 (x3 << 2, see function entry)
//   x10       byte stride for x5
//   v16.4s    per-lane shift amounts for sQshL
//   v18.4h    16-bit factor
//   w25, w26  shift amount / addend -- presumably the arguments once read
//             from [sp, #116] / [sp, #120] (see the commented-out LDRs);
//             TODO confirm against the prologue
// ---------------------------------------------------------------------
1478ARM_EPILOGUE:
1479
1480ARM_LOOP:
1481
    // Load 8 words from x1 de-interleaved: v0 = words 0,2,4,6 and
    // v1 (copied to v2) = words 1,3,5,7.  x1 is not advanced.
1482    LD2 { v0.4s, v1.4s}, [x1]
1483  MOV  v2.16b, v1.16b
1484
    // Split every 32-bit lane: d[0] <- low halfwords, d[1] <- high halfwords.
1485    //VUZP.16         D0, D1
1486      UZP1      v19.8h, v0.8h, v0.8h
1487  UZP2      v21.8h, v0.8h, v0.8h
1488  MOV        v0.d[0], v19.d[0]
1489  MOV        v0.d[1], v21.d[0]
1490
1491  //VUZP.16         D2, D3
1492  UZP1      v19.8h, v2.8h, v2.8h
1493  UZP2      v21.8h, v2.8h, v2.8h
1494  MOV        v2.d[0], v19.d[0]
1495  MOV        v2.d[1], v21.d[0]
1496
1497
    // Reverse the halfword order inside each 64-bit half, then expose the
    // high halves as v1.4h / v3.4h (low halves stay in v0.4h / v2.4h).
1498    rev64  v0.8h,  v0.8h
1499  MOV        v1.d[0], v0.d[1]
1500    rev64  v2.8h,  v2.8h
1501  MOV        v3.d[0], v2.d[1]
1502
    // 8 halfwords from x2 de-interleaved: v8 = even-indexed, v9 = odd-indexed.
1503    LD2 {v8.4h, v9.4h}, [x2]
1504  ADD x2, x2,#16
1505
    // Six words from x8: four through LD2 (v4 = words 0,2; v5 -> v6 = words
    // 1,3), then words 4 and 5 through single-lane loads into zeroed v5 / v7.
1506    LD2 {v4.2s, v5.2s}, [x8]
1507  ADD x8, x8,#16
1508    MOV  v6.16b, v5.16b
1509  movi  v5.2s, #0x00000000
1510    movi  v7.2s, #0x00000000
1511
1512    LD1 {v5.s}[0],[x8],#4
1513    LD1 {v7.s}[0],[x8]
1514
    // NOTE(review): x12 is overwritten with #-4 below before any visible
    // read, so this assignment looks dead -- confirm before removing.
1515    MOV             x12, #16
    // v4 = {w0, w2, w4, 0}, v6 = {w1, w3, w5, 0}
1516  MOV        v4.d[1], v5.d[0]
1517  MOV        v6.d[1], v7.d[0]
    // Split into halves: v4.4h / v6.4h = low 16 bits, v5.4h / v7.4h = high 16 bits.
1518//    VUZP.16         D4, D5
1519
1520  UZP1      v19.8h, v4.8h, v4.8h
1521  UZP2      v21.8h, v4.8h, v4.8h
1522  MOV        v4.d[0], v19.d[0]
1523  MOV        v5.d[0], v21.d[0]
1524
1525//    VUZP.16         D6, D7
1526
1527  UZP1      v19.8h, v6.8h, v6.8h
1528  UZP2      v21.8h, v6.8h, v6.8h
1529  MOV        v6.d[0], v19.d[0]
1530  MOV        v7.d[0], v21.d[0]
1531
    // Seven words around x6: advance 16 bytes, LD2 four words there
    // (v11 = words 0,2; v12 -> v13 = words 1,3), then walk backwards in
    // 4-byte steps (x12 = -4) loading three more words into
    // v12[1], v10[1] and v12[0].  v10 lane 0 stays zero.
1532    ADD             x6, x6, #16
1533
1534    MOV             x12, #-4
1535    LD2 {v11.2s, v12.2s}, [x6]
1536  ADD x6, x6, x12
1537  MOV v13.16b, v12.16b
1538
1539
1540    movi  v10.2s, #0x00000000
1541
1542    LD1 {v12.s}[1],[x6]
1543  ADD  x6, x6, x12
1544    LD1 {v10.s}[1],[x6]
1545    ADD  x6, x6, x12
1546  LD1 {v12.s}[0],[x6]
1547    ADD  x6, x6, x12
1548
    // Assemble full vectors: v10 = {0, back2, w0, w2}, v12 = {back3, back1, w1, w3}.
1549  MOV        v10.d[1], v11.d[0]
1550  MOV        v12.d[1], v13.d[0]
1551
    // Same split + reverse as for v0 / v2: low halves end up in
    // v10.4h / v12.4h, high halves in v11.4h / v13.4h.
1552    //VUZP.16         D10, D11
1553
1554  UZP1      v19.8h, v10.8h, v10.8h
1555  UZP2      v21.8h, v10.8h, v10.8h
1556  MOV        v10.d[0], v19.d[0]
1557  MOV        v10.d[1], v21.d[0]
1558
1559  //VUZP.16         D12, D13
1560
1561  UZP1      v19.8h, v12.8h, v12.8h
1562  UZP2      v21.8h, v12.8h, v12.8h
1563  MOV        v12.d[0], v19.d[0]
1564  MOV        v12.d[1], v21.d[0]
1565
1566
1567    rev64  v10.8h,  v10.8h
1568  MOV        v11.d[0], v10.d[1]
1569    rev64  v12.8h,  v12.8h
1570  MOV        v13.d[0], v12.d[1]
1571
    // Four products of the x1 data (v0:v1 = even words, v2:v3 = odd words)
    // with the x2 halfwords (v8, v9):
    //   v30 = P(v0:v1, v9)   v28 = P(v2:v3, v8)
    //   v26 = P(v0:v1, v8)   v24 = P(v2:v3, v9)
1572    uMULL v30.4s, v0.4h, v9.4h
1573    uMULL v28.4s, v2.4h, v8.4h
1574    uMULL v26.4s, v0.4h, v8.4h
1575    uMULL v24.4s, v2.4h, v9.4h
1576
1577    ushR v30.4s, v30.4s,#16
1578    ushR v28.4s, v28.4s,#16
1579
1580    sMLAL v30.4s, v1.4h, v9.4h
1581    sMLAL v28.4s, v3.4h, v8.4h
1582
1583    ushR v26.4s, v26.4s,#16
1584    ushR v24.4s, v24.4s,#16
1585
1586    sMLAL v26.4s, v1.4h, v8.4h
1587    sMLAL v24.4s, v3.4h, v9.4h
1588
    // v30 = -(v30 + v28)
1589    ADD  v30.4s,  v30.4s ,  v28.4s
1590    NEG  v30.4s, v30.4s
1591
    // Start of v22 = P(v4:v5, v8); finished further down.
1592    uMULL v22.4s, v4.4h, v8.4h
1593
    // v28 = v24 - v26
1594    SUB  v28.4s,  v24.4s ,  v26.4s
1595
1596
    // Work on copies so v30 / v28 survive the half-word split below.
1597    mov  v26.16b, v30.16b
1598    mov  v24.16b, v28.16b
1599
1600//    VUZP.16         D26, D27
1601
1602  UZP1      v19.8h, v26.8h, v26.8h
1603  UZP2      v21.8h, v26.8h, v26.8h
1604  MOV        v26.d[0], v19.d[0]
1605  MOV        v27.d[0], v21.d[0]
1606
1607    //VUZP.16         D24, D25
1608
1609  UZP1      v19.8h, v24.8h, v24.8h
1610  UZP2      v21.8h, v24.8h, v24.8h
1611  MOV        v24.d[0], v19.d[0]
1612  MOV        v25.d[0], v21.d[0]
1613
    // v2 = P(copy of v28, v18), v0 = P(copy of v30, v18)
1614    uMULL v2.4s, v24.4h, v18.4h
1615    uMULL v0.4s, v26.4h, v18.4h
1616
1617    ushR v22.4s, v22.4s,#16
1618    sMLAL v22.4s, v5.4h, v8.4h
1619
1620    ushR v2.4s, v2.4s,#16
1621    ushR v0.4s, v0.4s,#16
1622    sMLAL v2.4s, v25.4h, v18.4h
1623    sMLAL v0.4s, v27.4h, v18.4h
1624
    // Products of the x8 data with v8 / v9 (completed below).
1625    uMULL v24.4s, v4.4h, v9.4h
1626    uMULL v26.4s, v6.4h, v8.4h
1627
    // v28 += v0 ; v30 -= v2
1628    NEG  v2.4s, v2.4s
1629    ADD  v28.4s,  v28.4s ,  v0.4s
1630    ADD  v30.4s,  v30.4s ,  v2.4s
1631
    // NOTE(review): these three products use an arithmetic shift (sshR)
    // where every other product in this block uses ushR -- confirm that
    // the difference is intentional.
1632    uMULL v0.4s, v6.4h, v9.4h
1633    sshR v24.4s, v24.4s,#16
1634    sMLAL v24.4s, v5.4h, v9.4h
1635    sshR v26.4s, v26.4s,#16
1636    sshR v0.4s, v0.4s,#16
1637    sMLAL v26.4s, v7.4h, v8.4h
1638    sMLAL v0.4s, v7.4h, v9.4h
1639
    // v22 = -(v22 + v0) ; v24 = v26 - v24
1640    ADD  v22.4s,  v22.4s ,  v0.4s
1641    NEG  v22.4s, v22.4s
1642    SUB  v24.4s,  v26.4s ,  v24.4s
1643
    // v28 = sqshl(sqadd(v28, w26 broadcast), w25 broadcast)
1644    //LDR w11,  [sp, #120]
1645  //sxtw x11,w11
1646  MOV      w11, w26
1647    dup  v14.4s,w11
1648    SQADD  v28.4s,  v28.4s ,  v14.4s
1649    //LDR w11,  [sp, #116]
1650  //sxtw x11,w11
1651    MOV    w11, w25
1652  dup  v0.4s,w11
1653    sQshL v28.4s, v28.4s, v0.4s
1654
    // Keep full-width copies (v0 = v22, v14 = v24) before splitting v22 / v24.
1655    mov  v0.16b, v22.16b
1656    mov  v14.16b, v24.16b
1657
1658//    VUZP.16         D22, D23
1659
1660   UZP1      v19.8h, v22.8h, v22.8h
1661  UZP2      v21.8h, v22.8h, v22.8h
1662  MOV        v22.d[0], v19.d[0]
1663  MOV        v23.d[0], v21.d[0]
1664
1665 //   VUZP.16         D24, D25
1666
1667   UZP1      v19.8h, v24.8h, v24.8h
1668  UZP2      v21.8h, v24.8h, v24.8h
1669  MOV        v24.d[0], v19.d[0]
1670  MOV        v25.d[0], v21.d[0]
1671
    // v8 = P(v24, v18), v26 = P(v22, v18)  (completed below)
1672    uMULL v8.4s, v24.4h, v18.4h
1673    uMULL v26.4s, v22.4h, v18.4h
1674
    // v2 = -v30; both v30 and v2 are then split into 16-bit halves
    // (v30: low in d[0], high in d[1]; v2: low in v2.4h, high in v3.4h).
1675    NEG  v2.4s, v30.4s
1676//    VUZP.16         D30, D31
1677
1678  UZP1      v19.8h, v30.8h, v30.8h
1679  UZP2      v21.8h, v30.8h, v30.8h
1680  MOV        v30.d[0], v19.d[0]
1681  MOV        v30.d[1], v21.d[0]
1682
1683//    VUZP.16         D2, D3
1684
1685  UZP1      v19.8h, v2.8h, v2.8h
1686  UZP2      v21.8h, v2.8h, v2.8h
1687  MOV        v2.d[0], v19.d[0]
1688  MOV        v3.d[0], v21.d[0]
1689
    // v4 = P(v30, v12.4h), v6 = P(-v30, v13.4h)  (completed below)
1690    uMULL v4.4s, v30.4h, v12.4h
1691    uMULL v6.4s, v2.4h, v13.4h
1692
1693    ushR v8.4s, v8.4s,#16
1694    ushR v26.4s, v26.4s,#16
1695
1696    sMLAL v8.4s, v25.4h, v18.4h
1697    sMLAL v26.4s, v23.4h, v18.4h
1698
1699    ushR v4.4s, v4.4s,#16
1700    ushR v6.4s, v6.4s,#16
1701
    // High halves of v30 live in v30.d[1]; move them to a low half for sMLAL.
1702  MOV  v19.d[0], v30.d[1]
1703
1704    sMLAL v4.4s, v19.4h, v12.4h
1705    sMLAL v6.4s, v3.4h, v13.4h
1706
    // v14 += v26 ; v0 -= v8
1707    NEG  v8.4s, v8.4s
1708    ADD  v14.4s,  v14.4s ,  v26.4s
1709    ADD  v0.4s,  v0.4s ,  v8.4s
1710
    // v0 = sqshl(sqadd(v0, w26 broadcast), w25 broadcast)
1711    //LDR w11,  [sp, #120]
1712  //sxtw x11,w11
1713  MOV      w11, w26
1714    dup  v8.4s,w11
1715    SQADD  v0.4s,  v0.4s ,  v8.4s
1716    //LDR w11,  [sp, #116]
1717  //sxtw x11,w11
1718    MOV    w11, w25
1719  dup  v26.4s,w11
1720    sQshL v0.4s, v0.4s, v26.4s
1721
    // v26 <- v28, freeing v28 for the data loaded from x4.
1722    mov  v26.16b, v28.16b
1723
    // x6 remembers the start of the x4 window so results can be written
    // back over the words that are read next.
1724    MOV             x6, x4
1725
    // Read seven words from x4 (4 + 3 single lanes; x4 advances 28 bytes).
1726    LD1 {v28.2s, v29.2s}, [x4],#16
1727    movi  v19.2s, #0x00000000
1728    LD1 {v30.s}[0],[x4],#4
1729    LD1 {v30.s}[1],[x4],#4
1730    LD1 {v19.s}[0],[x4],#4
1731
1732  MOV        v28.d[1], v29.d[0]
1733  MOV        v30.d[1], v19.d[0]
1734
    // De-interleave: v28 = words 0,2,4,6 (v29.d[0] = its upper half),
    // v30 = words 1,3,5 and a zero in lane 3.
1735    //VUZP.32         Q14, Q15
1736
1737  UZP1      v19.4s, v28.4s, v30.4s
1738  UZP2      v30.4s, v28.4s, v30.4s
1739  MOV        v28.16b, v19.16b
1740  MOV        v29.d[0], v28.d[1]
1741
    // Write seven words back at the old x4 position, interleaving the
    // lanes of v26 and v0: v26[0], v0[0], v26[1], v0[1], v26[2], v0[2], v26[3].
1742    ST1 {v26.s}[0],[x6],#4
1743    ST1 {v0.s}[0],[x6],#4
1744    ST1 {v26.s}[1],[x6],#4
1745    ST1 {v0.s}[1],[x6],#4
1746    ST1 {v26.s}[2],[x6],#4
1747    ST1 {v0.s}[2],[x6],#4
1748    ST1 {v26.s}[3],[x6],#4
1749
    // SADDL with a zeroed operand sign-extends v13.4h to 32 bits (v0, with
    // the upper pair copied to v1).  v8 = saturating-narrowed 64-bit
    // products v28 * sext(v13.4h).
1750    movi  v1.2s, #0
1751    //VADDL.S16       Q0, D13, D1
1752  SADDL       v0.4s, v13.4h, v1.4h
1753  MOV      v1.d[0], v0.d[1]
1754
1755    sMULL v26.2d, v28.2s, v0.2s
1756    Sqxtn v8.2s,  v26.2d
1757    sMULL v26.2d, v29.2s, v1.2s
1758    Sqxtn v9.2s,  v26.2d
1759  MOV    v8.d[1], v9.d[0]
    // Same with v12.4h: v26 = saturating-narrowed v28 * sext(v12.4h).
1760    movi  v1.2s, #0
1761    //VADDL.S16       Q0, D12, D1
1762  SADDL       v0.4s, v12.4h, v1.4h
1763  MOV      v1.d[0], v0.d[1]
1764
1765    sMULL v24.2d, v28.2s, v0.2s
1766    Sqxtn v26.2s,  v24.2d
1767    sMULL v24.2d, v29.2s, v1.2s
1768    Sqxtn v27.2s,  v24.2d
1769  MOV    v26.d[1], v27.d[0]
1770
    // Saturating shift by the per-lane amounts in v16, then saturating subtract.
1771    sQshL v4.4s, v4.4s, v16.4s
1772    sQshL v6.4s, v6.4s, v16.4s
1773
1774    SQSUB  v4.4s,  v4.4s ,  v8.4s
1775    SQSUB  v6.4s,  v6.4s ,  v26.4s
1776
    // v26 = -v14; split v14 (-> v14.4h / v15.4h) and v26 (-> v26.4h / v27.4h)
    // into low / high halves.
1777    NEG  v26.4s, v14.4s
1778    //VUZP.16         D14, D15
1779
1780     UZP1      v19.8h, v14.8h, v14.8h
1781  UZP2      v21.8h, v14.8h, v14.8h
1782  MOV        v14.d[0], v19.d[0]
1783  MOV        v15.d[0], v21.d[0]
1784
1785//  VUZP.16         D26, D27
1786
1787  UZP1      v19.8h, v26.8h, v26.8h
1788  UZP2      v21.8h, v26.8h, v26.8h
1789  MOV        v26.d[0], v19.d[0]
1790  MOV        v27.d[0], v21.d[0]
1791
1792
    // v24 = saturating-narrowed v30 * sext(v10.4h); the upper pair is
    // handled with sMULL2 on v0 directly.
1793    movi  v1.2s, #0
1794    //VADDL.S16       Q0, D10, D1
1795  SADDL       v0.4s, v10.4h, v1.4h
1796  MOV      v1.d[0], v0.d[1]
1797
1798    sMULL v22.2d, v30.2s, v0.2s
1799    Sqxtn v24.2s,  v22.2d
1800    sMULL2 v22.2d, v30.4s, v0.4s
1801    Sqxtn v25.2s,  v22.2d
1802  MOV    v24.d[1], v25.d[0]
1803
    // v22 = saturating-narrowed v30 * sext(v11.4h)
1804    movi  v1.2s, #0
1805//    VADDL.S16       Q0, D11, D1
1806  SADDL       v0.4s, v11.4h, v1.4h
1807  MOV      v1.d[0], v0.d[1]
1808
1809    sMULL v8.2d, v30.2s, v0.2s
1810    Sqxtn v22.2s,  v8.2d
1811    sMULL2 v8.2d, v30.4s, v0.4s
1812    Sqxtn v23.2s,  v8.2d
1813  MOV    v22.d[1], v23.d[0]
1814
    // v8 = P(v26:v27, v11.4h), v30 = P(v14:v15, v10.4h)
1815    uMULL v8.4s, v26.4h, v11.4h
1816    uMULL v30.4s, v14.4h, v10.4h
1817
1818    ushR v8.4s, v8.4s,#16
1819
1820    ushR v30.4s, v30.4s,#16
1821
1822    sMLAL v8.4s, v27.4h, v11.4h
1823
1824    sMLAL v30.4s, v15.4h, v10.4h
1825
    // v14 is free again: reuse it to carry the v4 result to the stores.
1826    MOV V14.16B , V4.16B
1827
1828    //mov  v15.8b, v6.8b
1829    sQshL v8.4s, v8.4s, v16.4s
1830
1831    sQshL v30.4s, v30.4s, v16.4s
1832
1833    SQSUB  v8.4s,  v8.4s ,  v24.4s
1834
1835    SQSUB  v22.4s,  v30.4s ,  v22.4s
1836
1837     MOV  V30.16B, V8.16B
1838
1839
1840
1841
1842
1843
1844
    // Output 1: seven words at x0, advancing x9 bytes after each store,
    // alternating lanes of v14 and v22 (v14[0], v22[0], ..., v14[3]).
1845    ST1 {V14.S}[0],[x0]
1846  ADD  x0, x0, x9
1847    ST1 {V22.S}[0],[x0]
1848  ADD  x0, x0, x9
1849    ST1 {V14.S}[1],[x0]
1850  ADD  x0, x0, x9
1851    ST1 {V22.S}[1],[x0]
1852  ADD  x0, x0, x9
1853    ST1 {V14.S}[2],[x0]
1854  ADD  x0, x0, x9
1855    ST1 {V22.S}[2],[x0]
1856  ADD  x0, x0, x9
1857    ST1 {V14.S}[3],[x0]
1858  ADD  x0, x0, x9
1859
    // Output 2: seven words at x5, advancing x10 bytes after each store,
    // alternating lanes of v6 and v30 (v6[0], v30[0], ..., v6[3]).
1860    ST1 {V6.S}[0],[x5]
1861  ADD  x5, x5, x10
1862    ST1 {V30.S}[0],[x5]
1863  ADD  x5, x5, x10
1864    ST1 {V6.S}[1],[x5]
1865  ADD  x5, x5, x10
1866    ST1 {V30.S}[1],[x5]
1867  ADD  x5, x5, x10
1868    ST1 {V6.S}[2],[x5]
1869  ADD  x5, x5, x10
1870    ST1 {V30.S}[2],[x5]
1871  ADD  x5, x5, x10
1872    ST1 {V6.S}[3],[x5]
1873  ADD  x5, x5, x10
1874
    // Restore the registers saved by push_v_regs at entry and return.
1875   pop_v_regs
1876   ret
1877
1878
1879