// Source: /aosp_15_r20/external/libxaac/decoder/armv8/ixheaacd_overlap_add1.s (revision 15dc779a375ca8b5125643b829a8aa4b70d7f451)
///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
20
21.macro push_v_regs
22    stp             q8, q9, [sp, #-32]!
23    stp             q10, q11, [sp, #-32]!
24    stp             q12, q13, [sp, #-32]!
25    stp             q14, q15, [sp, #-32]!
26    stp             X8, X9, [sp, #-16]!
27    stp             X10, X11, [sp, #-16]!
28    stp             X12, X13, [sp, #-16]!
29    stp             X14, X15, [sp, #-16]!
30    stp             X16, X17, [sp, #-16]!
31    stp             X29, X30, [sp, #-16]!
32.endm
33.macro pop_v_regs
34    ldp             X29, X30, [sp], #16
35    ldp             X16, X17, [sp], #16
36    ldp             X14, X15, [sp], #16
37    ldp             X12, X13, [sp], #16
38    ldp             X10, X11, [sp], #16
39    ldp             X8, X9, [sp], #16
40    ldp             q14, q15, [sp], #32
41    ldp             q12, q13, [sp], #32
42    ldp             q10, q11, [sp], #32
43    ldp             q8, q9, [sp], #32
44.endm
45
46.text
47.global ixheaacd_over_lap_add1_armv8
48ixheaacd_over_lap_add1_armv8:
49    push_v_regs
50    LSL             X6 , X6 , #1
51    LSL             X10, X5, #1
52    SUB             X11, X10, #1
53    LSL             X10, X11, #2
54    ADD             X10, X0, X10
55    SUB             X10, X10, #12
56    LSL             X8, X11, #1
57    ADD             X8, X8, X3
58    SUB             X8, X8, #14
59    MOV             X12, #-16
60    DUP             V11.8H, W4
61    LD1             {V3.4S}, [X10], X12
62
63    NEG             W7, W7
64    SQNEG           V0.4S, V3.4S
65    UZP1            V31.8H, V0.8H, V0.8H
66    UZP2            V30.8H, V0.8H, V0.8H
67    REV64           V31.8h, V31.8h
68    REV64           V30.8h, V30.8h
69    SUB             X11, X5, #1
70    UZP1            V7.8H, V3.8H, V3.8H
71    UZP2            V6.8H, V3.8H, V3.8H
72    REV64           V7.8H, V7.8H
73    REV64           V6.8H, V6.8H
74    MOV             V16.S[0], W6
75    MOV             V17.S[0], W11
76    SMULL           V17.4S, V16.4H, V17.4H
77    MOV             W11, V17.S[0]
78    LSL             X11, X11, #1
79
80    LD2             {V2.4H, V3.4H}, [X8], X12
81    ADD             X11, X11, X2
82    REV64           V2.4H, V2.4H
83    REV64           V3.4H, V3.4H
84    LSL             X4, X6, #1
85    NEG             X4, X4
86    LSL             X9, X6, #1
87    MOV             V16.S[0], W5
88    MOV             V17.S[0], W6
89    SMULL           V17.4S, V16.4H, V17.4H
90    MOV             W6, V17.S[0]
91    LSL             W6, W6, #1
92    ADD             X6, X6, X2
93
94    UMULL           V15.4S, V7.4H, V2.4H
95    LD1             {V4.4S}, [X1], #16
96    USHR            V15.4S, V15.4S, #16
97
98    SMLAL           V15.4S, V6.4H, V2.4H
99    SQSHL           V15.4S, V15.4S, V11.4S
100    SSHLL           V27.4S, V3.4H, #0
101    SMULL           V28.2D, V27.2S, V4.2S
102    SMULL2          V29.2D, V27.4S, V4.4S
103    SQXTN           V28.2S, V28.2D
104    SQXTN2          V28.4S, V29.2D
105    MOV             V14.16B, V28.16B
106
107    SQSUB           V13.4S, V15.4S, V14.4S
108    UMULL           V12.4S, V31.4H, V3.4H
109    USHR            V12.4S, V12.4S, #16
110    SMLAL           V12.4S, V30.4H, V3.4H
111    SQSHL           V12.4S, V12.4S, V11.4S
112    LD1             {V3.4S}, [X10], X12
113
114    SSHLL           V27.4S, V2.4H, #0
115    SMULL           V28.2D, V27.2S, V4.2S
116    SMULL2          V29.2D, V27.4S, V4.4S
117    SQXTN           V28.2S, V28.2D
118    SQXTN2          V28.4S, V29.2D
119    MOV             V8.16B, V28.16B
120
121    SQNEG           V0.4S, V3.4S
122    UZP1            V1.8H, V0.8H, V0.8H
123    UZP2            V0.8H, V0.8H, V0.8H
124    REV64           V1.8h, V1.8h
125    REV64           V0.8h, V0.8h
126    SQSUB           V9.4S, V12.4S, V8.4S
127    UZP1            V7.8H, V3.8H, V3.8H
128    UZP2            V6.8H, V3.8H, V3.8H
129    REV64           V7.8h, V7.8h
130    REV64           V6.8h, V6.8h
131    LD2             {V2.4H, V3.4H}, [X8], X12
132    REV64           V2.4H, V2.4H
133    REV64           V3.4H, V3.4H
134
135    LD1             {V4.4S}, [X1], #16
136    SUB             W5, W5, #8
137
138
139LOOP_1:
140
141    ST1             {V13.S}[0], [X11], X4
142    UMULL           V15.4S, V7.4H, V2.4H
143    ST1             {V13.S}[1], [X11], X4
144    UMULL           V12.4S, V1.4H, V3.4H
145    ST1             {V13.S}[2], [X11], X4
146    USHR            V15.4S, V15.4S, #16
147    ST1             {V13.S}[3], [X11], X4
148    USHR            V12.4S, V12.4S, #16
149    ST1             {V9.S}[0], [X6], X9
150    SMLAL           V15.4S, V6.4H, V2.4H
151    ST1             {V9.S}[1], [X6], X9
152    SMLAL           V12.4S, V0.4H, V3.4H
153    ST1             {V9.S}[2], [X6], X9
154    SQSHL           V15.4S, V15.4S, V11.4S
155    ST1             {V9.S}[3], [X6], X9
156    SQSHL           V12.4S, V12.4S, V11.4S
157    LD1             {V6.4S}, [X10], X12
158
159    SSHLL           V27.4S, V3.4H, #0
160    SMULL           V28.2D, V27.2S, V4.2S
161    SMULL2          V29.2D, V27.4S, V4.4S
162    SQXTN           V28.2S, V28.2D
163    SQXTN2          V28.4S, V29.2D
164    MOV             V14.16B, V28.16B
165
166    SSHLL           V27.4S, V2.4H, #0
167    SMULL           V28.2D, V27.2S, V4.2S
168    SMULL2          V29.2D, V27.4S, V4.4S
169    SQXTN           V28.2S, V28.2D
170    SQXTN2          V28.4S, V29.2D
171    MOV             V8.16B, V28.16B
172
173    LD2             {V2.4H, V3.4H}, [X8], X12
174
175    SQNEG           V0.4S, V6.4S
176
177    LD1             {V4.4S}, [X1], #16
178
179    UZP1            V1.8H, V0.8H, V0.8H
180    UZP2            V0.8H, V0.8H, V0.8H
181    REV64           V1.8h, V1.8h
182    REV64           V0.8h, V0.8h
183    UZP1            V7.8H, V6.8H, V6.8H
184    UZP2            V6.8H, V6.8H, V6.8H
185    REV64           V7.8h, V7.8h
186    REV64           V6.8h, V6.8h
187    SQSUB           V13.4S, V15.4S, V14.4S
188    REV64           V2.4H, V2.4H
189    REV64           V3.4H, V3.4H
190    SQSUB           V9.4S, V12.4S, V8.4S
191    UMULL           V15.4S, V7.4H, V2.4H
192    ST1             {V13.S}[0], [X11], X4
193    UMULL           V12.4S, V1.4H, V3.4H
194    USHR            V15.4S, V15.4S, #16
195    ST1             {V13.S}[1], [X11], X4
196    SMLAL           V15.4S, V6.4H, V2.4H
197    ST1             {V13.S}[2], [X11], X4
198    USHR            V12.4S, V12.4S, #16
199    ST1             {V13.S}[3], [X11], X4
200    SMLAL           V12.4S, V0.4H, V3.4H
201    ST1             {V9.S}[0], [X6], X9
202    SQSHL           V15.4S, V15.4S, V11.4S
203    ST1             {V9.S}[1], [X6], X9
204    SQSHL           V12.4S, V12.4S, V11.4S
205    ST1             {V9.S}[2], [X6], X9
206
207    SSHLL           V27.4S, V3.4H, #0
208    SMULL           V28.2D, V27.2S, V4.2S
209    SMULL2          V29.2D, V27.4S, V4.4S
210    SQXTN           V28.2S, V28.2D
211    SQXTN2          V28.4S, V29.2D
212    MOV             V14.16B, V28.16B
213
214    ST1             {V9.S}[3], [X6], X9
215
216
217    SSHLL           V27.4S, V2.4H, #0
218    SMULL           V28.2D, V27.2S, V4.2S
219    SMULL2          V29.2D, V27.4S, V4.4S
220    SQXTN           V28.2S, V28.2D
221    SQXTN2          V28.4S, V29.2D
222    MOV             V8.16B, V28.16B
223
224    LD1             {V3.4S}, [X10], X12
225
226    SQNEG           V0.4S, V3.4S
227    UZP1            V1.8H, V0.8H, V0.8H
228    UZP2            V0.8H, V0.8H, V0.8H
229    REV64           V1.8H, V1.8H
230    REV64           V0.8H, V0.8H
231    SQSUB           V13.4S, V15.4S, V14.4S
232    UZP1            V7.8H, V3.8H, V3.8H
233    UZP2            V6.8H, V3.8H, V3.8H
234    REV64           V7.8H, V7.8H
235    REV64           V6.8H, V6.8H
236
237    LD2             {V2.4H, V3.4H}, [X8], X12
238    SQSUB           V9.4S, V12.4S, V8.4S
239    REV64           V2.4H, V2.4H
240    REV64           V3.4H, V3.4H
241
242    LD1             {V4.4S}, [X1], #16
243
244
245    SUBS            X5, X5, #8
246
247    BGT             LOOP_1
248
249    ST1             {V13.S}[0], [X11], X4
250    UMULL           V15.4S, V7.4H, V2.4H
251    ST1             {V13.S}[1], [X11], X4
252    UMULL           V12.4s, V1.4H, V3.4H
253    ST1             {V13.S}[2], [X11], X4
254    USHR            V15.4S, V15.4S, #16
255    ST1             {V13.S}[3], [X11], X4
256    USHR            V12.4S, V12.4S, #16
257
258    ST1             {V9.S}[0], [X6], X9
259    SMLAL           V15.4S, V6.4H, V2.4H
260    ST1             {V9.S}[1], [X6], X9
261    SMLAL           V12.4S, V0.4H, V3.4H
262    ST1             {V9.S}[2], [X6], X9
263    SQSHL           V15.4S, V15.4S, V11.4S
264    ST1             {V9.S}[3], [X6], X9
265    SQSHL           V12.4S, V12.4S, V11.4S
266
267
268    SSHLL           V27.4S, V3.4H, #0
269    SMULL           V28.2D, V27.2S, V4.2S
270    SMULL2          V29.2D, V27.4S, V4.4S
271    SQXTN           V28.2S, V28.2D
272    SQXTN2          V28.4S, V29.2D
273    MOV             V14.16B, V28.16B
274
275    SSHLL           V27.4S, V2.4H, #0
276    SMULL           V28.2D, V27.2S, V4.2S
277    SMULL2          V29.2D, V27.4S, V4.4S
278    SQXTN           V28.2S, V28.2D
279    SQXTN2          V28.4S, V29.2D
280    MOV             V8.16B, V28.16B
281
282
283    SQSUB           V13.4S, V15.4S, V14.4S
284    SQSUB           V9.4S, V12.4S, V8.4S
285
286
287    ST1             {V13.S}[0], [X11], X4
288    ST1             {V13.S}[1], [X11], X4
289    ST1             {V13.S}[2], [X11], X4
290    ST1             {V13.S}[3], [X11], X4
291
292    ST1             {V9.S}[0], [X6], X9
293    ST1             {V9.S}[1], [X6], X9
294    ST1             {V9.S}[2], [X6], X9
295    ST1             {V9.S}[3], [X6], X9
296    pop_v_regs
297    RET
298
299
300
301
302