@ xref: /aosp_15_r20/external/libxaac/decoder/armv7/ixheaacd_fft32x32_ld2_armv7.s (revision 15dc779a375ca8b5125643b829a8aa4b70d7f451)
.text
.p2align 2
.global ixheaacd_fft32x32_ld2_armv7

@-----------------------------------------------------------------------
@ ixheaacd_fft32x32_ld2_armv7
@
@ Two radix-4 DIT butterfly stages over 32 words, addressed as 16
@ interleaved (real, imaginary) pairs: x[2k] is the real word and
@ x[2k+1] the imaginary word, as the index comments below show.
@
@ In:    r2 = x : 32-word buffer. Stage 1 reads it and writes its
@                 results back in place, so x is modified.
@        r3 = y : 32-word buffer that receives the stage-2 output.
@        r0, r1 : overwritten on entry without being read.
@ Out:   y[0..31]. No result is placed in r0; at return r0 still holds
@        the x pointer and r1 the y pointer.
@ Saves: r4-r12 and lr are pushed on entry; the matching pop restores
@        r4-r12 and loads the saved lr into pc, which is the return.
@ Uses:  r0-r3 are not preserved. r14 (lr) is used as a scratch/constant
@        register after it has been saved.
@
@ Fixed-point convention in stage 2:
@   SMULWB/SMULWT/SMLAWB/SMLAWT multiply a 32-bit value by one signed
@   16-bit half of the second operand and keep bits [47:16] of the
@   product. The twiddle constants look like Q15 values
@   (0x7642 ~ cos(pi/8), 0x30FC ~ sin(pi/8), 0x5A83 ~ cos(pi/4)), so each
@   product is half of the Q15 result; the code compensates with the
@   "lsl #1" on the operand that went through a multiply.
@-----------------------------------------------------------------------
ixheaacd_fft32x32_ld2_armv7:

    STMFD           sp!, {r4-r12, r14}

    @DIT Radix-4 FFT First Stage (in place on x, no twiddle multiplies)
    @First Butterfly
    MOV             r0, r2              @r0 = x (incoming r2)
    MOV             r1, r3              @r1 = y (incoming r3)
    LDR             r2, [r0]            @x_0 = x[0 ]
    LDR             r3, [r0, #32]       @x_2 = x[8 ]
    LDR             r4, [r0, #64]       @x_4 = x[16]
    LDR             r5, [r0, #96]       @x_6 = x[24]
    ADD             r6, r2, r4          @xh0_0 = x_0 + x_4
    SUB             r7, r2, r4          @xl0_0 = x_0 - x_4
    ADD             r8, r3, r5          @xh0_1 = x_2 + x_6
    SUB             r9, r3, r5          @xl0_1 = x_2 - x_6

    LDR             r2, [r0, #4]        @x_1 = x[0 +1]
    LDR             r3, [r0, #36]       @x_3 = x[8 +1]
    LDR             r4, [r0, #68]       @x_5 = x[16+1]
    LDR             r5, [r0, #100]      @x_7 = x[24+1]
    ADD             r10, r2, r4         @xh1_0 = x_1 + x_5
    SUB             r11, r2, r4         @xl1_0 = x_1 - x_5
    ADD             r12, r3, r5         @xh1_1 = x_3 + x_7
    SUB             r14, r3, r5         @xl1_1 = x_3 - x_7

    ADD             r2, r6, r8          @n00 = xh0_0 + xh0_1
    ADD             r3, r7, r14         @n10 = xl0_0 + xl1_1
    SUB             r4, r6, r8          @n20 = xh0_0 - xh0_1
    SUB             r5, r7, r14         @n30 = xl0_0 - xl1_1
    STR             r2, [r0]            @x[0 ] = n00
    STR             r3, [r0, #32]       @x[8 ] = n10
    STR             r4, [r0, #64]       @x[16] = n20
    STR             r5, [r0, #96]       @x[24] = n30

    ADD             r2, r10, r12        @n01 = xh1_0 + xh1_1
    SUB             r3, r11, r9         @n11 = xl1_0 - xl0_1
    SUB             r4, r10, r12        @n21 = xh1_0 - xh1_1
    ADD             r5, r11, r9         @n31 = xl1_0 + xl0_1
    STR             r2, [r0, #4]        @x[1   ] = n01
    STR             r3, [r0, #36]       @x[8+1 ] = n11
    STR             r4, [r0, #68]       @x[16+1] = n21
    STR             r5, [r0, #100]      @x[24+1] = n31

    @Second Butterfly
    LDR             r2, [r0, #8]        @x_0 = x[2 ]
    LDR             r3, [r0, #40]       @x_2 = x[10]
    LDR             r4, [r0, #72]       @x_4 = x[18]
    LDR             r5, [r0, #104]      @x_6 = x[26]
    ADD             r6, r2, r4          @xh0_0 = x_0 + x_4
    SUB             r7, r2, r4          @xl0_0 = x_0 - x_4
    ADD             r8, r3, r5          @xh0_1 = x_2 + x_6
    SUB             r9, r3, r5          @xl0_1 = x_2 - x_6

    LDR             r2, [r0, #12]       @x_1 = x[2 +1]
    LDR             r3, [r0, #44]       @x_3 = x[10+1]
    LDR             r4, [r0, #76]       @x_5 = x[18+1]
    LDR             r5, [r0, #108]      @x_7 = x[26+1]
    ADD             r10, r2, r4         @xh1_0 = x_1 + x_5
    SUB             r11, r2, r4         @xl1_0 = x_1 - x_5
    ADD             r12, r3, r5         @xh1_1 = x_3 + x_7
    SUB             r14, r3, r5         @xl1_1 = x_3 - x_7

    ADD             r2, r6, r8          @n00 = xh0_0 + xh0_1
    ADD             r3, r7, r14         @n10 = xl0_0 + xl1_1
    SUB             r4, r6, r8          @n20 = xh0_0 - xh0_1
    SUB             r5, r7, r14         @n30 = xl0_0 - xl1_1
    STR             r2, [r0, #8]        @x[2 ] = n00
    STR             r3, [r0, #40]       @x[10] = n10
    STR             r4, [r0, #72]       @x[18] = n20
    STR             r5, [r0, #104]      @x[26] = n30

    ADD             r2, r10, r12        @n01 = xh1_0 + xh1_1
    SUB             r3, r11, r9         @n11 = xl1_0 - xl0_1
    SUB             r4, r10, r12        @n21 = xh1_0 - xh1_1
    ADD             r5, r11, r9         @n31 = xl1_0 + xl0_1
    STR             r2, [r0, #12]       @x[2 +1] = n01
    STR             r3, [r0, #44]       @x[10+1] = n11
    STR             r4, [r0, #76]       @x[18+1] = n21
    STR             r5, [r0, #108]      @x[26+1] = n31

    @Third Butterfly
    LDR             r2, [r0, #16]       @x_0 = x[4 ]
    LDR             r3, [r0, #48]       @x_2 = x[12]
    LDR             r4, [r0, #80]       @x_4 = x[20]
    LDR             r5, [r0, #112]      @x_6 = x[28]
    ADD             r6, r2, r4          @xh0_0 = x_0 + x_4
    SUB             r7, r2, r4          @xl0_0 = x_0 - x_4
    ADD             r8, r3, r5          @xh0_1 = x_2 + x_6
    SUB             r9, r3, r5          @xl0_1 = x_2 - x_6

    LDR             r2, [r0, #20]       @x_1 = x[4 +1]
    LDR             r3, [r0, #52]       @x_3 = x[12+1]
    LDR             r4, [r0, #84]       @x_5 = x[20+1]
    LDR             r5, [r0, #116]      @x_7 = x[28+1]
    ADD             r10, r2, r4         @xh1_0 = x_1 + x_5
    SUB             r11, r2, r4         @xl1_0 = x_1 - x_5
    ADD             r12, r3, r5         @xh1_1 = x_3 + x_7
    SUB             r14, r3, r5         @xl1_1 = x_3 - x_7

    ADD             r2, r6, r8          @n00 = xh0_0 + xh0_1
    ADD             r3, r7, r14         @n10 = xl0_0 + xl1_1
    SUB             r4, r6, r8          @n20 = xh0_0 - xh0_1
    SUB             r5, r7, r14         @n30 = xl0_0 - xl1_1
    STR             r2, [r0, #16]       @x[4 ] = n00
    STR             r3, [r0, #48]       @x[12] = n10
    STR             r4, [r0, #80]       @x[20] = n20
    STR             r5, [r0, #112]      @x[28] = n30

    ADD             r2, r10, r12        @n01 = xh1_0 + xh1_1
    SUB             r3, r11, r9         @n11 = xl1_0 - xl0_1
    SUB             r4, r10, r12        @n21 = xh1_0 - xh1_1
    ADD             r5, r11, r9         @n31 = xl1_0 + xl0_1
    STR             r2, [r0, #20]       @x[4 +1] = n01
    STR             r3, [r0, #52]       @x[12+1] = n11
    STR             r4, [r0, #84]       @x[20+1] = n21
    STR             r5, [r0, #116]      @x[28+1] = n31

    @Fourth Butterfly
    LDR             r2, [r0, #24]       @x_0 = x[6 ]
    LDR             r3, [r0, #56]       @x_2 = x[14]
    LDR             r4, [r0, #88]       @x_4 = x[22]
    LDR             r5, [r0, #120]      @x_6 = x[30]
    ADD             r6, r2, r4          @xh0_0 = x_0 + x_4
    SUB             r7, r2, r4          @xl0_0 = x_0 - x_4
    ADD             r8, r3, r5          @xh0_1 = x_2 + x_6
    SUB             r9, r3, r5          @xl0_1 = x_2 - x_6

    LDR             r2, [r0, #28]       @x_1 = x[6 +1]
    LDR             r3, [r0, #60]       @x_3 = x[14+1]
    LDR             r4, [r0, #92]       @x_5 = x[22+1]
    LDR             r5, [r0, #124]      @x_7 = x[30+1]
    ADD             r10, r2, r4         @xh1_0 = x_1 + x_5
    SUB             r11, r2, r4         @xl1_0 = x_1 - x_5
    ADD             r12, r3, r5         @xh1_1 = x_3 + x_7
    SUB             r14, r3, r5         @xl1_1 = x_3 - x_7

    ADD             r2, r6, r8          @n00 = xh0_0 + xh0_1
    ADD             r3, r7, r14         @n10 = xl0_0 + xl1_1
    SUB             r4, r6, r8          @n20 = xh0_0 - xh0_1
    SUB             r5, r7, r14         @n30 = xl0_0 - xl1_1
    STR             r2, [r0, #24]       @x[6 ] = n00
    STR             r3, [r0, #56]       @x[14] = n10
    STR             r4, [r0, #88]       @x[22] = n20
    STR             r5, [r0, #120]      @x[30] = n30

    ADD             r2, r10, r12        @n01 = xh1_0 + xh1_1
    SUB             r3, r11, r9         @n11 = xl1_0 - xl0_1
    SUB             r4, r10, r12        @n21 = xh1_0 - xh1_1
    ADD             r5, r11, r9         @n31 = xl1_0 + xl0_1
    STR             r2, [r0, #28]       @x[6 +1] = n01
    STR             r3, [r0, #60]       @x[14+1] = n11
    STR             r4, [r0, #92]       @x[22+1] = n21
    STR             r5, [r0, #124]      @x[30+1] = n31


    @DIT Radix-4 FFT Second Stage (reads x, writes y)
    @First Butterfly: inputs x[0..7], no twiddle multiply (mul_n = inp_n)
    LDR             r2, [r0]            @inp_0qr = x[0]
    LDR             r3, [r0, #8]        @inp_1qr = x[2]
    LDR             r4, [r0, #16]       @inp_2qr = x[4]
    LDR             r5, [r0, #24]       @inp_3qr = x[6]
    ADD             r6, r2, r4          @sum_0qr  = mul_0qr + mul_2qr
    SUB             r7, r2, r4          @sum_1qr  = mul_0qr - mul_2qr
    ADD             r8, r3, r5          @sum_2qr  = mul_1qr + mul_3qr
    SUB             r9, r3, r5          @sum_3qr  = mul_1qr - mul_3qr

    LDR             r2, [r0, #4]        @inp_0qi = x[1]
    LDR             r3, [r0, #12]       @inp_1qi = x[3]
    LDR             r4, [r0, #20]       @inp_2qi = x[5]
    LDR             r5, [r0, #28]       @inp_3qi = x[7]
    ADD             r10, r2, r4         @sum_0qi  = mul_0qi + mul_2qi
    SUB             r11, r2, r4         @sum_1qi  = mul_0qi - mul_2qi
    ADD             r12, r3, r5         @sum_2qi  = mul_1qi + mul_3qi
    SUB             r14, r3, r5         @sum_3qi  = mul_1qi - mul_3qi

    ADD             r2, r6, r8          @sum_0qr + sum_2qr
    ADD             r3, r7, r14         @sum_1qr + sum_3qi
    SUB             r4, r6, r8          @sum_0qr - sum_2qr
    SUB             r5, r7, r14         @sum_1qr - sum_3qi
    STR             r2, [r1]            @y[0 ] = sum_0qr + sum_2qr
    STR             r3, [r1, #32]       @y[8 ] = sum_1qr + sum_3qi
    STR             r4, [r1, #64]       @y[16] = sum_0qr - sum_2qr
    STR             r5, [r1, #96]       @y[24] = sum_1qr - sum_3qi

    ADD             r2, r10, r12        @sum_0qi + sum_2qi
    SUB             r3, r11, r9         @sum_1qi - sum_3qr
    SUB             r4, r10, r12        @sum_0qi - sum_2qi
    ADD             r5, r11, r9         @sum_1qi + sum_3qr
    STR             r2, [r1, #4]        @y[0 +1] = sum_0qi + sum_2qi
    STR             r3, [r1, #36]       @y[8 +1] = sum_1qi - sum_3qr
    STR             r4, [r1, #68]       @y[16+1] = sum_0qi - sum_2qi
    STR             r5, [r1, #100]      @y[24+1] = sum_1qi + sum_3qr


    @Load twiddle factors.
    @Each register packs +K in its bottom halfword and -K (16-bit two's
    @complement) in its top halfword, so the B/T suffix of SMULWx/SMLAWx
    @selects the sign of the constant:
    @  r11 = { top: -0x7642, bottom: 0x7642 }
    @  r12 = { top: -0x30FC, bottom: 0x30FC }
    @  r14 = { top: -0x5A83, bottom: 0x5A83 }
    MOVW            r11, 0X7642
    MOVT            r11, 0X89BE
    MOVW            r12, 0X30FC
    MOVT            r12, 0XCF04
    MOVW            r14, 0X5A83
    MOVT            r14, 0XA57D

    @Second Butterfly: inputs x[8..15]
    LDR             r2, [r0, #32]       @mul_0qr = inp_0qr = x[8]
    LDR             r3, [r0, #36]       @mul_0qi = inp_0qi = x[9]

    LDR             r5, [r0, #40]       @inp_1qr = x[10]
    LDR             r6, [r0, #44]       @inp_1qi = x[11]
    SMULWB          r4, r5, r11         @mul_1qr  = mpy_16_32_ns( 0x7642 , inp_1qr)
    SMLAWB          r4, r6, r12, r4     @mul_1qr += mpy_16_32_ns( 0x30FC , inp_1qi)
    SMULWT          r5, r5, r12         @mul_1qi  = mpy_16_32_ns(-0x30FC , inp_1qr)

    LDR             r7, [r0, #48]       @inp_2qr = x[12]
    LDR             r8, [r0, #52]       @inp_2qi = x[13]

    @Placed after the loads above (instruction scheduling; the original
    @note called this a "delay slot" - presumably load latency)
    SMLAWB          r5, r6, r11, r5     @mul_1qi += mpy_16_32_ns( 0x7642 , inp_1qi)

    ADD             r6, r7, r8          @(inp_2qr + inp_2qi)
    SMULWB          r6, r6, r14         @mul_2qr = mpy_16_32_ns(0x5A83 , (inp_2qr + inp_2qi))
    SUB             r7, r8, r7          @(-inp_2qr + inp_2qi)
    SMULWB          r7, r7, r14         @mul_2qi = mpy_16_32_ns(0x5A83 , (-inp_2qr + inp_2qi))

    LDR             r9 , [r0, #56]      @inp_3qr = x[14]
    LDR             r10, [r0, #60]      @inp_3qi = x[15]
    SMULWB          r8, r9 , r12        @mul_3qr  = mpy_16_32_ns( 0x30FC , inp_3qr)
    SMLAWB          r8, r10, r11, r8    @mul_3qr += mpy_16_32_ns( 0x7642 , inp_3qi)
    SMULWT          r9, r9 , r11        @mul_3qi  = mpy_16_32_ns(-0x7642 , inp_3qr)
    SMLAWB          r9, r10, r12, r9    @mul_3qi += mpy_16_32_ns( 0x30FC , inp_3qi)

    @mul_0 was not multiplied, so only the multiplied terms get "<< 1"
    ADD             r10, r2, r6, lsl #1 @sum_0qr  = mul_0qr + (mul_2qr << 1)
    SUB             r2 , r2, r6, lsl #1 @sum_1qr  = mul_0qr - (mul_2qr << 1)
    ADD             r6 , r4, r8         @sum_2qr  = mul_1qr + mul_3qr
    SUB             r4 , r4, r8         @sum_3qr  = mul_1qr - mul_3qr

    ADD             r8 , r3, r7, lsl #1 @sum_0qi  = mul_0qi + (mul_2qi << 1)
    SUB             r3 , r3, r7, lsl #1 @sum_1qi  = mul_0qi - (mul_2qi << 1)
    ADD             r7 , r5, r9         @sum_2qi  = mul_1qi + mul_3qi
    SUB             r5 , r5, r9         @sum_3qi  = mul_1qi - mul_3qi

    ADD             r9 , r10, r6, lsl #1 @sum_0qr + (sum_2qr << 1)
    SUB             r10, r10, r6, lsl #1 @sum_0qr - (sum_2qr << 1)
    ADD             r6 , r2 , r5, lsl #1 @sum_1qr + (sum_3qi << 1)
    SUB             r2 , r2 , r5, lsl #1 @sum_1qr - (sum_3qi << 1)
    STR             r9 , [r1, #8]       @y[2 ] = sum_0qr + (sum_2qr << 1)
    STR             r10, [r1, #72]      @y[18] = sum_0qr - (sum_2qr << 1)
    STR             r6 , [r1, #40]      @y[10] = sum_1qr + (sum_3qi << 1)
    STR             r2 , [r1, #104]     @y[26] = sum_1qr - (sum_3qi << 1)

    ADD             r5 , r8 , r7, lsl #1 @sum_0qi + (sum_2qi << 1)
    SUB             r8 , r8 , r7, lsl #1 @sum_0qi - (sum_2qi << 1)
    SUB             r7 , r3 , r4, lsl #1 @sum_1qi - (sum_3qr << 1)
    ADD             r3 , r3 , r4, lsl #1 @sum_1qi + (sum_3qr << 1)
    STR             r5 , [r1, #12]      @y[2 +1] = sum_0qi + (sum_2qi << 1)
    STR             r8 , [r1, #76]      @y[18+1] = sum_0qi - (sum_2qi << 1)
    STR             r7 , [r1, #44]      @y[10+1] = sum_1qi - (sum_3qr << 1)
    STR             r3 , [r1, #108]     @y[26+1] = sum_1qi + (sum_3qr << 1)

    @Third Butterfly: inputs x[16..23]
    LDR             r2, [r0, #64]       @mul_0qr = inp_0qr = x[16]

    LDR             r5, [r0, #72]       @inp_1qr = x[18]
    LDR             r6, [r0, #76]       @inp_1qi = x[19]

    @Placed between the loads and their first use (instruction scheduling)
    LDR             r3, [r0, #68]       @mul_0qi = inp_0qi = x[17]

    ADD             r4, r5, r6          @(inp_1qr + inp_1qi)
    SMULWB          r4, r4, r14         @mul_1qr = mpy_16_32_ns(0x5A83 , (inp_1qr + inp_1qi))
    SUB             r5, r6, r5          @(-inp_1qr + inp_1qi)
    SMULWB          r5, r5, r14         @mul_1qi = mpy_16_32_ns(0x5A83 , (-inp_1qr + inp_1qi))

    @Input 2 is rotated without a multiply: the real part of the product is
    @inp_2qi and the imaginary part is -inp_2qr. The minus sign is not
    @applied here; it is folded into the SUB/ADD that form sum_0qi/sum_1qi.
    LDR             r6, [r0, #84]       @mul_2qr = inp_2qi = x[21]

    LDR             r9 , [r0, #88]      @inp_3qr = x[22]
    LDR             r10, [r0, #92]      @inp_3qi = x[23]

    @Placed between the loads and their first use (instruction scheduling)
    LDR             r7, [r0, #80]       @r7 = inp_2qr = x[20]  (mul_2qi = -r7)

    SUB             r8 , r10, r9        @(-inp_3qr + inp_3qi)
    SMULWB          r8 , r8 , r14       @mul_3qr = mpy_16_32_ns( 0x5A83 , (-inp_3qr + inp_3qi))
    ADD             r9 , r9 , r10       @(inp_3qr + inp_3qi)
    SMULWT          r9 , r9 , r14       @mul_3qi = mpy_16_32_ns(-0x5A83 , (inp_3qr + inp_3qi))

    @mul_0 and mul_2 were not multiplied here, so neither is shifted
    ADD             r10, r2, r6         @sum_0qr  = mul_0qr + mul_2qr
    SUB             r2 , r2, r6         @sum_1qr  = mul_0qr - mul_2qr
    ADD             r6 , r4, r8         @sum_2qr  = mul_1qr + mul_3qr
    SUB             r4 , r4, r8         @sum_3qr  = mul_1qr - mul_3qr

    SUB             r8 , r3, r7         @sum_0qi  = mul_0qi - inp_2qr  (= mul_0qi + mul_2qi)
    ADD             r3 , r3, r7         @sum_1qi  = mul_0qi + inp_2qr  (= mul_0qi - mul_2qi)
    ADD             r7 , r5, r9         @sum_2qi  = mul_1qi + mul_3qi
    SUB             r5 , r5, r9         @sum_3qi  = mul_1qi - mul_3qi

    ADD             r9 , r10, r6, lsl #1 @sum_0qr + (sum_2qr << 1)
    SUB             r10, r10, r6, lsl #1 @sum_0qr - (sum_2qr << 1)
    ADD             r6 , r2 , r5, lsl #1 @sum_1qr + (sum_3qi << 1)
    SUB             r2 , r2 , r5, lsl #1 @sum_1qr - (sum_3qi << 1)
    STR             r9 , [r1, #16]      @y[4 ] = sum_0qr + (sum_2qr << 1)
    STR             r10, [r1, #80]      @y[20] = sum_0qr - (sum_2qr << 1)
    STR             r6 , [r1, #48]      @y[12] = sum_1qr + (sum_3qi << 1)
    STR             r2 , [r1, #112]     @y[28] = sum_1qr - (sum_3qi << 1)

    ADD             r5, r8, r7, lsl #1  @sum_0qi + (sum_2qi << 1)
    SUB             r8, r8, r7, lsl #1  @sum_0qi - (sum_2qi << 1)
    SUB             r7, r3, r4, lsl #1  @sum_1qi - (sum_3qr << 1)
    ADD             r3, r3, r4, lsl #1  @sum_1qi + (sum_3qr << 1)
    STR             r5 , [r1, #20]      @y[4 +1] = sum_0qi + (sum_2qi << 1)
    STR             r8 , [r1, #84]      @y[20+1] = sum_0qi - (sum_2qi << 1)
    STR             r7 , [r1, #52]      @y[12+1] = sum_1qi - (sum_3qr << 1)
    STR             r3 , [r1, #116]     @y[28+1] = sum_1qi + (sum_3qr << 1)

    @Fourth Butterfly: inputs x[24..31]
    LDR             r2, [r0, #96]       @mul_0qr = inp_0qr = x[24]
    LDR             r3, [r0, #100]      @mul_0qi = inp_0qi = x[25]

    LDR             r5, [r0, #104]      @inp_1qr = x[26]
    LDR             r6, [r0, #108]      @inp_1qi = x[27]
    SMULWB          r4, r5, r12         @mul_1qr  = mpy_16_32_ns( 0x30FC , inp_1qr)
    SMLAWB          r4, r6, r11, r4     @mul_1qr += mpy_16_32_ns( 0x7642 , inp_1qi)
    SMULWT          r5, r5, r11         @mul_1qi  = mpy_16_32_ns(-0x7642 , inp_1qr)

    LDR             r7, [r0, #112]      @inp_2qr = x[28]
    LDR             r8, [r0, #116]      @inp_2qi = x[29]

    @Placed after the loads above (instruction scheduling)
    SMLAWB          r5, r6, r12, r5     @mul_1qi += mpy_16_32_ns( 0x30FC , inp_1qi)

    SUB             r6, r8, r7          @(-inp_2qr + inp_2qi)
    SMULWB          r6, r6, r14         @mul_2qr = mpy_16_32_ns( 0x5A83 , (-inp_2qr + inp_2qi))
    ADD             r7, r8, r7          @(inp_2qr + inp_2qi)
    SMULWT          r7, r7, r14         @mul_2qi = mpy_16_32_ns(-0x5A83 , (inp_2qr + inp_2qi))

    LDR             r9 , [r0, #120]     @inp_3qr = x[30]
    LDR             r10, [r0, #124]     @inp_3qi = x[31]
    SMULWT          r8, r9 , r11        @mul_3qr  = mpy_16_32_ns(-0x7642 , inp_3qr)
    SMLAWT          r8, r10, r12, r8    @mul_3qr += mpy_16_32_ns(-0x30FC , inp_3qi)
    SMULWB          r9, r9 , r12        @mul_3qi  = mpy_16_32_ns( 0x30FC , inp_3qr)
    SMLAWT          r9, r10, r11, r9    @mul_3qi += mpy_16_32_ns(-0x7642 , inp_3qi)

    ADD             r10, r2, r6, lsl #1 @sum_0qr  = mul_0qr + (mul_2qr << 1)
    SUB             r2 , r2, r6, lsl #1 @sum_1qr  = mul_0qr - (mul_2qr << 1)
    ADD             r6 , r4, r8         @sum_2qr  = mul_1qr + mul_3qr
    SUB             r4 , r4, r8         @sum_3qr  = mul_1qr - mul_3qr

    ADD             r8 , r3, r7, lsl #1 @sum_0qi  = mul_0qi + (mul_2qi << 1)
    SUB             r3 , r3, r7, lsl #1 @sum_1qi  = mul_0qi - (mul_2qi << 1)
    ADD             r7 , r5, r9         @sum_2qi  = mul_1qi + mul_3qi
    SUB             r5 , r5, r9         @sum_3qi  = mul_1qi - mul_3qi

    ADD             r9 , r10, r6, lsl #1 @sum_0qr + (sum_2qr << 1)
    SUB             r10, r10, r6, lsl #1 @sum_0qr - (sum_2qr << 1)
    ADD             r6 , r2 , r5, lsl #1 @sum_1qr + (sum_3qi << 1)
    SUB             r2 , r2 , r5, lsl #1 @sum_1qr - (sum_3qi << 1)
    STR             r9 , [r1, #24]      @y[6 ] = sum_0qr + (sum_2qr << 1)
    STR             r10, [r1, #88]      @y[22] = sum_0qr - (sum_2qr << 1)
    STR             r6 , [r1, #56]      @y[14] = sum_1qr + (sum_3qi << 1)
    STR             r2 , [r1, #120]     @y[30] = sum_1qr - (sum_3qi << 1)

    ADD             r5 , r8 , r7, lsl #1 @sum_0qi + (sum_2qi << 1)
    SUB             r8 , r8 , r7, lsl #1 @sum_0qi - (sum_2qi << 1)
    SUB             r7 , r3 , r4, lsl #1 @sum_1qi - (sum_3qr << 1)
    ADD             r3 , r3 , r4, lsl #1 @sum_1qi + (sum_3qr << 1)
    STR             r5 , [r1, #28]      @y[6 +1] = sum_0qi + (sum_2qi << 1)
    STR             r8 , [r1, #92]      @y[22+1] = sum_0qi - (sum_2qi << 1)
    STR             r7 , [r1, #60]      @y[14+1] = sum_1qi - (sum_3qr << 1)
    STR             r3 , [r1, #124]     @y[30+1] = sum_1qi + (sum_3qr << 1)

    @Restore r4-r12 and return: the saved lr is popped into pc (r15)
    LDMFD           sp!, {r4-r12, r15}
