;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vpx_idct4x4_16_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    INCLUDE vpx_dsp/arm/idct_neon.asm.S

    AREA     Block, CODE, READONLY
;void vpx_idct4x4_16_add_neon(const int16_t *input, uint8_t *dest, int stride)
;
; r0  const int16_t *input
; r1  uint8_t *dest
; r2  int stride

|vpx_idct4x4_16_add_neon| PROC

    ; The 2D transform is done with two passes which are actually quite
    ; similar. We first transform the rows: transpose the inputs, do a
    ; SIMD column transform (the columns are the transposed rows) and
    ; transpose the results back into normal/row positions. Then we
    ; transform the columns with another SIMD column transform.
    ; So: two passes, each a transpose followed by a column transform.

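    ; For reference, each pass computes, per column, roughly the scalar
    ; idct4 routine from vpx_dsp/inv_txfm.c (a sketch; names per that file):
    ;
    ;   temp1 = (input[0] + input[2]) * cospi_16_64;
    ;   temp2 = (input[0] - input[2]) * cospi_16_64;
    ;   step[0] = dct_const_round_shift(temp1);
    ;   step[1] = dct_const_round_shift(temp2);
    ;   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ;   temp2 = input[1] * cospi_8_64  + input[3] * cospi_24_64;
    ;   step[2] = dct_const_round_shift(temp1);
    ;   step[3] = dct_const_round_shift(temp2);
    ;   output[0] = step[0] + step[3];
    ;   output[1] = step[1] + step[2];
    ;   output[2] = step[1] - step[2];
    ;   output[3] = step[0] - step[3];
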
    ; load the inputs into q8-q9, d16-d19
    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

    ; generate scalar constants
    ; (cospi_N_64 = round(2^14 * cos(N * pi / 64)))
    ; cospi_8_64 = 15137
    movw            r0, #0x3b21
    ; cospi_16_64 = 11585
    movw            r3, #0x2d41
    ; cospi_24_64 = 6270
    movw            r12, #0x187e

    ; transpose the input data
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; generate constant vectors
    vdup.16         d20, r0         ; replicate cospi_8_64
    vdup.16         d21, r3         ; replicate cospi_16_64

    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    vdup.16         d22, r12        ; replicate cospi_24_64

    ; do the transform on transposed rows

    ; stage 1
    ; (vmull.s16 widens to 32 bits, so full precision is kept until the
    ; vrshrn narrowing round-shift below)
    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q8,  d16, d21
    vmull.s16 q14, d18, d21
    vadd.s32  q13, q8,  q14
    vsub.s32  q14, q8,  q14

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1,  d19, d22

    ; dct_const_round_shift
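    ; (dct_const_round_shift(x) is ROUND_POWER_OF_TWO(x, DCT_CONST_BITS),
    ; with DCT_CONST_BITS = 14 in vpx_dsp/txfm_common.h, hence the #14
    ; narrowing round-shifts)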
    vrshrn.s32 d26, q13, #14        ; d26 = step[0]
    vrshrn.s32 d27, q14, #14        ; d27 = step[1]
    vrshrn.s32 d29, q15, #14        ; d29 = step[2]
    vrshrn.s32 d28, q1,  #14        ; d28 = step[3]

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16 q8,  q13, q14          ; q8 = {output[0], output[1]}
    vsub.s16 q9,  q13, q14          ; q9 = {output[3], output[2]}
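    ; the vsub leaves q9 with its halves in reverse row order; swap
    ; d18/d19 so that d18 = output[2] and d19 = output[3] before the
    ; transpose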
    vswp     d18, d19

    ; transpose the results
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19
    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    ; do the transform on columns

    ; stage 1
    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
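    ; (pass 1 multiplied first and added/subtracted the 32-bit products;
    ; here the add/subtract is done first at 16-bit precision, presumably
    ; because the round-shifted pass 1 outputs are small enough not to
    ; overflow, which the raw input coefficients might)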

    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q13, d23, d21
    vmull.s16 q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1,  d19, d22

    ; dct_const_round_shift
    vrshrn.s32 d26, q13, #14        ; d26 = step[0]
    vrshrn.s32 d27, q14, #14        ; d27 = step[1]
    vrshrn.s32 d29, q15, #14        ; d29 = step[2]
    vrshrn.s32 d28, q1,  #14        ; d28 = step[3]

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16 q8,  q13, q14          ; q8 = {output[0], output[1]}
    vsub.s16 q9,  q13, q14          ; q9 = {output[3], output[2]}

    ; The results are in two registers, one of them being swapped. This will
    ; be taken care of by loading the 'dest' values in a swapped fashion and
    ; also storing them in the same swapped fashion.
    ; temp_out[0, 1] = d16, d17 = q8
    ; temp_out[2, 3] = d19, d18 = q9 swapped

    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16 q8, q8, #4
    vrshr.s16 q9, q9, #4

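    ; load the 4x4 destination block, one 32-bit row per lane; row 2 goes
    ; into d27[1] and row 3 into d27[0], matching the swapped q9 above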
    vld1.32 {d26[0]}, [r1], r2
    vld1.32 {d26[1]}, [r1], r2
    vld1.32 {d27[1]}, [r1], r2
    vld1.32 {d27[0]}, [r1]  ; no post-increment

    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
    vaddw.u8 q8, q8, d26
    vaddw.u8 q9, q9, d27

    ; clip_pixel
    vqmovun.s16 d26, q8
    vqmovun.s16 d27, q9
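    ; together, the vrshr/vaddw.u8/vqmovun.s16 sequence computes, per pixel,
    ; dest[j * stride + i] =
    ;     clip_pixel(dest[j * stride + i] + ROUND_POWER_OF_TWO(temp_out[j], 4))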

    ; do the stores in reverse order with negative post-increment, by changing
    ; the sign of the stride
    rsb r2, r2, #0                  ; r2 = -stride
    vst1.32 {d27[0]}, [r1], r2
    vst1.32 {d27[1]}, [r1], r2
    vst1.32 {d26[1]}, [r1], r2
    vst1.32 {d26[0]}, [r1]  ; no post-increment
    bx              lr
    ENDP  ; |vpx_idct4x4_16_add_neon|

    END