;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vpx_idct4x4_16_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    INCLUDE vpx_dsp/arm/idct_neon.asm.S

    AREA     Block, CODE, READONLY
;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
;
; Computes the 4x4 inverse DCT of |input| and adds the (rounded, clipped)
; result onto the 4x4 pixel block at |dest|.
;
; r0  int16_t *input   ; 16 quantized transform coefficients, row-major
; r1  uint8_t *dest    ; destination/prediction block the result is added to
; r2  int stride       ; byte stride between rows of dest
;
; Only r0-r3 and r12 are used as scratch (caller-clobbered under AAPCS),
; so no registers need to be saved.

|vpx_idct4x4_16_add_neon| PROC

    ; The 2D transform is done with two passes which are actually pretty
    ; similar. We first transform the rows. This is done by transposing
    ; the inputs, doing an SIMD column transform (the columns are the
    ; transposed rows) and then transpose the results (so that it goes back
    ; in normal/row positions). Then, we transform the columns by doing
    ; another SIMD column transform.
    ; So, two passes of a transpose followed by a column transform.

    ; load the inputs into q8-q9, d16-d19
    ; (macro from idct_neon.asm.S; widens tran_low_t to int16_t if needed)
    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

    ; generate scalar constants
    ; r0 is free for reuse now that the coefficients have been loaded
    ; cospi_8_64 = 15137
    movw            r0, #0x3b21
    ; cospi_16_64 = 11585
    movw            r3, #0x2d41
    ; cospi_24_64 = 6270
    movw            r12, #0x187e

    ; transpose the input data
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; generate constant vectors
    vdup.16         d20, r0         ; replicate cospi_8_64
    vdup.16         d21, r3         ; replicate cospi_16_64

    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    vdup.16         d22, r12        ; replicate cospi_24_64

    ; do the transform on transposed rows

    ; stage 1
    vmull.s16       q15, d17, d22   ; input[1] * cospi_24_64
    vmull.s16       q1, d17, d20    ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    ; multiply first, then add/sub the 32-bit products (keeps full precision)
    vmull.s16       q8, d16, d21
    vmull.s16       q14, d18, d21
    vadd.s32        q13, q8, q14
    vsub.s32        q14, q8, q14

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
    vmlsl.s16       q15, d19, d20
    vmlal.s16       q1, d19, d22

    ; dct_const_round_shift: round and narrow by 14 (DCT_CONST_BITS)
    ; step[0], step[1]     -> d26, d27 (q13)
    ; step[3], step[2]     -> d29, d28 (q14, note the swapped order)
    vrshrn.s32      d26, q13, #14
    vrshrn.s32      d27, q14, #14
    vrshrn.s32      d29, q15, #14
    vrshrn.s32      d28, q1, #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16        q8, q13, q14
    vsub.s16        q9, q13, q14
    ; q9 currently holds output[3], output[2]; swap to output[2], output[3]
    vswp            d18, d19

    ; transpose the results
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19
    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    ; do the transform on columns
    ; (the constant vectors d20-d22 set up above are still live)

    ; stage 1
    vadd.s16        d23, d16, d18   ; (input[0] + input[2])
    vsub.s16        d24, d16, d18   ; (input[0] - input[2])

    vmull.s16       q15, d17, d22   ; input[1] * cospi_24_64
    vmull.s16       q1, d17, d20    ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16       q13, d23, d21
    vmull.s16       q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
    vmlsl.s16       q15, d19, d20
    vmlal.s16       q1, d19, d22

    ; dct_const_round_shift: round and narrow by 14 (DCT_CONST_BITS)
    vrshrn.s32      d26, q13, #14
    vrshrn.s32      d27, q14, #14
    vrshrn.s32      d29, q15, #14
    vrshrn.s32      d28, q1, #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16        q8, q13, q14
    vsub.s16        q9, q13, q14

    ; The results are in two registers, one of them being swapped. This will
    ; be taken care of by loading the 'dest' value in a swapped fashion and
    ; also storing them in the same swapped fashion.
    ; temp_out[0, 1] = d16, d17 = q8
    ; temp_out[2, 3] = d19, d18 = q9 swapped

    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16       q8, q8, #4
    vrshr.s16       q9, q9, #4

    ; load dest rows 0..3; row 2 and 3 go into swapped lanes (d27[1], d27[0])
    ; to match the swapped layout of q9 noted above
    vld1.32         {d26[0]}, [r1], r2
    vld1.32         {d26[1]}, [r1], r2
    vld1.32         {d27[1]}, [r1], r2
    vld1.32         {d27[0]}, [r1]  ; no post-increment

    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
    ; widening add: u8 pixels + s16 residual -> 16-bit sums
    vaddw.u8        q8, q8, d26
    vaddw.u8        q9, q9, d27

    ; clip_pixel: saturating narrow back to unsigned 8-bit
    vqmovun.s16     d26, q8
    vqmovun.s16     d27, q9

    ; do the stores in reverse order with negative post-increment, by changing
    ; the sign of the stride (r1 still points at the last row)
    rsb             r2, r2, #0
    vst1.32         {d27[0]}, [r1], r2
    vst1.32         {d27[1]}, [r1], r2
    vst1.32         {d26[1]}, [r1], r2
    vst1.32         {d26[0]}, [r1]  ; no post-increment
    bx              lr
    ENDP  ; |vpx_idct4x4_16_add_neon|

    END