///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Target: AArch64, GNU assembler syntax, Advanced SIMD (NEON).

// push_v_regs
// Spills q8-q15, x8-x17 and x29/x30 to the stack (4*32 + 6*16 = 224 bytes,
// so sp keeps its 16-byte alignment). Must be paired with pop_v_regs, which
// reloads in exactly the reverse order.
// Of the registers this file writes, AAPCS64 only requires v8-v15 to be
// preserved; the remaining saves are a superset of that requirement.
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm

// pop_v_regs
// Mirror image of push_v_regs: restores x29/x30, x17-x8 and q15-q8 and
// releases the 224 bytes of stack.
.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm

// ola1_sat_mul_v4 dst, win
// dst.4S = saturate_to_32( sign_extend(win.4H) * V4.4S ), lane by lane.
// The 64-bit products are narrowed with SQXTN/SQXTN2 (no right shift).
// Reads V4; clobbers V27, V28, V29.
.macro ola1_sat_mul_v4 dst, win
    SSHLL           V27.4S, \win\().4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             \dst\().16B, V28.16B
.endm

//------------------------------------------------------------------------------
// ixheaacd_over_lap_add1_armv8
//
// Inputs, as the code below reads them:
//   X0 : base of a 32-bit array; read backwards, four words per load,
//        starting at element (2*X5 - 4)
//   X1 : base of a 32-bit array; read forwards, four words per load
//   X2 : base of the 32-bit output array
//   X3 : base of a 16-bit table; read backwards, eight halfwords per load
//        (de-interleaved into even/odd), starting at element (2*X5 - 8)
//   W4 : shift count; its low halfword is broadcast and used as the per-lane
//        SQSHL shift
//   X5 : element count
//   X6 : output stride, in 32-bit elements
//
//   NOTE(review): judging by the function name, X0/X1/X3 are presumably the
//   current coefficients, the previous-frame overlap buffer and the window
//   table -- confirm against the C prototype.
//   NOTE(review): X5 and X6 are used at 64-bit width before any narrowing,
//   which relies on the caller passing them already extended to 64 bits --
//   confirm against the C prototype.
//
// Outputs: two streams of 32-bit results written through X2:
//   descending stream : starts at element X6*(X5-1), steps by -X6 elements
//   ascending stream  : starts at element X6*X5,     steps by +X6 elements
//   Both start offsets are formed with a 16x16->32 SMULL, so only the low
//   16 bits of X5 (resp. X5-1) and of 2*X6 take part in them.
//
// Size precondition: the loop body always runs once (8 results per stream)
//   and the tail writes 8 more, so at least 16 results are written per
//   stream, in steps of 8.
//   NOTE(review): callers must therefore pass X5 >= 16 and a multiple of 8
//   -- confirm.
//
// Per 4-lane group (x = X0 words, p = X1 words, we/wo = even/odd table
// halfwords, x and we/wo taken in reversed lane order):
//   desc = sqsub( sqshl( ((lo16(x)*we) >> 16) + hi16(x)*we, shift ),
//                 sat32( p * wo ) )
//   asc  = sqsub( sqshl( ((lo16(-x)*wo) >> 16) + hi16(-x)*wo, shift ),
//                 sat32( p * we ) )
//   where -x is the saturating negation (SQNEG), the lo16 products are
//   unsigned (UMULL) and the hi16 products are signed (SMLAL).
//
// Clobbers: X0-X6 are modified and not restored; V0-V7, V16, V17, V27-V31
//   are modified. Everything pushed by push_v_regs is restored.
//------------------------------------------------------------------------------
.text
.global ixheaacd_over_lap_add1_armv8
ixheaacd_over_lap_add1_armv8:
    push_v_regs

    // ---- pointer / stride setup -------------------------------------------
    LSL             X6, X6, #1                  // X6 = 2 * stride
    LSL             X10, X5, #1                 // X10 = 2 * count
    SUB             X11, X10, #1                // X11 = 2 * count - 1
    LSL             X10, X11, #2
    ADD             X10, X0, X10
    SUB             X10, X10, #12               // X10 = &X0[2*count - 4]
    LSL             X8, X11, #1
    ADD             X8, X8, X3
    SUB             X8, X8, #14                 // X8 = &X3[2*count - 8]
    MOV             X12, #-16                   // both reverse reads step back 16 bytes
    DUP             V11.8H, W4                  // V11 = shift count for SQSHL
    LD1             {V3.4S}, [X10], X12         // first four X0 words

    // Split the words (V7 = low halves, V6 = high halves) and their
    // saturating negation (V31 = low, V30 = high); REV64 reverses lane order.
    SQNEG           V0.4S, V3.4S
    UZP1            V31.8H, V0.8H, V0.8H
    UZP2            V30.8H, V0.8H, V0.8H
    REV64           V31.8H, V31.8H
    REV64           V30.8H, V30.8H
    SUB             X11, X5, #1                 // X11 = count - 1
    UZP1            V7.8H, V3.8H, V3.8H
    UZP2            V6.8H, V3.8H, V3.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H

    // Descending store pointer: X11 = X2 + 2 * ((2*stride) * (count - 1)).
    // Only lane 0 of the SMULL result is read back.
    MOV             V16.S[0], W6
    MOV             V17.S[0], W11
    SMULL           V17.4S, V16.4H, V17.4H
    MOV             W11, V17.S[0]
    LSL             X11, X11, #1

    LD2             {V2.4H, V3.4H}, [X8], X12   // V2 = even, V3 = odd table halfwords
    ADD             X11, X11, X2
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H
    LSL             X4, X6, #1
    NEG             X4, X4                      // X4 = -(4 * stride) bytes, descending step
    LSL             X9, X6, #1                  // X9 = +(4 * stride) bytes, ascending step

    // Ascending store pointer: X6 = X2 + 2 * (count * (2*stride)).
    MOV             V16.S[0], W5
    MOV             V17.S[0], W6
    SMULL           V17.4S, V16.4H, V17.4H
    MOV             W6, V17.S[0]
    LSL             W6, W6, #1
    ADD             X6, X6, X2

    // ---- pipeline fill: compute the first group of four results -----------
    UMULL           V15.4S, V7.4H, V2.4H
    LD1             {V4.4S}, [X1], #16          // first four X1 words
    USHR            V15.4S, V15.4S, #16

    SMLAL           V15.4S, V6.4H, V2.4H
    SQSHL           V15.4S, V15.4S, V11.4S
    ola1_sat_mul_v4 V14, V3

    SQSUB           V13.4S, V15.4S, V14.4S      // V13 = pending descending results
    UMULL           V12.4S, V31.4H, V3.4H
    USHR            V12.4S, V12.4S, #16
    SMLAL           V12.4S, V30.4H, V3.4H
    SQSHL           V12.4S, V12.4S, V11.4S
    LD1             {V3.4S}, [X10], X12         // next four X0 words (odd table values already consumed)

    ola1_sat_mul_v4 V8, V2

    // Prepare operands for the next group while finishing this one.
    SQNEG           V0.4S, V3.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.8H, V1.8H
    REV64           V0.8H, V0.8H
    SQSUB           V9.4S, V12.4S, V8.4S        // V9 = pending ascending results
    UZP1            V7.8H, V3.8H, V3.8H
    UZP2            V6.8H, V3.8H, V3.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H
    LD2             {V2.4H, V3.4H}, [X8], X12
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H

    LD1             {V4.4S}, [X1], #16
    SUB             W5, W5, #8                  // loop counter = count - 8; the W write clears X5[63:32]

    // ---- steady state: 8 results per stream per iteration -----------------
    // Each half stores the pending V13/V9 while computing the next group;
    // stores are interleaved with arithmetic, so instruction order matters.
LOOP_1:

    // First half.
    ST1             {V13.S}[0], [X11], X4
    UMULL           V15.4S, V7.4H, V2.4H
    ST1             {V13.S}[1], [X11], X4
    UMULL           V12.4S, V1.4H, V3.4H
    ST1             {V13.S}[2], [X11], X4
    USHR            V15.4S, V15.4S, #16
    ST1             {V13.S}[3], [X11], X4
    USHR            V12.4S, V12.4S, #16
    ST1             {V9.S}[0], [X6], X9
    SMLAL           V15.4S, V6.4H, V2.4H
    ST1             {V9.S}[1], [X6], X9
    SMLAL           V12.4S, V0.4H, V3.4H
    ST1             {V9.S}[2], [X6], X9
    SQSHL           V15.4S, V15.4S, V11.4S
    ST1             {V9.S}[3], [X6], X9
    SQSHL           V12.4S, V12.4S, V11.4S
    LD1             {V6.4S}, [X10], X12         // next X0 words go to V6: V3 still holds table values used below

    ola1_sat_mul_v4 V14, V3

    ola1_sat_mul_v4 V8, V2

    LD2             {V2.4H, V3.4H}, [X8], X12

    SQNEG           V0.4S, V6.4S

    LD1             {V4.4S}, [X1], #16

    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.8H, V1.8H
    REV64           V0.8H, V0.8H
    UZP1            V7.8H, V6.8H, V6.8H
    UZP2            V6.8H, V6.8H, V6.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H
    SQSUB           V13.4S, V15.4S, V14.4S
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H
    SQSUB           V9.4S, V12.4S, V8.4S

    // Second half.
    UMULL           V15.4S, V7.4H, V2.4H
    ST1             {V13.S}[0], [X11], X4
    UMULL           V12.4S, V1.4H, V3.4H
    USHR            V15.4S, V15.4S, #16
    ST1             {V13.S}[1], [X11], X4
    SMLAL           V15.4S, V6.4H, V2.4H
    ST1             {V13.S}[2], [X11], X4
    USHR            V12.4S, V12.4S, #16
    ST1             {V13.S}[3], [X11], X4
    SMLAL           V12.4S, V0.4H, V3.4H
    ST1             {V9.S}[0], [X6], X9
    SQSHL           V15.4S, V15.4S, V11.4S
    ST1             {V9.S}[1], [X6], X9
    SQSHL           V12.4S, V12.4S, V11.4S
    ST1             {V9.S}[2], [X6], X9

    ola1_sat_mul_v4 V14, V3

    ST1             {V9.S}[3], [X6], X9

    ola1_sat_mul_v4 V8, V2

    LD1             {V3.4S}, [X10], X12

    SQNEG           V0.4S, V3.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.8H, V1.8H
    REV64           V0.8H, V0.8H
    SQSUB           V13.4S, V15.4S, V14.4S
    UZP1            V7.8H, V3.8H, V3.8H
    UZP2            V6.8H, V3.8H, V3.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H

    LD2             {V2.4H, V3.4H}, [X8], X12
    SQSUB           V9.4S, V12.4S, V8.4S
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H

    LD1             {V4.4S}, [X1], #16

    SUBS            X5, X5, #8                  // 8 results per stream done this iteration

    BGT             LOOP_1

    // ---- pipeline drain: store the pending group, compute and store the
    // ---- last one (operands were loaded by the final loop iteration) -------
    ST1             {V13.S}[0], [X11], X4
    UMULL           V15.4S, V7.4H, V2.4H
    ST1             {V13.S}[1], [X11], X4
    UMULL           V12.4S, V1.4H, V3.4H
    ST1             {V13.S}[2], [X11], X4
    USHR            V15.4S, V15.4S, #16
    ST1             {V13.S}[3], [X11], X4
    USHR            V12.4S, V12.4S, #16

    ST1             {V9.S}[0], [X6], X9
    SMLAL           V15.4S, V6.4H, V2.4H
    ST1             {V9.S}[1], [X6], X9
    SMLAL           V12.4S, V0.4H, V3.4H
    ST1             {V9.S}[2], [X6], X9
    SQSHL           V15.4S, V15.4S, V11.4S
    ST1             {V9.S}[3], [X6], X9
    SQSHL           V12.4S, V12.4S, V11.4S

    ola1_sat_mul_v4 V14, V3

    ola1_sat_mul_v4 V8, V2

    SQSUB           V13.4S, V15.4S, V14.4S
    SQSUB           V9.4S, V12.4S, V8.4S

    ST1             {V13.S}[0], [X11], X4
    ST1             {V13.S}[1], [X11], X4
    ST1             {V13.S}[2], [X11], X4
    ST1             {V13.S}[3], [X11], X4

    ST1             {V9.S}[0], [X6], X9
    ST1             {V9.S}[1], [X6], X9
    ST1             {V9.S}[2], [X6], X9
    ST1             {V9.S}[3], [X6], X9
    pop_v_regs
    RET