/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
// In the comments, let RefTable denote the original, reference table.
const x_by_x_tables
// RangeMins
//
// Min(RefTable[i*8:i*8+8])
// First two values are zeroed.
//
// Lookup using RangeMins[(x >> 3)]
        .byte 0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2
        .byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0

// DiffMasks
//
// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
// RefTable changes at that particular index.
// Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
// the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
//
// Lookup using DiffMasks[(x >> 3)]
        .byte 0x00, 0x00, 0xD4, 0x44
        .byte 0x42, 0x04, 0x00, 0x00
        .byte 0x00, 0x80, 0x00, 0x00
        .byte 0x04, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x40, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x02
// Binary form:
// 0b00000000, 0b00000000, 0b11010100, 0b01000100
// 0b01000010, 0b00000100, 0b00000000, 0b00000000
// 0b00000000, 0b10000000, 0b00000000, 0b00000000
// 0b00000100, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b01000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000000
// 0b00000000, 0b00000000, 0b00000000, 0b00000010

// RefLo
//
// RefTable[0:16]
// i.e. First 16 elements of the original table.
// Add to the sum obtained in the rest of the other lut logic to include the first 16 bytes of RefTable.
//
// Lookup using RefLo[x] (tbl will replace x > 15 with 0)
        .byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16

// Pseudo assembly
//
// hi_bits = x >> 3
// tbl ref, {RefLo}, x
// tbl diffs, {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
// tbl min, {RangeMins[0:16], RangeMins[16:32]}, hi_bits
// lo_bits = x & 0x7
// diffs = diffs << lo_bits
// ref = ref + min
// integral = popcnt(diffs)
// ref = ref + integral
// return ref
endconst

// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
//
// Sums three rows of sumsq/sum vertically, then computes the sgr AA/BB
// outputs for each pixel, using the x_by_x_tables LUTs above to evaluate
// the 1 - x/(x+1) table. Processes 16 pixels per loop iteration.
function sgr_box3_vert_neon, export=1
        // Save callee-saved SIMD regs (AAPCS64: low 64 bits of v8-v15).
        stp d8, d9, [sp, #-0x40]!
        stp d10, d11, [sp, #0x10]
        stp d12, d13, [sp, #0x20]
        stp d14, d15, [sp, #0x30]

        add w4, w4, #2                                     // w += 2
        clz w9, w6                                         // bitdepth_max
        dup v28.4s, w5                                     // strength

        ldp x5, x6, [x0]                                   // x5, x6, x0 = sumsq row pointers
        ldr x0, [x0, #16]
        ldp x7, x8, [x1]                                   // x7, x8, x1 = sum row pointers
        ldr x1, [x1, #16]

        movi v31.4s, #9                                    // n

        sub w9, w9, #24                                    // -bitdepth_min_8
        movrel x12, x_by_x_tables
        mov w13, #455                                      // one_by_x
        ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x12]    // RangeMins, DiffMasks
        movi v22.16b, #0x7                                 // lo_bits mask
        ldr q23, [x12, #64]                                // RefLo
        dup v6.8h, w9                                      // -bitdepth_min_8
        saddl v7.4s, v6.4h, v6.4h                          // -2*bitdepth_min_8
        movi v29.8h, #1, lsl #8                            // 256
        dup v30.4s, w13                                    // one_by_x

        // Preload the first two sumsq/sum rows; the third row and the
        // next iteration's data are loaded inside the loop.
        ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x5], #64
        ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
        ld1 {v20.8h, v21.8h}, [x8], #32
        ld1 {v0.8h, v1.8h}, [x7], #32
1:
        ld1 {v2.8h, v3.8h}, [x1], #32
        // Vertical 3-row sums: sumsq in v16-v19, sum in v4/v5.
        add v8.4s, v8.4s, v12.4s
        add v9.4s, v9.4s, v13.4s
        add v10.4s, v10.4s, v14.4s
        add v11.4s, v11.4s, v15.4s
        add v0.8h, v0.8h, v20.8h
        add v1.8h, v1.8h, v21.8h

        add v16.4s, v16.4s, v8.4s
        add v17.4s, v17.4s, v9.4s
        add v18.4s, v18.4s, v10.4s
        add v19.4s, v19.4s, v11.4s
        add v4.8h, v2.8h, v0.8h
        add v5.8h, v3.8h, v1.8h

        // Normalize to 8-bit depth scale before forming p = a*n - b*b.
        srshl v16.4s, v16.4s, v7.4s
        srshl v17.4s, v17.4s, v7.4s
        srshl v18.4s, v18.4s, v7.4s
        srshl v19.4s, v19.4s, v7.4s
        srshl v9.8h, v4.8h, v6.8h
        srshl v13.8h, v5.8h, v6.8h
        mul v16.4s, v16.4s, v31.4s                         // a * n
        mul v17.4s, v17.4s, v31.4s                         // a * n
        mul v18.4s, v18.4s, v31.4s                         // a * n
        mul v19.4s, v19.4s, v31.4s                         // a * n
        umull v8.4s, v9.4h, v9.4h                          // b * b
        umull2 v9.4s, v9.8h, v9.8h                         // b * b
        umull v12.4s, v13.4h, v13.4h                       // b * b
        umull2 v13.4s, v13.8h, v13.8h                      // b * b
        uqsub v16.4s, v16.4s, v8.4s                        // imax(a * n - b * b, 0)
        uqsub v17.4s, v17.4s, v9.4s                        // imax(a * n - b * b, 0)
        uqsub v18.4s, v18.4s, v12.4s                       // imax(a * n - b * b, 0)
        uqsub v19.4s, v19.4s, v13.4s                       // imax(a * n - b * b, 0)
        mul v16.4s, v16.4s, v28.4s                         // p * s
        mul v17.4s, v17.4s, v28.4s                         // p * s
        mul v18.4s, v18.4s, v28.4s                         // p * s
        mul v19.4s, v19.4s, v28.4s                         // p * s
        uqshrn v16.4h, v16.4s, #16
        uqshrn2 v16.8h, v17.4s, #16
        uqshrn v18.4h, v18.4s, #16
        uqshrn2 v18.8h, v19.4s, #16
        uqrshrn v1.8b, v16.8h, #4                          // imin(z, 255)
        uqrshrn2 v1.16b, v18.8h, #4                        // imin(z, 255)

        // Loads for the next iteration are interleaved with the LUT
        // evaluation below to hide memory latency.
        ld1 {v16.4s, v17.4s}, [x0], #32
        subs w4, w4, #16                                   // 16 pixels per iteration

        // LUT evaluation, see the pseudo assembly in x_by_x_tables.
        // v24/v25 hold RangeMins, v26/v27 hold DiffMasks (load order above).
        ushr v0.16b, v1.16b, #3                            // hi_bits = x >> 3
        ld1 {v8.4s, v9.4s}, [x5], #32
        tbl v2.16b, {v26.16b, v27.16b}, v0.16b             // DiffMasks[hi_bits]
        tbl v0.16b, {v24.16b, v25.16b}, v0.16b             // RangeMins[hi_bits]
        tbl v3.16b, {v23.16b}, v1.16b                      // RefLo[x]
        and v1.16b, v1.16b, v22.16b                        // lo_bits = x & 0x7
        ld1 {v12.4s, v13.4s}, [x6], #32
        ushl v1.16b, v2.16b, v1.16b                        // diffs << lo_bits
        ld1 {v20.8h, v21.8h}, [x8], #32
        add v3.16b, v3.16b, v0.16b                         // ref + min
        cnt v1.16b, v1.16b                                 // popcnt(diffs)
        ld1 {v18.4s, v19.4s}, [x0], #32
        add v3.16b, v3.16b, v1.16b                         // ref + integral
        ld1 {v10.4s, v11.4s}, [x5], #32
        uxtl v0.8h, v3.8b                                  // x
        uxtl2 v1.8h, v3.16b                                // x

        ld1 {v14.4s, v15.4s}, [x6], #32

        umull v2.4s, v0.4h, v4.4h                          // x * BB[i]
        umull2 v3.4s, v0.8h, v4.8h                         // x * BB[i]
        umull v4.4s, v1.4h, v5.4h                          // x * BB[i]
        umull2 v5.4s, v1.8h, v5.8h                         // x * BB[i]
        sub v0.8h, v29.8h, v0.8h                           // 256 - x
        sub v1.8h, v29.8h, v1.8h                           // 256 - x
        mul v2.4s, v2.4s, v30.4s                           // x * BB[i] * sgr_one_by_x
        mul v3.4s, v3.4s, v30.4s                           // x * BB[i] * sgr_one_by_x
        mul v4.4s, v4.4s, v30.4s                           // x * BB[i] * sgr_one_by_x
        mul v5.4s, v5.4s, v30.4s                           // x * BB[i] * sgr_one_by_x
        st1 {v0.8h, v1.8h}, [x3], #32                      // BB[i] = 256 - x
        ld1 {v0.8h, v1.8h}, [x7], #32
        srshr v2.4s, v2.4s, #12                            // AA[i]
        srshr v3.4s, v3.4s, #12                            // AA[i]
        srshr v4.4s, v4.4s, #12                            // AA[i]
        srshr v5.4s, v5.4s, #12                            // AA[i]

        st1 {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
        b.gt 1b

        // Restore callee-saved SIMD regs and return.
        ldp d14, d15, [sp, #0x30]
        ldp d12, d13, [sp, #0x20]
        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], 0x40
        ret
endfunc

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
//
// Same as sgr_box3_vert_neon but sums five rows (n = 25) and processes
// 8 pixels per loop iteration.
function sgr_box5_vert_neon, export=1
        // Save callee-saved SIMD regs (AAPCS64: low 64 bits of v8-v13;
        // v14/v15 are unused here).
        stp d8, d9, [sp, #-0x30]!
        stp d10, d11, [sp, #0x10]
        stp d12, d13, [sp, #0x20]

        add w4, w4, #2                                     // w += 2
        clz w15, w6                                        // bitdepth_max
        dup v28.4s, w5                                     // strength

        ldp x5, x6, [x0]                                   // x5-x8, x0 = sumsq row pointers
        ldp x7, x8, [x0, #16]
        ldr x0, [x0, #32]
        ldp x9, x10, [x1]                                  // x9-x12, x1 = sum row pointers
        ldp x11, x12, [x1, #16]
        ldr x1, [x1, #32]

        movi v31.4s, #25                                   // n

        sub w15, w15, #24                                  // -bitdepth_min_8
        movrel x13, x_by_x_tables
        movi v30.4s, #164                                  // one_by_x
        ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x13]    // RangeMins, DiffMasks
        dup v6.8h, w15                                     // -bitdepth_min_8
        movi v19.8b, #0x7                                  // lo_bits mask
        ldr q18, [x13, #64]                                // RefLo
        saddl v7.4s, v6.4h, v6.4h                          // -2*bitdepth_min_8
        movi v29.8h, #1, lsl #8                            // 256

        // Preload all five rows for the first iteration.
        ld1 {v8.4s, v9.4s}, [x5], #32
        ld1 {v10.4s, v11.4s}, [x6], #32
        ld1 {v12.4s, v13.4s}, [x7], #32
        ld1 {v16.4s, v17.4s}, [x8], #32
        ld1 {v20.8h}, [x9], #16
        ld1 {v21.8h}, [x10], #16
        ld1 {v22.8h}, [x11], #16
        ld1 {v23.8h}, [x12], #16
        ld1 {v0.4s, v1.4s}, [x0], #32
        ld1 {v2.8h}, [x1], #16

1:
        // Vertical 5-row sums: sumsq in v0/v1, sum in v2.
        add v8.4s, v8.4s, v10.4s
        add v9.4s, v9.4s, v11.4s
        add v12.4s, v12.4s, v16.4s
        add v13.4s, v13.4s, v17.4s

        add v20.8h, v20.8h, v21.8h
        add v22.8h, v22.8h, v23.8h

        add v0.4s, v0.4s, v8.4s
        add v1.4s, v1.4s, v9.4s
        add v2.8h, v2.8h, v20.8h

        add v0.4s, v0.4s, v12.4s
        add v1.4s, v1.4s, v13.4s
        add v2.8h, v2.8h, v22.8h

        subs w4, w4, #8                                    // 8 pixels per iteration

        // Normalize to 8-bit depth scale before forming p = a*n - b*b.
        srshl v0.4s, v0.4s, v7.4s
        srshl v1.4s, v1.4s, v7.4s
        srshl v4.8h, v2.8h, v6.8h
        mul v0.4s, v0.4s, v31.4s                           // a * n
        mul v1.4s, v1.4s, v31.4s                           // a * n
        umull v3.4s, v4.4h, v4.4h                          // b * b
        umull2 v4.4s, v4.8h, v4.8h                         // b * b
        uqsub v0.4s, v0.4s, v3.4s                          // imax(a * n - b * b, 0)
        uqsub v1.4s, v1.4s, v4.4s                          // imax(a * n - b * b, 0)
        mul v0.4s, v0.4s, v28.4s                           // p * s
        mul v1.4s, v1.4s, v28.4s                           // p * s
        ld1 {v8.4s, v9.4s}, [x5], #32
        uqshrn v0.4h, v0.4s, #16
        uqshrn2 v0.8h, v1.4s, #16
        ld1 {v10.4s, v11.4s}, [x6], #32
        uqrshrn v0.8b, v0.8h, #4                           // imin(z, 255)

        ld1 {v12.4s, v13.4s}, [x7], #32

        // LUT evaluation, see the pseudo assembly in x_by_x_tables.
        // v24/v25 hold RangeMins, v26/v27 hold DiffMasks (load order above).
        ushr v1.8b, v0.8b, #3                              // hi_bits = x >> 3
        ld1 {v16.4s, v17.4s}, [x8], #32
        tbl v5.8b, {v26.16b, v27.16b}, v1.8b               // DiffMasks[hi_bits]
        tbl v1.8b, {v24.16b, v25.16b}, v1.8b               // RangeMins[hi_bits]
        tbl v4.8b, {v18.16b}, v0.8b                        // RefLo[x]
        and v0.8b, v0.8b, v19.8b                           // lo_bits = x & 0x7
        ld1 {v20.8h}, [x9], #16
        ushl v5.8b, v5.8b, v0.8b                           // diffs << lo_bits
        add v4.8b, v4.8b, v1.8b                            // ref + min
        ld1 {v21.8h}, [x10], #16
        cnt v5.8b, v5.8b                                   // popcnt(diffs)
        ld1 {v22.8h}, [x11], #16
        add v5.8b, v4.8b, v5.8b                            // ref + integral
        ld1 {v23.8h}, [x12], #16
        uxtl v5.8h, v5.8b                                  // x

        ld1 {v0.4s, v1.4s}, [x0], #32
        umull v3.4s, v5.4h, v2.4h                          // x * BB[i]
        umull2 v4.4s, v5.8h, v2.8h                         // x * BB[i]
        mul v3.4s, v3.4s, v30.4s                           // x * BB[i] * sgr_one_by_x
        mul v4.4s, v4.4s, v30.4s                           // x * BB[i] * sgr_one_by_x
        srshr v3.4s, v3.4s, #12                            // AA[i]
        srshr v4.4s, v4.4s, #12                            // AA[i]
        sub v5.8h, v29.8h, v5.8h                           // 256 - x
        ld1 {v2.8h}, [x1], #16

        st1 {v3.4s, v4.4s}, [x2], #32
        st1 {v5.8h}, [x3], #16                             // BB[i] = 256 - x
        b.gt 1b

        // Restore callee-saved SIMD regs and return.
        ldp d12, d13, [sp, #0x20]
        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], 0x30
        ret
endfunc