xref: /aosp_15_r20/frameworks/rs/toolkit/Lut3d_advsimd.S (revision e1eccf28f96817838ad6867f7f39d2351ec11f56)
1*e1eccf28SAndroid Build Coastguard Worker/*
2*e1eccf28SAndroid Build Coastguard Worker * Copyright (C) 2014 The Android Open Source Project
3*e1eccf28SAndroid Build Coastguard Worker *
4*e1eccf28SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*e1eccf28SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*e1eccf28SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*e1eccf28SAndroid Build Coastguard Worker *
8*e1eccf28SAndroid Build Coastguard Worker *      http://www.apache.org/licenses/LICENSE-2.0
9*e1eccf28SAndroid Build Coastguard Worker *
10*e1eccf28SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*e1eccf28SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*e1eccf28SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*e1eccf28SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*e1eccf28SAndroid Build Coastguard Worker * limitations under the License.
15*e1eccf28SAndroid Build Coastguard Worker */
16*e1eccf28SAndroid Build Coastguard Worker
/* ENTRY(f): select the .text section, align with ".align 4" (a power-of-two
 * exponent on AArch64 GAS, i.e. 16 bytes), export f as a global symbol of
 * type function, and emit the label f itself.
 */
#define ENTRY(f) \
            .text; \
            .align 4; \
            .globl f; \
            .type f,#function; \
            f:

/* END(f): set the ELF symbol size of f to the distance from its label to
 * the current location.
 */
#define END(f) \
            .size f, .-f;
19*e1eccf28SAndroid Build Coastguard Worker
20*e1eccf28SAndroid Build Coastguard Worker
/* lanepair: trilinear interpolation of two pixels from the lookup table.
 *
 * Arguments:
 *   dst        destination vector view.  A .8b view receives the eight result
 *              bytes in the low half of the register; the .16b views of v20
 *              and v21 receive them in the high half (low half preserved).
 *   src0,src1  32-bit vector lanes holding the byte offset of each pixel
 *              into the table (sign-extended before use).
 *   xr0,xr1    16-bit lanes holding the X fraction of each pixel.
 *   yr0,yr1    8-bit lanes holding the Y fraction of each pixel.
 *   zr0,zr1    16-bit lanes holding the Z fraction of each pixel.
 *
 * Expected on entry:
 *   x3 = table base, x4 = pitchy, x5 = pitchz (used as 64-bit values),
 *   v5 = all zero bytes.
 *
 * Clobbers: x6, x7, v8-v19.
 *
 * Each 8-byte load fetches two 4-byte table entries that are adjacent in X,
 * so one register carries the X and X+1 neighbours side by side.
 */
.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1

            /* Byte offsets of the two pixels, sign-extended to 64 bits. */
            smov        x6, \src0
            smov        x7, \src1

            /* Broadcast the Y fractions across all byte lanes. */
            dup         v8.8b, \yr0
            dup         v9.8b, \yr1

            /* Turn the offsets into table addresses. */
            add         x6, x6, x3
            add         x7, x7, x3

            /* Front plane, row y; the pointers then advance by pitchy. */
            ld1         {v16.2s}, [x6], x4
            ld1         {v17.2s}, [x7], x4

            /* Front plane, row y+1; the pointers then advance by pitchz. */
            ld1         {v18.2s}, [x6], x5
            ld1         {v19.2s}, [x7], x5

            /* Y blend of the front plane into v12 (pixel 0) and v13 (pixel 1).
             * zip1 with the zero vector places each byte in the top half of a
             * 16-bit lane (value * 256), so the three steps together yield
             *   row_y * (256 - fy) + row_y1 * fy.
             */
            zip1        v12.16b, v5.16b, v16.16b
            zip1        v13.16b, v5.16b, v17.16b
            umlsl       v12.8h, v16.8b, v8.8b
            umlsl       v13.8h, v17.8b, v9.8b
            umlal       v12.8h, v18.8b, v8.8b
            umlal       v13.8h, v19.8b, v9.8b

            /* Rear plane, row y+1 (pointers are at +pitchy +pitchz). */
            ld1         {v18.2s}, [x6]
            ld1         {v19.2s}, [x7]

            /* Step back one row to reach the rear plane, row y. */
            sub         x6, x6, x4
            sub         x7, x7, x4

            ld1         {v16.2s}, [x6]
            ld1         {v17.2s}, [x7]

            /* Y blend of the rear plane into v14 (pixel 0) and v15 (pixel 1),
             * using the same formula as the front plane.
             */
            zip1        v14.16b, v5.16b, v16.16b
            zip1        v15.16b, v5.16b, v17.16b
            umlsl       v14.8h, v16.8b, v8.8b
            umlsl       v15.8h, v17.8b, v9.8b
            umlal       v14.8h, v18.8b, v8.8b
            umlal       v15.8h, v19.8b, v9.8b

            /* Z blend for pixel 0: front * (256 - fz) + rear * fz in 32-bit
             * lanes, then a rounding narrow by 8 bits into v10.
             */
            ushll       v8.4s, v12.4h, #8
            ushll2      v9.4s, v12.8h, #8
            umlsl       v8.4s, v12.4h, \zr0
            umlsl2      v9.4s, v12.8h, \zr0
            umlal       v8.4s, v14.4h, \zr0
            umlal2      v9.4s, v14.8h, \zr0
            rshrn       v10.4h, v8.4s, #8
            rshrn2      v10.8h, v9.4s, #8

            /* Z blend for pixel 1, same arithmetic, result in v11. */
            ushll       v8.4s, v13.4h, #8
            ushll2      v9.4s, v13.8h, #8
            umlsl       v8.4s, v13.4h, \zr1
            umlsl2      v9.4s, v13.8h, \zr1
            umlal       v8.4s, v15.4h, \zr1
            umlal2      v9.4s, v15.8h, \zr1
            rshrn       v11.4h, v8.4s, #8
            rshrn2      v11.8h, v9.4s, #8

            /* X blend: the low half of v10/v11 holds the entry at x and the
             * high half the entry at x+1, so this computes
             *   low * (256 - fx) + high * fx
             * and narrows (truncating) both pixels into v14.
             */
            ushll       v8.4s, v10.4h, #8
            ushll       v9.4s, v11.4h, #8
            umlsl       v8.4s, v10.4h, \xr0
            umlsl       v9.4s, v11.4h, \xr1
            umlal2      v8.4s, v10.8h, \xr0
            umlal2      v9.4s, v11.8h, \xr1
            shrn        v14.4h, v8.4s, #8
            shrn2       v14.8h, v9.4s, #8

            /* Saturating, rounding narrow of both pixels to bytes in dst.
             * The .16b views of v20 and v21 select the upper-half form of the
             * instruction; any other dst uses the lower-half form.
             */
.ifc \dst, v20.16b
            uqrshrn2    \dst, v14.8h, #8
.else
.ifc \dst, v21.16b
            uqrshrn2    \dst, v14.8h, #8
.else
            uqrshrn     \dst, v14.8h, #8
.endif
.endif
.endm
101*e1eccf28SAndroid Build Coastguard Worker
/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // x0
 *          void const *in,     // x1
 *          size_t count,       // x2
 *          void const *lut,    // x3
 *          int32_t pitchy,     // w4
 *          int32_t pitchz,     // w5
 *          int dimx,           // w6
 *          int dimy,           // w7
 *          int dimz);          // [sp]
 *
 * Maps `count` 4-byte pixels from `in` to `dst` through a 3D lookup table
 * with trilinear interpolation.  Bytes 0, 1 and 2 of each pixel are scaled by
 * dimx, dimy and dimz into 8.8 fixed-point table coordinates; the integer
 * parts form the byte offset x*4 + y*pitchy + z*pitchz into `lut` and the
 * fractional parts drive the interpolation (see the lanepair macro).  Byte 3
 * of each pixel is copied to the output unchanged.
 *
 * Pixels are processed eight at a time.  A final group of 1..7 pixels is
 * loaded and stored lane by lane, so neither `in` nor `dst` is accessed
 * beyond `count` pixels.
 *
 * Notes derived from the arithmetic below:
 *  - The coordinate maths runs in 16-bit lanes; for an input of 255 and a
 *    dimension value d (d <= 256) the coordinate is 256*d - 1, i.e. index
 *    d-1 with fraction 255, so entries up to index d are read along that
 *    axis.  Presumably callers pass (table size - 1) -- verify at call site.
 *  - Only the low 16 bits of dimx/dimy/dimz are used.
 *
 * ABI: AAPCS64.  Saves and restores d8-d15 on the stack.
 * Clobbers: x2, x6, x7, x8, v0-v7, v16-v23, flags (x0 and x1 are advanced).
 *
 * Numeric labels:
 *   1 (first)   store a full group of eight results, then fall into 2
 *   2 (first)   load the next full group of eight pixels
 *   3           process the eight lanes currently held in v0-v3
 *   4           load a partial group (1..7 pixels)
 *   1 (second)  store a partial group
 *   9           epilogue
 */
ENTRY(rsdIntrinsic3DLUT_K)
            ldr         w8, [sp]                /* dimz, read before sp moves */

            /* pitchy and pitchz arrive as 32-bit values in w4/w5.  AAPCS64
             * leaves the upper 32 bits of those registers unspecified, but
             * lanepair uses x4/x5 as 64-bit address increments, so widen
             * them explicitly.
             */
            sxtw        x4, w4
            sxtw        x5, w5

            stp         d8, d9, [sp, #-64]!
            stp         d10, d11, [sp, #16]
            stp         d12, d13, [sp, #32]
            stp         d14, d15, [sp, #48]

            /* v4: h[0]=dimx, h[1]=dimy, h[2]=dimz, s[2]=pitchy, s[3]=pitchz */
            movi        v4.8b, #1
            ins         v4.h[0], w6
            ins         v4.h[1], w7
            ins         v4.h[2], w8
            ins         v4.s[2], w4
            ins         v4.s[3], w5
            movi        v5.16b, #0              /* zero vector for lanepair */

            /* From here on x2 = (pixels remaining) - 8. */
            subs        x2, x2, #8
            bge         2f                      /* at least eight pixels */
            cmn         x2, #8    // same as cmp x2, #-8
            ble         9f                      /* count was zero */
            b           4f                      /* 1..7 pixels in total */

            .align 6
1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
/* x0  = dst
 * x1  = src
 * x2  = count
 * x3  = lut
 * x4  = pitchy
 * x5  = pitchz
 * x6 = offset0
 * x7 = offset1
 */
2:          ld4         {v0.8b-v3.8b}, [x1], #32
/* v0,v1,v2,v3 source data (one pixel byte position per register)
 * v4 dimensions and pitches
 */
3:          uxtl        v0.8h, v0.8b
            uxtl        v1.8h, v1.8b
            uxtl        v2.8h, v2.8b
            mul         v0.8h, v0.8h, v4.h[0]
            mul         v1.8h, v1.8h, v4.h[1]
            mul         v2.8h, v2.8h, v4.h[2]

/* ursra below would be more accurate, but it can produce a coordinate that
 * sits exactly on the last table index with a zero fractional part.  The
 * interpolation would then read both the limit of the array and limit + 1,
 * even though the second sample carries no weight.  Strictly that is
 * correct, except for the illegal access problem.
 */
            usra        v0.8h, v0.8h, #8
            usra        v1.8h, v1.8h, #8
            usra        v2.8h, v2.8h, #8

            /* Split the 8.8 coordinates into integer and fractional parts. */
            ushr        v12.8h, v0.8h, #8
            ushr        v13.8h, v1.8h, #8
            ushr        v14.8h, v2.8h, #8
            bic         v0.8h, #0xff, LSL #8
            xtn         v1.8b, v1.8h
            bic         v2.8h, #0xff, LSL #8

/* v0.8h,v1.8b,v2.8h fractional offset
 * v12.8h,v13.8h,v14.8h integer offset
 */

            /* offset = x*4 + y*pitchy + z*pitchz, in 32-bit lanes */
            ushll       v6.4s, v12.4h, #2
            ushll2      v7.4s, v12.8h, #2
            uxtl        v8.4s, v13.4h
            uxtl2       v9.4s, v13.8h
            uxtl        v10.4s, v14.4h
            uxtl2       v11.4s, v14.8h
            mla         v6.4s, v8.4s,  v4.s[2]
            mla         v7.4s, v9.4s,  v4.s[2]
            mla         v6.4s, v10.4s, v4.s[3]
            mla         v7.4s, v11.4s, v4.s[3]

/* v6,v7 list of table offsets */

        /* lanes 0 and 1 */
            lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]

        /* lanes 2 and 3 */
            lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]

        /* lanes 4 and 5 */
            lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]

        /* lanes 6 and 7 */
            lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]

            /* v20:v21 hold eight interleaved 4-byte results.  Two rounds of
             * unzip separate them by byte position: v20 low = byte 0,
             * v20 high = byte 1 (moved to v21), v22 low = byte 2.
             */
            uzp1        v6.16b, v20.16b, v21.16b
            uzp2        v7.16b, v20.16b, v21.16b
            uzp1        v20.16b, v6.16b, v7.16b
            uzp2        v22.16b, v6.16b, v7.16b
            mov         v21.d[0], v20.d[1]

            subs        x2, x2, #8
            mov         v23.8b, v3.8b           /* byte 3 passes through */

            bge         1b                      /* another full group follows */

            cmn         x2, #8    // same as cmp x2, #-8
            blt         1f                      /* this was a partial group */

            /* Last full group.  st4 leaves the flags from cmn intact. */
            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
            beq         9f                      /* nothing left */

            /* Fill the vector with a safe value: replicate the next pixel
             * into every lane so unused lanes still index valid table data.
             */
4:          ld4r        {v0.8b-v3.8b}, [x1]
            /* The low three bits of x2 equal the number of pixels left. */
            tbz         x2, #2, 2f
            ld4         {v0.b-v3.b}[0], [x1], #4
            ld4         {v0.b-v3.b}[1], [x1], #4
            ld4         {v0.b-v3.b}[2], [x1], #4
            ld4         {v0.b-v3.b}[3], [x1], #4
2:          tbz         x2, #1, 2f
            ld4         {v0.b-v3.b}[4], [x1], #4
            ld4         {v0.b-v3.b}[5], [x1], #4
2:          tbz         x2, #0, 2f
            ld4         {v0.b-v3.b}[6], [x1], #4
2:          b           3b

            /* Partial store: write back the same lanes the loads at 4 filled. */
1:          tst         x2, #4
            beq         2f
            st4         {v20.b-v23.b}[0], [x0], #4
            st4         {v20.b-v23.b}[1], [x0], #4
            st4         {v20.b-v23.b}[2], [x0], #4
            st4         {v20.b-v23.b}[3], [x0], #4
2:          tst         x2, #2
            beq         2f
            st4         {v20.b-v23.b}[4], [x0], #4
            st4         {v20.b-v23.b}[5], [x0], #4
2:          tst         x2, #1
            beq         9f
            st4         {v20.b-v23.b}[6], [x0], #4

            /* Restore the callee-saved SIMD registers and return. */
9:          ldp         d14, d15, [sp, #48]
            ldp         d12, d13, [sp, #32]
            ldp         d10, d11, [sp, #16]
            ldp         d8, d9, [sp], #64
            ret
END(rsdIntrinsic3DLUT_K)
251