/*
 * Copyright (C) 2013-2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

#define BLEND_LIST(X) \
    X(0, CLEAR) \
    X(1, SRC) \
    X(2, DST) \
    X(3, SRC_OVER) \
    X(4, DST_OVER) \
    X(5, SRC_IN) \
    X(6, DST_IN) \
    X(7, SRC_OUT) \
    X(8, DST_OUT) \
    X(9, SRC_ATOP) \
    X(10, DST_ATOP) \
    X(11, XOR) \
    X(12, MULTIPLY) \
    X(13, ADD) \
    X(14, SUBTRACT)

/* This operation was not enabled in the original RenderScript. We could
 * enable it.
 *
 *  X(15, DIFFERENCE) \
 */

/* For every blend operation supported, define a macro with just the arithmetic
 * component.  The rest can be handled later on.
 *
 * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
 * contain the data from the source buffer.  Both have already been split out
 * into one colour component per register (if necessary).  q3 and q11 contain
 * the alpha components.
 *
 * At the same time as defining the assembly macro, define a corresponding
 * preprocessor macro indicating any other requirements.
 *    zipped=0 -- The macro does not require the RGBA components to be
 *                separated.
 *    lddst=0  -- The macro does not require data from the destination buffer.
 *    ldsrc=0  -- The macro does not require data from the source buffer.
 *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
 *                inserted without any surrounding load/store or loop code.
 */

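/* Note on the fixed-point arithmetic: the multiply-based kernels below widen
 * each byte product into 16-bit lanes and then approximate division by 255
 * with the rshrn #8 / uaddw / rshrn #8 sequence, i.e.
 *
 *      result = (t + ((t + 128) >> 8) + 128) >> 8,   where t = a * b,
 *
 * which closely matches a rounded t/255 for t in [0, 255*255].  A rough C
 * sketch of the same idea (the helper name is illustrative only, not part of
 * the original source):
 *
 *      static inline uint8_t mul_div255(uint8_t a, uint8_t b) {
 *          uint32_t t = (uint32_t)a * b;
 *          return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
 *      }
 *
 * The ATOP kernels apply the same idea to the sum of two products, using
 * saturating adds along the way.
 */
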
#define params_CLEAR zipped=0, lddst=0, ldsrc=0
.macro blend_kernel_CLEAR
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0
.endm

#define params_SRC zipped=0, lddst=0
.macro blend_kernel_SRC
        mov     v0.16b, v8.16b
        mov     v1.16b, v9.16b
        mov     v2.16b, v10.16b
        mov     v3.16b, v11.16b
.endm

#define params_DST nowrap=1
.macro blend_kernel_DST
        /* nop */
.endm

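/* SRC_OVER: out = src + dst * (1 - src.a), per channel.  dst is in v0-v3,
 * src in v8-v11, and v11 holds the source alpha.
 */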
#define params_SRC_OVER zipped=1
.macro blend_kernel_SRC_OVER
        mvn         v7.16b, v11.16b

        umull2      v12.8h, v7.16b, v0.16b
        umull       v0.8h,  v7.8b,  v0.8b
        umull2      v13.8h, v7.16b, v1.16b
        umull       v1.8h,  v7.8b,  v1.8b
        umull2      v14.8h, v7.16b, v2.16b
        umull       v2.8h,  v7.8b,  v2.8b
        umull2      v15.8h, v7.16b, v3.16b
        umull       v3.8h,  v7.8b,  v3.8b

        rshrn       v4.8b,  v0.8h,  #8
        rshrn2      v4.16b, v12.8h, #8
        rshrn       v5.8b,  v1.8h,  #8
        rshrn2      v5.16b, v13.8h, #8
        rshrn       v6.8b,  v2.8h,  #8
        rshrn2      v6.16b, v14.8h, #8
        rshrn       v7.8b,  v3.8h,  #8
        rshrn2      v7.16b, v15.8h, #8

        uaddw       v0.8h,  v0.8h,  v4.8b
        uaddw2      v12.8h, v12.8h, v4.16b
        uaddw       v1.8h,  v1.8h,  v5.8b
        uaddw2      v13.8h, v13.8h, v5.16b
        uaddw       v2.8h,  v2.8h,  v6.8b
        uaddw2      v14.8h, v14.8h, v6.16b
        uaddw       v3.8h,  v3.8h,  v7.8b
        uaddw2      v15.8h, v15.8h, v7.16b

        rshrn       v0.8b,  v0.8h,  #8
        rshrn2      v0.16b, v12.8h, #8
        rshrn       v1.8b,  v1.8h,  #8
        rshrn2      v1.16b, v13.8h, #8
        rshrn       v2.8b,  v2.8h,  #8
        rshrn2      v2.16b, v14.8h, #8
        rshrn       v3.8b,  v3.8h,  #8
        rshrn2      v3.16b, v15.8h, #8

        uqadd       v0.16b, v0.16b, v8.16b
        uqadd       v1.16b, v1.16b, v9.16b
        uqadd       v2.16b, v2.16b, v10.16b
        uqadd       v3.16b, v3.16b, v11.16b
.endm

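/* DST_OVER: out = dst + src * (1 - dst.a), per channel; v3 holds the
 * destination alpha.
 */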
#define params_DST_OVER zipped=1
.macro blend_kernel_DST_OVER
        mvn         v7.16b, v3.16b

        umull2      v12.8h, v7.16b, v8.16b
        umull       v8.8h,  v7.8b,  v8.8b
        umull2      v13.8h, v7.16b, v9.16b
        umull       v9.8h,  v7.8b,  v9.8b
        umull2      v14.8h, v7.16b, v10.16b
        umull       v10.8h, v7.8b,  v10.8b
        umull2      v15.8h, v7.16b, v11.16b
        umull       v11.8h, v7.8b,  v11.8b

        rshrn       v4.8b,  v8.8h,  #8
        rshrn2      v4.16b, v12.8h, #8
        rshrn       v5.8b,  v9.8h,  #8
        rshrn2      v5.16b, v13.8h, #8
        rshrn       v6.8b,  v10.8h, #8
        rshrn2      v6.16b, v14.8h, #8
        rshrn       v7.8b,  v11.8h, #8
        rshrn2      v7.16b, v15.8h, #8

        uaddw       v8.8h,  v8.8h,  v4.8b
        uaddw2      v12.8h, v12.8h, v4.16b
        uaddw       v9.8h,  v9.8h,  v5.8b
        uaddw2      v13.8h, v13.8h, v5.16b
        uaddw       v10.8h, v10.8h, v6.8b
        uaddw2      v14.8h, v14.8h, v6.16b
        uaddw       v11.8h, v11.8h, v7.8b
        uaddw2      v15.8h, v15.8h, v7.16b

        rshrn       v8.8b,  v8.8h,  #8
        rshrn2      v8.16b, v12.8h, #8
        rshrn       v9.8b,  v9.8h,  #8
        rshrn2      v9.16b, v13.8h, #8
        rshrn       v10.8b, v10.8h, #8
        rshrn2      v10.16b, v14.8h, #8
        rshrn       v11.8b, v11.8h, #8
        rshrn2      v11.16b, v15.8h, #8

        uqadd       v0.16b, v0.16b, v8.16b
        uqadd       v1.16b, v1.16b, v9.16b
        uqadd       v2.16b, v2.16b, v10.16b
        uqadd       v3.16b, v3.16b, v11.16b
.endm

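/* SRC_IN: out = src * dst.a.  The destination alpha in v3 scales every
 * source channel, including the source alpha.
 */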
#define params_SRC_IN zipped=1
.macro blend_kernel_SRC_IN
        umull2      v12.8h, v3.16b, v8.16b
        umull       v0.8h,  v3.8b,  v8.8b
        umull2      v13.8h, v3.16b, v9.16b
        umull       v1.8h,  v3.8b,  v9.8b
        umull2      v14.8h, v3.16b, v10.16b
        umull       v2.8h,  v3.8b,  v10.8b
        umull2      v15.8h, v3.16b, v11.16b
        umull       v3.8h,  v3.8b,  v11.8b

        rshrn       v4.8b,  v0.8h,  #8
        rshrn2      v4.16b, v12.8h, #8
        rshrn       v5.8b,  v1.8h,  #8
        rshrn2      v5.16b, v13.8h, #8
        rshrn       v6.8b,  v2.8h,  #8
        rshrn2      v6.16b, v14.8h, #8
        rshrn       v7.8b,  v3.8h,  #8
        rshrn2      v7.16b, v15.8h, #8

        uaddw       v0.8h,  v0.8h,  v4.8b
        uaddw2      v12.8h, v12.8h, v4.16b
        uaddw       v1.8h,  v1.8h,  v5.8b
        uaddw2      v13.8h, v13.8h, v5.16b
        uaddw       v2.8h,  v2.8h,  v6.8b
        uaddw2      v14.8h, v14.8h, v6.16b
        uaddw       v3.8h,  v3.8h,  v7.8b
        uaddw2      v15.8h, v15.8h, v7.16b

        rshrn       v0.8b,  v0.8h,  #8
        rshrn2      v0.16b, v12.8h, #8
        rshrn       v1.8b,  v1.8h,  #8
        rshrn2      v1.16b, v13.8h, #8
        rshrn       v2.8b,  v2.8h,  #8
        rshrn2      v2.16b, v14.8h, #8
        rshrn       v3.8b,  v3.8h,  #8
        rshrn2      v3.16b, v15.8h, #8
.endm

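/* DST_IN: out = dst * src.a.  The source alpha in v11 scales every
 * destination channel.
 */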
#define params_DST_IN zipped=1
.macro blend_kernel_DST_IN
        umull2      v12.8h, v0.16b, v11.16b
        umull       v0.8h,  v0.8b,  v11.8b
        umull2      v13.8h, v1.16b, v11.16b
        umull       v1.8h,  v1.8b,  v11.8b
        umull2      v14.8h, v2.16b, v11.16b
        umull       v2.8h,  v2.8b,  v11.8b
        umull2      v15.8h, v3.16b, v11.16b
        umull       v3.8h,  v3.8b,  v11.8b

        rshrn       v4.8b,  v0.8h,  #8
        rshrn2      v4.16b, v12.8h, #8
        rshrn       v5.8b,  v1.8h,  #8
        rshrn2      v5.16b, v13.8h, #8
        rshrn       v6.8b,  v2.8h,  #8
        rshrn2      v6.16b, v14.8h, #8
        rshrn       v7.8b,  v3.8h,  #8
        rshrn2      v7.16b, v15.8h, #8

        uaddw       v0.8h,  v0.8h,  v4.8b
        uaddw2      v12.8h, v12.8h, v4.16b
        uaddw       v1.8h,  v1.8h,  v5.8b
        uaddw2      v13.8h, v13.8h, v5.16b
        uaddw       v2.8h,  v2.8h,  v6.8b
        uaddw2      v14.8h, v14.8h, v6.16b
        uaddw       v3.8h,  v3.8h,  v7.8b
        uaddw2      v15.8h, v15.8h, v7.16b

        rshrn       v0.8b,  v0.8h,  #8
        rshrn2      v0.16b, v12.8h, #8
        rshrn       v1.8b,  v1.8h,  #8
        rshrn2      v1.16b, v13.8h, #8
        rshrn       v2.8b,  v2.8h,  #8
        rshrn2      v2.16b, v14.8h, #8
        rshrn       v3.8b,  v3.8h,  #8
        rshrn2      v3.16b, v15.8h, #8
.endm

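/* SRC_OUT: out = src * (1 - dst.a), implemented by inverting the destination
 * alpha and reusing the SRC_IN kernel.
 */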
#define params_SRC_OUT zipped=1
.macro blend_kernel_SRC_OUT
        mvn         v3.16b, v3.16b
        blend_kernel_SRC_IN
.endm

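/* DST_OUT: out = dst * (1 - src.a), implemented by inverting the source
 * alpha and reusing the DST_IN kernel.
 */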
#define params_DST_OUT zipped=1
.macro blend_kernel_DST_OUT
        mvn         v11.16b, v11.16b
        blend_kernel_DST_IN
.endm

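/* SRC_ATOP: out.rgb = src.rgb * dst.a + dst.rgb * (1 - src.a), and
 * out.a = dst.a (v3 is left untouched).
 */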
#define params_SRC_ATOP zipped=1
.macro blend_kernel_SRC_ATOP
        mvn         v11.16b, v11.16b

        umull2      v12.8h, v11.16b, v0.16b
        umull       v0.8h,  v11.8b,  v0.8b
        umull2      v13.8h, v11.16b, v1.16b
        umull       v1.8h,  v11.8b,  v1.8b
        umull2      v14.8h, v11.16b, v2.16b
        umull       v2.8h,  v11.8b,  v2.8b

        umull2      v4.8h,  v3.16b, v8.16b
        umull       v8.8h,  v3.8b,  v8.8b
        umull2      v5.8h,  v3.16b, v9.16b
        umull       v9.8h,  v3.8b,  v9.8b
        umull2      v6.8h,  v3.16b, v10.16b
        umull       v10.8h, v3.8b,  v10.8b

        uqadd       v12.8h, v12.8h, v4.8h
        uqadd       v0.8h,  v0.8h,  v8.8h
        uqadd       v13.8h, v13.8h, v5.8h
        uqadd       v1.8h,  v1.8h,  v9.8h
        uqadd       v14.8h, v14.8h, v6.8h
        uqadd       v2.8h,  v2.8h,  v10.8h

        urshr       v8.8h,  v0.8h,  #8
        urshr       v4.8h,  v12.8h, #8
        urshr       v9.8h,  v1.8h,  #8
        urshr       v5.8h,  v13.8h, #8
        urshr       v10.8h, v2.8h,  #8
        urshr       v6.8h,  v14.8h, #8

        uqadd       v0.8h,  v0.8h,  v8.8h
        uqadd       v12.8h, v12.8h, v4.8h
        uqadd       v1.8h,  v1.8h,  v9.8h
        uqadd       v13.8h, v13.8h, v5.8h
        uqadd       v2.8h,  v2.8h,  v10.8h
        uqadd       v14.8h, v14.8h, v6.8h

        uqrshrn     v0.8b,  v0.8h,  #8
        uqrshrn2    v0.16b, v12.8h, #8
        uqrshrn     v1.8b,  v1.8h,  #8
        uqrshrn2    v1.16b, v13.8h, #8
        uqrshrn     v2.8b,  v2.8h,  #8
        uqrshrn2    v2.16b, v14.8h, #8
.endm

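/* DST_ATOP: out.rgb = dst.rgb * src.a + src.rgb * (1 - dst.a), and
 * out.a = src.a (copied into v3 at the end).
 */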
#define params_DST_ATOP zipped=1
.macro blend_kernel_DST_ATOP
        mvn         v3.16b, v3.16b

        umull2      v12.8h, v11.16b, v0.16b
        umull       v0.8h,  v11.8b,  v0.8b
        umull2      v13.8h, v11.16b, v1.16b
        umull       v1.8h,  v11.8b,  v1.8b
        umull2      v14.8h, v11.16b, v2.16b
        umull       v2.8h,  v11.8b,  v2.8b

        umull2      v4.8h,  v3.16b, v8.16b
        umull       v8.8h,  v3.8b,  v8.8b
        umull2      v5.8h,  v3.16b, v9.16b
        umull       v9.8h,  v3.8b,  v9.8b
        umull2      v6.8h,  v3.16b, v10.16b
        umull       v10.8h, v3.8b,  v10.8b

        uqadd       v12.8h, v12.8h, v4.8h
        uqadd       v0.8h,  v0.8h,  v8.8h
        uqadd       v13.8h, v13.8h, v5.8h
        uqadd       v1.8h,  v1.8h,  v9.8h
        uqadd       v14.8h, v14.8h, v6.8h
        uqadd       v2.8h,  v2.8h,  v10.8h

        urshr       v8.8h,  v0.8h,  #8
        urshr       v4.8h,  v12.8h, #8
        urshr       v9.8h,  v1.8h,  #8
        urshr       v5.8h,  v13.8h, #8
        urshr       v10.8h, v2.8h,  #8
        urshr       v6.8h,  v14.8h, #8

        uqadd       v0.8h,  v0.8h,  v8.8h
        uqadd       v12.8h, v12.8h, v4.8h
        uqadd       v1.8h,  v1.8h,  v9.8h
        uqadd       v13.8h, v13.8h, v5.8h
        uqadd       v2.8h,  v2.8h,  v10.8h
        uqadd       v14.8h, v14.8h, v6.8h

        uqrshrn     v0.8b,  v0.8h,  #8
        uqrshrn2    v0.16b, v12.8h, #8
        uqrshrn     v1.8b,  v1.8h,  #8
        uqrshrn2    v1.16b, v13.8h, #8
        uqrshrn     v2.8b,  v2.8h,  #8
        uqrshrn2    v2.16b, v14.8h, #8

        mov         v3.16b, v11.16b
.endm

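/* MULTIPLY: out = src * dst, per channel (including alpha). */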
#define params_MULTIPLY zipped=0
.macro blend_kernel_MULTIPLY
        umull2      v12.8h, v0.16b, v8.16b
        umull       v0.8h,  v0.8b,  v8.8b
        umull2      v13.8h, v1.16b, v9.16b
        umull       v1.8h,  v1.8b,  v9.8b
        umull2      v14.8h, v2.16b, v10.16b
        umull       v2.8h,  v2.8b,  v10.8b
        umull2      v15.8h, v3.16b, v11.16b
        umull       v3.8h,  v3.8b,  v11.8b

        rshrn       v4.8b,  v0.8h,  #8
        rshrn2      v4.16b, v12.8h, #8
        rshrn       v5.8b,  v1.8h,  #8
        rshrn2      v5.16b, v13.8h, #8
        rshrn       v6.8b,  v2.8h,  #8
        rshrn2      v6.16b, v14.8h, #8
        rshrn       v7.8b,  v3.8h,  #8
        rshrn2      v7.16b, v15.8h, #8

        uaddw       v0.8h,  v0.8h,  v4.8b
        uaddw2      v12.8h, v12.8h, v4.16b
        uaddw       v1.8h,  v1.8h,  v5.8b
        uaddw2      v13.8h, v13.8h, v5.16b
        uaddw       v2.8h,  v2.8h,  v6.8b
        uaddw2      v14.8h, v14.8h, v6.16b
        uaddw       v3.8h,  v3.8h,  v7.8b
        uaddw2      v15.8h, v15.8h, v7.16b

        rshrn       v0.8b,  v0.8h,  #8
        rshrn2      v0.16b, v12.8h, #8
        rshrn       v1.8b,  v1.8h,  #8
        rshrn2      v1.16b, v13.8h, #8
        rshrn       v2.8b,  v2.8h,  #8
        rshrn2      v2.16b, v14.8h, #8
        rshrn       v3.8b,  v3.8h,  #8
        rshrn2      v3.16b, v15.8h, #8
.endm

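/* ADD: out = saturate(dst + src), per channel. */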
#define params_ADD zipped=0
.macro blend_kernel_ADD
        uqadd    v0.16b, v0.16b, v8.16b
        uqadd    v1.16b, v1.16b, v9.16b
        uqadd    v2.16b, v2.16b, v10.16b
        uqadd    v3.16b, v3.16b, v11.16b
.endm

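/* SUBTRACT: out = saturate(dst - src), per channel. */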
#define params_SUBTRACT zipped=0
.macro blend_kernel_SUBTRACT
        uqsub    v0.16b, v0.16b, v8.16b
        uqsub    v1.16b, v1.16b, v9.16b
        uqsub    v2.16b, v2.16b, v10.16b
        uqsub    v3.16b, v3.16b, v11.16b
.endm

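/* DIFFERENCE: out = |dst - src|, per channel.  Not currently included in
 * BLEND_LIST; see the note above.
 */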
#define params_DIFFERENCE zipped=0
.macro blend_kernel_DIFFERENCE
        uabd    v0.16b, v0.16b, v8.16b
        uabd    v1.16b, v1.16b, v9.16b
        uabd    v2.16b, v2.16b, v10.16b
        uabd    v3.16b, v3.16b, v11.16b
.endm

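/* XOR: out = dst ^ src, a bitwise exclusive-or of the channel bytes rather
 * than the Porter-Duff XOR compositing operator.
 */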
#define params_XOR zipped=0
.macro blend_kernel_XOR
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v3.16b, v3.16b, v11.16b
.endm


/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Various sections of assembly code are dropped or substituted for
 * simpler operations if they're not needed.
 */
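/* As used below, x0 holds the destination pointer, x1 the source pointer and
 * x2 the length in bytes (always a whole number of pixels).  The bottom 64
 * bits of v8-v15 are callee-saved under the AAPCS64, so they are spilled to
 * the stack around the loop.
 */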
.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
.if \nowrap
        \kernel
.else
        sub     x3, sp, #32
        sub     sp, sp, #64
        st1     {v8.1d - v11.1d}, [sp]
        st1     {v12.1d - v15.1d}, [x3]
        subs    x2, x2, #64
        b       2f
.align 4
1:
  .if \lddst
    .if \zipped
        ld4     {v0.16b - v3.16b}, [x0]
    .else
        ld1     {v0.16b - v3.16b}, [x0]
    .endif
  .endif
  .if \ldsrc
    .if \zipped
        ld4     {v8.16b - v11.16b}, [x1], #64
    .else
        ld1     {v8.16b - v11.16b}, [x1], #64
    .endif
  .endif
  .if \pld
#if 0 /* TODO: test this on real hardware */
    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
#endif
  .endif

        \kernel

        subs    x2, x2, #64
  .if \zipped
        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .else
        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .endif

2:      bge     1b
        adds    x2, x2, #64
        beq     2f

        /* To handle the tail portion of the data (something less than 64
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the operations
         * don't require data to interact with its neighbours.
         */
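        /* For example, a 13-byte tail (8 + 4 + 1) is picked up by the
         * 8-byte, 4-byte and 1-byte steps of the chain below.
         */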
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0

        movi    v8.16b, #0
        movi    v9.16b, #0
        movi    v10.16b, #0
        movi    v11.16b, #0

        tbz     x2, #5, 1f
  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32   ; .endif
  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
1:      tbz     x2, #4, 1f
  .if \lddst ; ld1     {v1.16b}, [x0], #16  ; .endif
  .if \ldsrc ; ld1     {v9.16b}, [x1], #16  ; .endif
1:      tbz     x2, #3, 1f
  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
1:      tbz     x2, #2, 1f
  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
1:      tbz     x2, #1, 1f
  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
1:      tbz     x2, #0, 1f
  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
1:
  .if \lddst ; sub     x0, x0, x2           ; .endif

.if \zipped
        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point.
         */
        uzp1    v4.16b, v0.16b, v1.16b
        uzp2    v5.16b, v0.16b, v1.16b
        uzp1    v6.16b, v2.16b, v3.16b
        uzp2    v7.16b, v2.16b, v3.16b
        uzp1    v0.16b, v4.16b, v6.16b
        uzp2    v2.16b, v4.16b, v6.16b
        uzp1    v1.16b, v5.16b, v7.16b
        uzp2    v3.16b, v5.16b, v7.16b

        uzp1    v4.16b, v8.16b, v9.16b
        uzp2    v5.16b, v8.16b, v9.16b
        uzp1    v6.16b, v10.16b, v11.16b
        uzp2    v7.16b, v10.16b, v11.16b
        uzp1    v8.16b, v4.16b, v6.16b
        uzp2    v10.16b, v4.16b, v6.16b
        uzp1    v9.16b, v5.16b, v7.16b
        uzp2    v11.16b, v5.16b, v7.16b

        \kernel

        zip1    v4.16b, v0.16b, v2.16b
        zip2    v6.16b, v0.16b, v2.16b
        zip1    v5.16b, v1.16b, v3.16b
        zip2    v7.16b, v1.16b, v3.16b
        zip1    v0.16b, v4.16b, v5.16b
        zip2    v1.16b, v4.16b, v5.16b
        zip1    v2.16b, v6.16b, v7.16b
        zip2    v3.16b, v6.16b, v7.16b
  .else
        \kernel
  .endif

        tbz     x2, #5, 1f
        st1     {v2.16b,v3.16b}, [x0], #32
1:      tbz     x2, #4, 1f
        st1     {v1.16b}, [x0], #16
1:      tbz     x2, #3, 1f
        st1     {v0.d}[1], [x0], #8
1:      tbz     x2, #2, 1f
        st1     {v0.s}[1], [x0], #4
1:      tbz     x2, #1, 1f
        st1     {v0.h}[1], [x0], #2
1:      tbz     x2, #0, 2f
        st1     {v0.b}[1], [x0], #1
2:      ld1     {v8.1d - v11.1d}, [sp], #32
        ld1     {v12.1d - v15.1d}, [sp], #32
.endif
        mov     x0, #0
        ret
.endm


/* produce list of blend_line_XX() functions; each function uses the wrap_line
 * macro, passing it the name of the operation macro it wants along with
 * optional parameters to remove unnecessary operations.
 */
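/* For example, the SRC_OVER entry expands to (roughly):
 *
 *      ENTRY(blend_line_SRC_OVER)
 *      wrap_line blend_kernel_SRC_OVER, zipped=1
 *      END(blend_line_SRC_OVER)
 */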
#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X

#define BLEND_X(d, n) .set tablesize, d+1 ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X

/*  int rsdIntrinsicBlend_K(
 *          uchar4 *out,        // x0
 *          uchar4 const *in,   // x1
 *          int slot,           // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
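/* Returns 0 on success, or -1 if the requested slot is out of range or has
 * no implementation (a zero entry in blendtable).
 */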
ENTRY(rsdIntrinsicBlend_K)
    adrp    x5, blendtable
    add     x5, x5, :lo12:blendtable
    cmp     w2, tablesize
    bhs     1f
    ldrsh   x6, [x5, w2, uxtw #1]
    add     x0, x0, w3, uxtw #2
    add     x1, x1, w3, uxtw #2
    sub     w2, w4, w3
    ubfiz   x2, x2, #2, #32 /* TODO: fix */
    cbz     x6, 1f
    adr     x5, 2f
    add     x6, x5, x6
2:  br      x6
1:  mov     x0, #-1
    ret

END(rsdIntrinsicBlend_K)

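/* blendtable holds one 16-bit entry per blend slot: the offset of the
 * corresponding blend_line_XX function relative to the 2: label in
 * rsdIntrinsicBlend_K above, or 0 for a slot with no implementation.
 */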
.rodata
.set off,0
blendtable:
#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
        BLEND_LIST(BLEND_X)
#undef BLEND_X