1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <cassert>
18 #include <cstdint>
19 
20 #include "RenderScriptToolkit.h"
21 #include "TaskProcessor.h"
22 #include "Utils.h"
23 
24 namespace renderscript {
25 
26 #define LOG_TAG "renderscript.toolkit.Blend"
27 
28 /**
29  * Blends a source into a destination, based on the mode.
30  */
31 class BlendTask : public Task {
32     // The type of blending to do.
33     RenderScriptToolkit::BlendingMode mMode;
34     // The input we're blending.
35     const uchar4* mIn;
36     // The destination, used both for input and output.
37     uchar4* mOut;
38 
39     void blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
40                uint32_t length);
41     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
42     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
43                      size_t endY) override;
44 
45    public:
BlendTask(RenderScriptToolkit::BlendingMode mode,const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,const Restriction * restriction)46     BlendTask(RenderScriptToolkit::BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
47               size_t sizeY, const Restriction* restriction)
48         : Task{sizeX, sizeY, 4, true, restriction},
49           mMode{mode},
50           mIn{reinterpret_cast<const uchar4*>(in)},
51           mOut{reinterpret_cast<uchar4*>(out)} {}
52 };
53 
#ifdef ARCH_ARM_USE_INTRINSICS
// SIMD blend routine with C linkage, defined outside this file. BlendTask::blend passes the
// blending mode (converted to int) as `slot` and the cell range as [xstart, xend). A negative
// return value is treated by the caller as "not handled", in which case it falls back to the
// portable implementation.
extern "C" int rsdIntrinsicBlend_K(uchar4* out, const uchar4* in, int slot, uint32_t xstart,
                                   uint32_t xend);
#endif
58 
#ifdef ARCH_X86_HAVE_SSSE3
// SSSE3 blend kernels, one per blending mode, defined outside this file. `dst` is blended in
// place with `src`. BlendTask::blend passes the number of whole 8-cell groups as `count8` and
// afterwards advances its own pointers by count8 * 8 cells, handling the remainder itself.
void rsdIntrinsicBlendSrcOver_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendDstOver_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendSrcIn_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendDstIn_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendSrcOut_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendDstOut_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendSrcAtop_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendDstAtop_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendXor_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendMultiply_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendAdd_K(void* dst, const void* src, uint32_t count8);
void rsdIntrinsicBlendSub_K(void* dst, const void* src, uint32_t count8);
#endif
73 
74 // Convert vector to uchar4, clipping each value to 255.
75 template <typename TI>
convertClipped(TI amount)76 static inline uchar4 convertClipped(TI amount) {
77     return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x),
78                     static_cast<uchar>(amount.y > 255 ? 255 : amount.y),
79                     static_cast<uchar>(amount.z > 255 ? 255 : amount.z),
80                     static_cast<uchar>(amount.w > 255 ? 255 : amount.w)};
81 }
82 
blend(RenderScriptToolkit::BlendingMode mode,const uchar4 * in,uchar4 * out,uint32_t length)83 void BlendTask::blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
84                       uint32_t length) {
85     uint32_t x1 = 0;
86     uint32_t x2 = length;
87 
88 #if defined(ARCH_ARM_USE_INTRINSICS)
89     if (mUsesSimd) {
90         if (rsdIntrinsicBlend_K(out, in, (int) mode, x1, x2) >= 0) {
91             return;
92         } else {
93             ALOGW("Intrinsic Blend failed to use SIMD for %d", mode);
94         }
95     }
96 #endif
97     switch (mode) {
98     case RenderScriptToolkit::BlendingMode::CLEAR:
99         for (;x1 < x2; x1++, out++) {
100             *out = 0;
101         }
102         break;
103     case RenderScriptToolkit::BlendingMode::SRC:
104         for (;x1 < x2; x1++, out++, in++) {
105           *out = *in;
106         }
107         break;
108     //RenderScriptToolkit::BlendingMode::DST is a NOP
109     case RenderScriptToolkit::BlendingMode::DST:
110         break;
111     case RenderScriptToolkit::BlendingMode::SRC_OVER:
112     #if defined(ARCH_X86_HAVE_SSSE3)
113         if (mUsesSimd) {
114             if ((x1 + 8) < x2) {
115                 uint32_t len = (x2 - x1) >> 3;
116                 rsdIntrinsicBlendSrcOver_K(out, in, len);
117                 x1 += len << 3;
118                 out += len << 3;
119                 in += len << 3;
120             }
121         }
122     #endif
123         for (;x1 < x2; x1++, out++, in++) {
124             ushort4 in_s = convert<ushort4>(*in);
125             ushort4 out_s = convert<ushort4>(*out);
126             in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
127             *out = convertClipped(in_s);
128         }
129         break;
130     case RenderScriptToolkit::BlendingMode::DST_OVER:
131     #if defined(ARCH_X86_HAVE_SSSE3)
132         if (mUsesSimd) {
133             if ((x1 + 8) < x2) {
134                 uint32_t len = (x2 - x1) >> 3;
135                 rsdIntrinsicBlendDstOver_K(out, in, len);
136                 x1 += len << 3;
137                 out += len << 3;
138                 in += len << 3;
139             }
140         }
141      #endif
142         for (;x1 < x2; x1++, out++, in++) {
143             ushort4 in_s = convert<ushort4>(*in);
144             ushort4 out_s = convert<ushort4>(*out);
145             in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
146             *out = convertClipped(in_s);
147         }
148         break;
149     case RenderScriptToolkit::BlendingMode::SRC_IN:
150     #if defined(ARCH_X86_HAVE_SSSE3)
151         if (mUsesSimd) {
152             if ((x1 + 8) < x2) {
153                 uint32_t len = (x2 - x1) >> 3;
154                 rsdIntrinsicBlendSrcIn_K(out, in, len);
155                 x1 += len << 3;
156                 out += len << 3;
157                 in += len << 3;
158             }
159         }
160 #endif
161         for (;x1 < x2; x1++, out++, in++) {
162             ushort4 in_s = convert<ushort4>(*in);
163             in_s = (in_s * out->w) >> (ushort4)8;
164             *out = convert<uchar4>(in_s);
165         }
166         break;
167     case RenderScriptToolkit::BlendingMode::DST_IN:
168     #if defined(ARCH_X86_HAVE_SSSE3)
169         if (mUsesSimd) {
170             if ((x1 + 8) < x2) {
171                 uint32_t len = (x2 - x1) >> 3;
172                 rsdIntrinsicBlendDstIn_K(out, in, len);
173                 x1 += len << 3;
174                 out += len << 3;
175                 in += len << 3;
176             }
177         }
178      #endif
179         for (;x1 < x2; x1++, out++, in++) {
180             ushort4 out_s = convert<ushort4>(*out);
181             out_s = (out_s * in->w) >> (ushort4)8;
182             *out = convert<uchar4>(out_s);
183         }
184         break;
185     case RenderScriptToolkit::BlendingMode::SRC_OUT:
186     #if defined(ARCH_X86_HAVE_SSSE3)
187         if (mUsesSimd) {
188             if ((x1 + 8) < x2) {
189                 uint32_t len = (x2 - x1) >> 3;
190                 rsdIntrinsicBlendSrcOut_K(out, in, len);
191                 x1 += len << 3;
192                 out += len << 3;
193                 in += len << 3;
194             }
195         }
196     #endif
197         for (;x1 < x2; x1++, out++, in++) {
198             ushort4 in_s = convert<ushort4>(*in);
199             in_s = (in_s * (ushort4)(255 - out->w)) >> (ushort4)8;
200             *out = convert<uchar4>(in_s);
201         }
202         break;
203     case RenderScriptToolkit::BlendingMode::DST_OUT:
204     #if defined(ARCH_X86_HAVE_SSSE3)
205         if (mUsesSimd) {
206             if ((x1 + 8) < x2) {
207                 uint32_t len = (x2 - x1) >> 3;
208                 rsdIntrinsicBlendDstOut_K(out, in, len);
209                 x1 += len << 3;
210                 out += len << 3;
211                 in += len << 3;
212             }
213         }
214     #endif
215         for (;x1 < x2; x1++, out++, in++) {
216             ushort4 out_s = convert<ushort4>(*out);
217             out_s = (out_s * (ushort4)(255 - in->w)) >> (ushort4)8;
218             *out = convert<uchar4>(out_s);
219         }
220         break;
221     case RenderScriptToolkit::BlendingMode::SRC_ATOP:
222     #if defined(ARCH_X86_HAVE_SSSE3)
223         if (mUsesSimd) {
224             if ((x1 + 8) < x2) {
225                 uint32_t len = (x2 - x1) >> 3;
226                 rsdIntrinsicBlendSrcAtop_K(out, in, len);
227                 x1 += len << 3;
228                 out += len << 3;
229                 in += len << 3;
230             }
231         }
232     #endif
233         for (;x1 < x2; x1++, out++, in++) {
234             // The max value the operation could produce before the shift
235             // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
236             // That value does not fit in a ushort, so we use uint.
237             uint4 in_s = convert<uint4>(*in);
238             uint4 out_s = convert<uint4>(*out);
239             out_s.xyz = ((in_s.xyz * out_s.w) +
240               (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
241             *out = convertClipped(out_s);
242         }
243         break;
244     case RenderScriptToolkit::BlendingMode::DST_ATOP:
245     #if defined(ARCH_X86_HAVE_SSSE3)
246         if (mUsesSimd) {
247             if ((x1 + 8) < x2) {
248                 uint32_t len = (x2 - x1) >> 3;
249                 rsdIntrinsicBlendDstAtop_K(out, in, len);
250                 x1 += len << 3;
251                 out += len << 3;
252                 in += len << 3;
253             }
254         }
255      #endif
256         for (;x1 < x2; x1++, out++, in++) {
257             uint4 in_s = convert<uint4>(*in);
258             uint4 out_s = convert<uint4>(*out);
259             out_s.xyz = ((out_s.xyz * in_s.w) +
260               (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
261             out_s.w = in_s.w;
262             *out = convertClipped(out_s);
263         }
264         break;
265     case RenderScriptToolkit::BlendingMode::XOR:
266     #if defined(ARCH_X86_HAVE_SSSE3)
267         if (mUsesSimd) {
268             if ((x1 + 8) < x2) {
269                 uint32_t len = (x2 - x1) >> 3;
270                 rsdIntrinsicBlendXor_K(out, in, len);
271                 x1 += len << 3;
272                 out += len << 3;
273                 in += len << 3;
274             }
275         }
276     #endif
277         for (;x1 < x2; x1++, out++, in++) {
278             *out = *in ^ *out;
279         }
280         break;
281     case RenderScriptToolkit::BlendingMode::MULTIPLY:
282     #if defined(ARCH_X86_HAVE_SSSE3)
283         if (mUsesSimd) {
284             if ((x1 + 8) < x2) {
285                 uint32_t len = (x2 - x1) >> 3;
286                 rsdIntrinsicBlendMultiply_K(out, in, len);
287                 x1 += len << 3;
288                 out += len << 3;
289                 in += len << 3;
290             }
291         }
292     #endif
293         for (;x1 < x2; x1++, out++, in++) {
294           *out = convert<uchar4>((convert<ushort4>(*in) * convert<ushort4>(*out))
295                                 >> (ushort4)8);
296         }
297         break;
298     case RenderScriptToolkit::BlendingMode::ADD:
299     #if defined(ARCH_X86_HAVE_SSSE3)
300         if (mUsesSimd) {
301             if((x1 + 8) < x2) {
302                 uint32_t len = (x2 - x1) >> 3;
303                 rsdIntrinsicBlendAdd_K(out, in, len);
304                 x1 += len << 3;
305                 out += len << 3;
306                 in += len << 3;
307             }
308         }
309     #endif
310         for (;x1 < x2; x1++, out++, in++) {
311             uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
312                 oR = out->x, oG = out->y, oB = out->z, oA = out->w;
313             out->x = (oR + iR) > 255 ? 255 : oR + iR;
314             out->y = (oG + iG) > 255 ? 255 : oG + iG;
315             out->z = (oB + iB) > 255 ? 255 : oB + iB;
316             out->w = (oA + iA) > 255 ? 255 : oA + iA;
317         }
318         break;
319     case RenderScriptToolkit::BlendingMode::SUBTRACT:
320     #if defined(ARCH_X86_HAVE_SSSE3)
321         if (mUsesSimd) {
322             if((x1 + 8) < x2) {
323                 uint32_t len = (x2 - x1) >> 3;
324                 rsdIntrinsicBlendSub_K(out, in, len);
325                 x1 += len << 3;
326                 out += len << 3;
327                 in += len << 3;
328             }
329         }
330     #endif
331         for (;x1 < x2; x1++, out++, in++) {
332             int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
333                 oR = out->x, oG = out->y, oB = out->z, oA = out->w;
334             out->x = (oR - iR) < 0 ? 0 : oR - iR;
335             out->y = (oG - iG) < 0 ? 0 : oG - iG;
336             out->z = (oB - iB) < 0 ? 0 : oB - iB;
337             out->w = (oA - iA) < 0 ? 0 : oA - iA;
338         }
339         break;
340 
341     default:
342         ALOGE("Called unimplemented value %d", mode);
343         assert(false);
344     }
345 }
346 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)347 void BlendTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
348                             size_t endY) {
349     for (size_t y = startY; y < endY; y++) {
350         size_t offset = y * mSizeX + startX;
351         blend(mMode, mIn + offset, mOut + offset, endX - startX);
352     }
353 }
354 
blend(BlendingMode mode,const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,const Restriction * restriction)355 void RenderScriptToolkit::blend(BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
356                                 size_t sizeY, const Restriction* restriction) {
357 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
358     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
359         return;
360     }
361 #endif
362 
363     BlendTask task(mode, in, out, sizeX, sizeY, restriction);
364     processor->doTask(&task);
365 }
366 
}  // namespace renderscript
368