1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <cassert>
18 #include <cstdint>
19
20 #include "RenderScriptToolkit.h"
21 #include "TaskProcessor.h"
22 #include "Utils.h"
23
24 namespace renderscript {
25
26 #define LOG_TAG "renderscript.toolkit.Blend"
27
28 /**
29 * Blends a source into a destination, based on the mode.
30 */
31 class BlendTask : public Task {
32 // The type of blending to do.
33 RenderScriptToolkit::BlendingMode mMode;
34 // The input we're blending.
35 const uchar4* mIn;
36 // The destination, used both for input and output.
37 uchar4* mOut;
38
39 void blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
40 uint32_t length);
41 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
42 void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
43 size_t endY) override;
44
45 public:
BlendTask(RenderScriptToolkit::BlendingMode mode,const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,const Restriction * restriction)46 BlendTask(RenderScriptToolkit::BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
47 size_t sizeY, const Restriction* restriction)
48 : Task{sizeX, sizeY, 4, true, restriction},
49 mMode{mode},
50 mIn{reinterpret_cast<const uchar4*>(in)},
51 mOut{reinterpret_cast<uchar4*>(out)} {}
52 };
53
#if defined(ARCH_ARM_USE_INTRINSICS)
// SIMD blend kernel used when ARCH_ARM_USE_INTRINSICS is defined. It is only declared here;
// the definition lives outside this file (presumably hand-written assembly -- confirm).
//
// The caller in this file passes the blending mode, cast to int, as `slot`, and the first
// and one-past-last element indices as `xstart` / `xend`. A return value >= 0 is treated as
// success; a negative value makes the caller log a warning and fall back to the scalar code.
extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot,
                                   uint32_t xstart, uint32_t xend);
#endif
58
#if defined(ARCH_X86_HAVE_SSSE3)
// Per-mode SIMD blend kernels used when ARCH_X86_HAVE_SSSE3 is defined. They are only
// declared here; the definitions live outside this file.
//
// The caller in this file passes the destination pointer as `dst`, the source pointer as
// `src`, and the number of whole groups of 8 elements as `count8` (computed as
// (x2 - x1) >> 3). After the call it advances both pointers by count8 * 8 elements and
// finishes the remainder with scalar code.
extern void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
#endif
73
74 // Convert vector to uchar4, clipping each value to 255.
75 template <typename TI>
convertClipped(TI amount)76 static inline uchar4 convertClipped(TI amount) {
77 return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x),
78 static_cast<uchar>(amount.y > 255 ? 255 : amount.y),
79 static_cast<uchar>(amount.z > 255 ? 255 : amount.z),
80 static_cast<uchar>(amount.w > 255 ? 255 : amount.w)};
81 }
82
blend(RenderScriptToolkit::BlendingMode mode,const uchar4 * in,uchar4 * out,uint32_t length)83 void BlendTask::blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
84 uint32_t length) {
85 uint32_t x1 = 0;
86 uint32_t x2 = length;
87
88 #if defined(ARCH_ARM_USE_INTRINSICS)
89 if (mUsesSimd) {
90 if (rsdIntrinsicBlend_K(out, in, (int) mode, x1, x2) >= 0) {
91 return;
92 } else {
93 ALOGW("Intrinsic Blend failed to use SIMD for %d", mode);
94 }
95 }
96 #endif
97 switch (mode) {
98 case RenderScriptToolkit::BlendingMode::CLEAR:
99 for (;x1 < x2; x1++, out++) {
100 *out = 0;
101 }
102 break;
103 case RenderScriptToolkit::BlendingMode::SRC:
104 for (;x1 < x2; x1++, out++, in++) {
105 *out = *in;
106 }
107 break;
108 //RenderScriptToolkit::BlendingMode::DST is a NOP
109 case RenderScriptToolkit::BlendingMode::DST:
110 break;
111 case RenderScriptToolkit::BlendingMode::SRC_OVER:
112 #if defined(ARCH_X86_HAVE_SSSE3)
113 if (mUsesSimd) {
114 if ((x1 + 8) < x2) {
115 uint32_t len = (x2 - x1) >> 3;
116 rsdIntrinsicBlendSrcOver_K(out, in, len);
117 x1 += len << 3;
118 out += len << 3;
119 in += len << 3;
120 }
121 }
122 #endif
123 for (;x1 < x2; x1++, out++, in++) {
124 ushort4 in_s = convert<ushort4>(*in);
125 ushort4 out_s = convert<ushort4>(*out);
126 in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
127 *out = convertClipped(in_s);
128 }
129 break;
130 case RenderScriptToolkit::BlendingMode::DST_OVER:
131 #if defined(ARCH_X86_HAVE_SSSE3)
132 if (mUsesSimd) {
133 if ((x1 + 8) < x2) {
134 uint32_t len = (x2 - x1) >> 3;
135 rsdIntrinsicBlendDstOver_K(out, in, len);
136 x1 += len << 3;
137 out += len << 3;
138 in += len << 3;
139 }
140 }
141 #endif
142 for (;x1 < x2; x1++, out++, in++) {
143 ushort4 in_s = convert<ushort4>(*in);
144 ushort4 out_s = convert<ushort4>(*out);
145 in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
146 *out = convertClipped(in_s);
147 }
148 break;
149 case RenderScriptToolkit::BlendingMode::SRC_IN:
150 #if defined(ARCH_X86_HAVE_SSSE3)
151 if (mUsesSimd) {
152 if ((x1 + 8) < x2) {
153 uint32_t len = (x2 - x1) >> 3;
154 rsdIntrinsicBlendSrcIn_K(out, in, len);
155 x1 += len << 3;
156 out += len << 3;
157 in += len << 3;
158 }
159 }
160 #endif
161 for (;x1 < x2; x1++, out++, in++) {
162 ushort4 in_s = convert<ushort4>(*in);
163 in_s = (in_s * out->w) >> (ushort4)8;
164 *out = convert<uchar4>(in_s);
165 }
166 break;
167 case RenderScriptToolkit::BlendingMode::DST_IN:
168 #if defined(ARCH_X86_HAVE_SSSE3)
169 if (mUsesSimd) {
170 if ((x1 + 8) < x2) {
171 uint32_t len = (x2 - x1) >> 3;
172 rsdIntrinsicBlendDstIn_K(out, in, len);
173 x1 += len << 3;
174 out += len << 3;
175 in += len << 3;
176 }
177 }
178 #endif
179 for (;x1 < x2; x1++, out++, in++) {
180 ushort4 out_s = convert<ushort4>(*out);
181 out_s = (out_s * in->w) >> (ushort4)8;
182 *out = convert<uchar4>(out_s);
183 }
184 break;
185 case RenderScriptToolkit::BlendingMode::SRC_OUT:
186 #if defined(ARCH_X86_HAVE_SSSE3)
187 if (mUsesSimd) {
188 if ((x1 + 8) < x2) {
189 uint32_t len = (x2 - x1) >> 3;
190 rsdIntrinsicBlendSrcOut_K(out, in, len);
191 x1 += len << 3;
192 out += len << 3;
193 in += len << 3;
194 }
195 }
196 #endif
197 for (;x1 < x2; x1++, out++, in++) {
198 ushort4 in_s = convert<ushort4>(*in);
199 in_s = (in_s * (ushort4)(255 - out->w)) >> (ushort4)8;
200 *out = convert<uchar4>(in_s);
201 }
202 break;
203 case RenderScriptToolkit::BlendingMode::DST_OUT:
204 #if defined(ARCH_X86_HAVE_SSSE3)
205 if (mUsesSimd) {
206 if ((x1 + 8) < x2) {
207 uint32_t len = (x2 - x1) >> 3;
208 rsdIntrinsicBlendDstOut_K(out, in, len);
209 x1 += len << 3;
210 out += len << 3;
211 in += len << 3;
212 }
213 }
214 #endif
215 for (;x1 < x2; x1++, out++, in++) {
216 ushort4 out_s = convert<ushort4>(*out);
217 out_s = (out_s * (ushort4)(255 - in->w)) >> (ushort4)8;
218 *out = convert<uchar4>(out_s);
219 }
220 break;
221 case RenderScriptToolkit::BlendingMode::SRC_ATOP:
222 #if defined(ARCH_X86_HAVE_SSSE3)
223 if (mUsesSimd) {
224 if ((x1 + 8) < x2) {
225 uint32_t len = (x2 - x1) >> 3;
226 rsdIntrinsicBlendSrcAtop_K(out, in, len);
227 x1 += len << 3;
228 out += len << 3;
229 in += len << 3;
230 }
231 }
232 #endif
233 for (;x1 < x2; x1++, out++, in++) {
234 // The max value the operation could produce before the shift
235 // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
236 // That value does not fit in a ushort, so we use uint.
237 uint4 in_s = convert<uint4>(*in);
238 uint4 out_s = convert<uint4>(*out);
239 out_s.xyz = ((in_s.xyz * out_s.w) +
240 (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
241 *out = convertClipped(out_s);
242 }
243 break;
244 case RenderScriptToolkit::BlendingMode::DST_ATOP:
245 #if defined(ARCH_X86_HAVE_SSSE3)
246 if (mUsesSimd) {
247 if ((x1 + 8) < x2) {
248 uint32_t len = (x2 - x1) >> 3;
249 rsdIntrinsicBlendDstAtop_K(out, in, len);
250 x1 += len << 3;
251 out += len << 3;
252 in += len << 3;
253 }
254 }
255 #endif
256 for (;x1 < x2; x1++, out++, in++) {
257 uint4 in_s = convert<uint4>(*in);
258 uint4 out_s = convert<uint4>(*out);
259 out_s.xyz = ((out_s.xyz * in_s.w) +
260 (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
261 out_s.w = in_s.w;
262 *out = convertClipped(out_s);
263 }
264 break;
265 case RenderScriptToolkit::BlendingMode::XOR:
266 #if defined(ARCH_X86_HAVE_SSSE3)
267 if (mUsesSimd) {
268 if ((x1 + 8) < x2) {
269 uint32_t len = (x2 - x1) >> 3;
270 rsdIntrinsicBlendXor_K(out, in, len);
271 x1 += len << 3;
272 out += len << 3;
273 in += len << 3;
274 }
275 }
276 #endif
277 for (;x1 < x2; x1++, out++, in++) {
278 *out = *in ^ *out;
279 }
280 break;
281 case RenderScriptToolkit::BlendingMode::MULTIPLY:
282 #if defined(ARCH_X86_HAVE_SSSE3)
283 if (mUsesSimd) {
284 if ((x1 + 8) < x2) {
285 uint32_t len = (x2 - x1) >> 3;
286 rsdIntrinsicBlendMultiply_K(out, in, len);
287 x1 += len << 3;
288 out += len << 3;
289 in += len << 3;
290 }
291 }
292 #endif
293 for (;x1 < x2; x1++, out++, in++) {
294 *out = convert<uchar4>((convert<ushort4>(*in) * convert<ushort4>(*out))
295 >> (ushort4)8);
296 }
297 break;
298 case RenderScriptToolkit::BlendingMode::ADD:
299 #if defined(ARCH_X86_HAVE_SSSE3)
300 if (mUsesSimd) {
301 if((x1 + 8) < x2) {
302 uint32_t len = (x2 - x1) >> 3;
303 rsdIntrinsicBlendAdd_K(out, in, len);
304 x1 += len << 3;
305 out += len << 3;
306 in += len << 3;
307 }
308 }
309 #endif
310 for (;x1 < x2; x1++, out++, in++) {
311 uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
312 oR = out->x, oG = out->y, oB = out->z, oA = out->w;
313 out->x = (oR + iR) > 255 ? 255 : oR + iR;
314 out->y = (oG + iG) > 255 ? 255 : oG + iG;
315 out->z = (oB + iB) > 255 ? 255 : oB + iB;
316 out->w = (oA + iA) > 255 ? 255 : oA + iA;
317 }
318 break;
319 case RenderScriptToolkit::BlendingMode::SUBTRACT:
320 #if defined(ARCH_X86_HAVE_SSSE3)
321 if (mUsesSimd) {
322 if((x1 + 8) < x2) {
323 uint32_t len = (x2 - x1) >> 3;
324 rsdIntrinsicBlendSub_K(out, in, len);
325 x1 += len << 3;
326 out += len << 3;
327 in += len << 3;
328 }
329 }
330 #endif
331 for (;x1 < x2; x1++, out++, in++) {
332 int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
333 oR = out->x, oG = out->y, oB = out->z, oA = out->w;
334 out->x = (oR - iR) < 0 ? 0 : oR - iR;
335 out->y = (oG - iG) < 0 ? 0 : oG - iG;
336 out->z = (oB - iB) < 0 ? 0 : oB - iB;
337 out->w = (oA - iA) < 0 ? 0 : oA - iA;
338 }
339 break;
340
341 default:
342 ALOGE("Called unimplemented value %d", mode);
343 assert(false);
344 }
345 }
346
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)347 void BlendTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
348 size_t endY) {
349 for (size_t y = startY; y < endY; y++) {
350 size_t offset = y * mSizeX + startX;
351 blend(mMode, mIn + offset, mOut + offset, endX - startX);
352 }
353 }
354
blend(BlendingMode mode,const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,const Restriction * restriction)355 void RenderScriptToolkit::blend(BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
356 size_t sizeY, const Restriction* restriction) {
357 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
358 if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
359 return;
360 }
361 #endif
362
363 BlendTask task(mode, in, out, sizeX, sizeY, restriction);
364 processor->doTask(&task);
365 }
366
}  // namespace renderscript
368