1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <cstdint>
18
19 #include "RenderScriptToolkit.h"
20 #include "TaskProcessor.h"
21 #include "Utils.h"
22
23 namespace renderscript {
24
25 #define LOG_TAG "renderscript.toolkit.Convolve5x5"
26
/**
 * SIMD implementation of the 5x5 convolution. It is only declared here; the definition lives
 * outside this file.
 *
 * As called from Convolve5x5Task::kernelU4: dst receives the results, y0 through y4 point into
 * the five input lines two cells to the left of the first cell to compute, and coef is the
 * table of int16_t coefficients. The caller advances its position by four cells per unit of
 * count on x86 and by two cells per unit of count on ARM.
 */
extern "C" void rsdIntrinsicConvolve5x5_K(void* dst,
                                          const void* y0,
                                          const void* y1,
                                          const void* y2,
                                          const void* y3,
                                          const void* y4,
                                          const int16_t* coef,
                                          uint32_t count);
30
31 class Convolve5x5Task : public Task {
32 const void* mIn;
33 void* mOut;
34 // Even though we have exactly 25 coefficients, store them in an array of size 28 so that
35 // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4.
36 float mFp[28];
37 int16_t mIp[28];
38
39 void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
40 const uchar* py2, const uchar* py3, const uchar* py4);
41 void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
42 size_t startX, size_t startY, size_t endX, size_t endY);
43
44 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
45 void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
46 size_t endY) override;
47
48 public:
Convolve5x5Task(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)49 Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
50 const float* coefficients, const Restriction* restriction)
51 : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
52 for (int ct = 0; ct < 25; ct++) {
53 mFp[ct] = coefficients[ct];
54 if (mFp[ct] >= 0) {
55 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
56 } else {
57 mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
58 }
59 }
60 }
61 };
62
63 template <typename InputOutputType, typename ComputationType>
ConvolveOneU(uint32_t x,InputOutputType * out,const InputOutputType * py0,const InputOutputType * py1,const InputOutputType * py2,const InputOutputType * py3,const InputOutputType * py4,const float * coeff,int32_t width)64 static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
65 const InputOutputType* py1, const InputOutputType* py2,
66 const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
67 int32_t width) {
68 uint32_t x0 = std::max((int32_t)x - 2, 0);
69 uint32_t x1 = std::max((int32_t)x - 1, 0);
70 uint32_t x2 = x;
71 uint32_t x3 = std::min((int32_t)x + 1, width - 1);
72 uint32_t x4 = std::min((int32_t)x + 2, width - 1);
73
74 ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] +
75 convert<ComputationType>(py0[x1]) * coeff[1] +
76 convert<ComputationType>(py0[x2]) * coeff[2] +
77 convert<ComputationType>(py0[x3]) * coeff[3] +
78 convert<ComputationType>(py0[x4]) * coeff[4] +
79
80 convert<ComputationType>(py1[x0]) * coeff[5] +
81 convert<ComputationType>(py1[x1]) * coeff[6] +
82 convert<ComputationType>(py1[x2]) * coeff[7] +
83 convert<ComputationType>(py1[x3]) * coeff[8] +
84 convert<ComputationType>(py1[x4]) * coeff[9] +
85
86 convert<ComputationType>(py2[x0]) * coeff[10] +
87 convert<ComputationType>(py2[x1]) * coeff[11] +
88 convert<ComputationType>(py2[x2]) * coeff[12] +
89 convert<ComputationType>(py2[x3]) * coeff[13] +
90 convert<ComputationType>(py2[x4]) * coeff[14] +
91
92 convert<ComputationType>(py3[x0]) * coeff[15] +
93 convert<ComputationType>(py3[x1]) * coeff[16] +
94 convert<ComputationType>(py3[x2]) * coeff[17] +
95 convert<ComputationType>(py3[x3]) * coeff[18] +
96 convert<ComputationType>(py3[x4]) * coeff[19] +
97
98 convert<ComputationType>(py4[x0]) * coeff[20] +
99 convert<ComputationType>(py4[x1]) * coeff[21] +
100 convert<ComputationType>(py4[x2]) * coeff[22] +
101 convert<ComputationType>(py4[x3]) * coeff[23] +
102 convert<ComputationType>(py4[x4]) * coeff[24];
103 px = clamp(px + 0.5f, 0.f, 255.f);
104 *out = convert<InputOutputType>(px);
105 }
106
107 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Computes one output cell of the 5x5 convolution for float based cells.
 *
 * Column indices that would fall outside [0, width - 1] are clamped to the nearest edge. The
 * weighted sum is stored as is, without rounding or clamping.
 *
 * @param x Column of the cell to compute.
 * @param out Where to store the result.
 * @param py0 Start of the line two above.
 * @param py1 Start of the line one above.
 * @param py2 Start of the current line.
 * @param py3 Start of the line one below.
 * @param py4 Start of the line two below.
 * @param coeff The 25 coefficients, in row-major order.
 * @param width Number of cells in a line.
 */
template <typename InputOutputType>
static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
                         const InputOutputType* py1, const InputOutputType* py2,
                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
                         int32_t width) {
    const int32_t column = (int32_t)x;
    const int32_t lastColumn = width - 1;
    const uint32_t left2 = std::max(column - 2, 0);
    const uint32_t left1 = std::max(column - 1, 0);
    const uint32_t mid = x;
    const uint32_t right1 = std::min(column + 1, lastColumn);
    const uint32_t right2 = std::min(column + 2, lastColumn);

    // Keep this as one left-to-right sum: the order of the floating point additions is part of
    // the result.
    InputOutputType sum = py0[left2] * coeff[0] +
                          py0[left1] * coeff[1] +
                          py0[mid] * coeff[2] +
                          py0[right1] * coeff[3] +
                          py0[right2] * coeff[4] +

                          py1[left2] * coeff[5] +
                          py1[left1] * coeff[6] +
                          py1[mid] * coeff[7] +
                          py1[right1] * coeff[8] +
                          py1[right2] * coeff[9] +

                          py2[left2] * coeff[10] +
                          py2[left1] * coeff[11] +
                          py2[mid] * coeff[12] +
                          py2[right1] * coeff[13] +
                          py2[right2] * coeff[14] +

                          py3[left2] * coeff[15] +
                          py3[left1] * coeff[16] +
                          py3[mid] * coeff[17] +
                          py3[right1] * coeff[18] +
                          py3[right2] * coeff[19] +

                          py4[left2] * coeff[20] +
                          py4[left1] * coeff[21] +
                          py4[mid] * coeff[22] +
                          py4[right1] * coeff[23] +
                          py4[right2] * coeff[24];
    *out = sum;
}
135 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
136
137 /**
138 * This function convolves one line.
139 *
140 * @param pout Where to place the next output.
141 * @param xstart Index in the X direction of where to start.
142 * @param xend End index
143 * @param ppy0 Points to the start of the line two above.
144 * @param ppy1 Points to the start of the line one above.
145 * @param ppy2 Points to the start of the current line.
146 * @param ppy3 Points to the start of the line one below.
147 * @param ppy4 Points to the start of the line two below.
148 */
kernelU4(uchar * pout,uint32_t x1,uint32_t x2,const uchar * ppy0,const uchar * ppy1,const uchar * ppy2,const uchar * ppy3,const uchar * ppy4)149 void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
150 const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
151 const uchar* ppy4) {
152 uchar4* out = (uchar4*)pout;
153 const uchar4* py0 = (const uchar4*)ppy0;
154 const uchar4* py1 = (const uchar4*)ppy1;
155 const uchar4* py2 = (const uchar4*)ppy2;
156 const uchar4* py3 = (const uchar4*)ppy3;
157 const uchar4* py4 = (const uchar4*)ppy4;
158
159 while ((x1 < x2) && (x1 < 2)) {
160 ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
161 out++;
162 x1++;
163 }
164 #if defined(ARCH_X86_HAVE_SSSE3)
165 // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
166 // 3 for end boundary where x may hit the end boundary)
167 if (mUsesSimd && ((x1 + 6) < x2)) {
168 // subtract 3 for end boundary
169 uint32_t len = (x2 - x1 - 3) >> 2;
170 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
171 py4 + x1 - 2, mIp, len);
172 out += len << 2;
173 x1 += len << 2;
174 }
175 #endif
176
177 #if defined(ARCH_ARM_USE_INTRINSICS)
178 if (mUsesSimd && ((x1 + 3) < x2)) {
179 uint32_t len = (x2 - x1 - 3) >> 1;
180 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
181 py4 + x1 - 2, mIp, len);
182 out += len << 1;
183 x1 += len << 1;
184 }
185 #endif
186
187 while (x1 < x2) {
188 ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
189 out++;
190 x1++;
191 }
192 }
193
194 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
195 // This will need more cleanup before it can be used.
kernelF4(const ConvolveInfo * info,float4 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)196 void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out,
197 uint32_t xstart, uint32_t xend, uint32_t currentY) {
198 const uchar* pin = (const uchar*)info->in;
199 const size_t stride = info->stride;
200
201 uint32_t y0 = std::max((int32_t)currentY - 2, 0);
202 uint32_t y1 = std::max((int32_t)currentY - 1, 0);
203 uint32_t y2 = currentY;
204 uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
205 uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
206
207 const float4* py0 = (const float4*)(pin + stride * y0);
208 const float4* py1 = (const float4*)(pin + stride * y1);
209 const float4* py2 = (const float4*)(pin + stride * y2);
210 const float4* py3 = (const float4*)(pin + stride * y3);
211 const float4* py4 = (const float4*)(pin + stride * y4);
212
213 for (uint32_t x = xstart; x < xend; x++, out++) {
214 ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
215 }
216 }
217
RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo * info,float2 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)218 void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out,
219 uint32_t xstart, uint32_t xend, uint32_t currentY) {
220 const uchar* pin = (const uchar*)info->in;
221 const size_t stride = info->stride;
222
223 uint32_t y0 = std::max((int32_t)currentY - 2, 0);
224 uint32_t y1 = std::max((int32_t)currentY - 1, 0);
225 uint32_t y2 = currentY;
226 uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
227 uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
228
229 const float2* py0 = (const float2*)(pin + stride * y0);
230 const float2* py1 = (const float2*)(pin + stride * y1);
231 const float2* py2 = (const float2*)(pin + stride * y2);
232 const float2* py3 = (const float2*)(pin + stride * y3);
233 const float2* py4 = (const float2*)(pin + stride * y4);
234
235 for (uint32_t x = xstart; x < xend; x++, out++) {
236 ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
237 }
238 }
239
RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo * info,float * out,uint32_t xstart,uint32_t xend,uint32_t currentY)240 void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out,
241 uint32_t xstart, uint32_t xend, uint32_t currentY) {
242 const uchar* pin = (const uchar*)info->in;
243 const size_t stride = info->stride;
244
245 uint32_t y0 = std::max((int32_t)currentY - 2, 0);
246 uint32_t y1 = std::max((int32_t)currentY - 1, 0);
247 uint32_t y2 = currentY;
248 uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
249 uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
250
251 const float* py0 = (const float*)(pin + stride * y0);
252 const float* py1 = (const float*)(pin + stride * y1);
253 const float* py2 = (const float*)(pin + stride * y2);
254 const float* py3 = (const float*)(pin + stride * y3);
255 const float* py4 = (const float*)(pin + stride * y4);
256
257 for (uint32_t x = xstart; x < xend; x++, out++) {
258 ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
259 }
260 }
261 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
262
263 template <typename InputOutputType, typename ComputationType>
convolveU(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY,float * mFp)264 static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
265 size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) {
266 const size_t stride = vectorSize * sizeX;
267 for (size_t y = startY; y < endY; y++) {
268 uint32_t y0 = std::max((int32_t)y - 2, 0);
269 uint32_t y1 = std::max((int32_t)y - 1, 0);
270 uint32_t y2 = y;
271 uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
272 uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
273
274 size_t offset = (y * sizeX + startX) * vectorSize;
275 InputOutputType* px = (InputOutputType*)(pout + offset);
276 InputOutputType* py0 = (InputOutputType*)(pin + stride * y0);
277 InputOutputType* py1 = (InputOutputType*)(pin + stride * y1);
278 InputOutputType* py2 = (InputOutputType*)(pin + stride * y2);
279 InputOutputType* py3 = (InputOutputType*)(pin + stride * y3);
280 InputOutputType* py4 = (InputOutputType*)(pin + stride * y4);
281 for (uint32_t x = startX; x < endX; x++, px++) {
282 ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp,
283 sizeX);
284 }
285 }
286 }
287
convolveU4(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY)288 void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
289 size_t sizeY, size_t startX, size_t startY, size_t endX,
290 size_t endY) {
291 const size_t stride = paddedSize(vectorSize) * sizeX;
292 for (size_t y = startY; y < endY; y++) {
293 uint32_t y0 = std::max((int32_t)y - 2, 0);
294 uint32_t y1 = std::max((int32_t)y - 1, 0);
295 uint32_t y2 = y;
296 uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
297 uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
298
299 size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
300 uchar* px = pout + offset;
301 const uchar* py0 = pin + stride * y0;
302 const uchar* py1 = pin + stride * y1;
303 const uchar* py2 = pin + stride * y2;
304 const uchar* py3 = pin + stride * y3;
305 const uchar* py4 = pin + stride * y4;
306 kernelU4(px, startX, endX, py0, py1, py2, py3, py4);
307 }
308 }
309
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)310 void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
311 size_t endY) {
312 // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
313 // endX, endY);
314 switch (mVectorSize) {
315 case 1:
316 convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
317 startX, startY, endX, endY, mFp);
318 break;
319 case 2:
320 convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
321 startX, startY, endX, endY, mFp);
322 break;
323 case 3:
324 case 4:
325 convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
326 endX, endY);
327 break;
328 }
329 }
330
convolve5x5(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)331 void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX,
332 size_t sizeY, const float* coefficients,
333 const Restriction* restriction) {
334 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
335 if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
336 return;
337 }
338 if (vectorSize < 1 || vectorSize > 4) {
339 ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
340 return;
341 }
342 #endif
343
344 Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
345 processor->doTask(&task);
346 }
347
348 } // namespace renderscript
349