1*32afb93cSXin Li /*
2*32afb93cSXin Li  * Copyright (C) 2012 The Android Open Source Project
3*32afb93cSXin Li  *
4*32afb93cSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li  * you may not use this file except in compliance with the License.
6*32afb93cSXin Li  * You may obtain a copy of the License at
7*32afb93cSXin Li  *
8*32afb93cSXin Li  *      http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li  *
10*32afb93cSXin Li  * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li  * See the License for the specific language governing permissions and
14*32afb93cSXin Li  * limitations under the License.
15*32afb93cSXin Li  */
16*32afb93cSXin Li 
17*32afb93cSXin Li #include <cstdint>
18*32afb93cSXin Li 
19*32afb93cSXin Li #include "RenderScriptToolkit.h"
20*32afb93cSXin Li #include "TaskProcessor.h"
21*32afb93cSXin Li #include "Utils.h"
22*32afb93cSXin Li 
23*32afb93cSXin Li namespace renderscript {
24*32afb93cSXin Li 
25*32afb93cSXin Li #define LOG_TAG "renderscript.toolkit.Convolve5x5"
26*32afb93cSXin Li 
/**
 * Convolution routine with C linkage; it is declared here but not defined in this file.
 *
 * Within this file it is only called from Convolve5x5Task::kernelU4, which passes the output
 * position, the five input rows (each already offset two cells to the left of the first cell to
 * produce), the 16-bit coefficient table and a block count. The caller advances by four cells per
 * block on the SSSE3 path and by two cells per block on the ARM path.
 */
extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1,
                                          const void* y2, const void* y3, const void* y4,
                                          const int16_t* coef, uint32_t count);
30*32afb93cSXin Li 
31*32afb93cSXin Li class Convolve5x5Task : public Task {
32*32afb93cSXin Li     const void* mIn;
33*32afb93cSXin Li     void* mOut;
34*32afb93cSXin Li     // Even though we have exactly 25 coefficients, store them in an array of size 28 so that
35*32afb93cSXin Li     // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4.
36*32afb93cSXin Li     float mFp[28];
37*32afb93cSXin Li     int16_t mIp[28];
38*32afb93cSXin Li 
39*32afb93cSXin Li     void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
40*32afb93cSXin Li                   const uchar* py2, const uchar* py3, const uchar* py4);
41*32afb93cSXin Li     void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
42*32afb93cSXin Li                     size_t startX, size_t startY, size_t endX, size_t endY);
43*32afb93cSXin Li 
44*32afb93cSXin Li     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
45*32afb93cSXin Li     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
46*32afb93cSXin Li                      size_t endY) override;
47*32afb93cSXin Li 
48*32afb93cSXin Li    public:
Convolve5x5Task(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)49*32afb93cSXin Li     Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
50*32afb93cSXin Li                     const float* coefficients, const Restriction* restriction)
51*32afb93cSXin Li         : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
52*32afb93cSXin Li         for (int ct = 0; ct < 25; ct++) {
53*32afb93cSXin Li             mFp[ct] = coefficients[ct];
54*32afb93cSXin Li             if (mFp[ct] >= 0) {
55*32afb93cSXin Li                 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
56*32afb93cSXin Li             } else {
57*32afb93cSXin Li                 mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
58*32afb93cSXin Li             }
59*32afb93cSXin Li         }
60*32afb93cSXin Li     }
61*32afb93cSXin Li };
62*32afb93cSXin Li 
63*32afb93cSXin Li template <typename InputOutputType, typename ComputationType>
ConvolveOneU(uint32_t x,InputOutputType * out,const InputOutputType * py0,const InputOutputType * py1,const InputOutputType * py2,const InputOutputType * py3,const InputOutputType * py4,const float * coeff,int32_t width)64*32afb93cSXin Li static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
65*32afb93cSXin Li                          const InputOutputType* py1, const InputOutputType* py2,
66*32afb93cSXin Li                          const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
67*32afb93cSXin Li                          int32_t width) {
68*32afb93cSXin Li     uint32_t x0 = std::max((int32_t)x - 2, 0);
69*32afb93cSXin Li     uint32_t x1 = std::max((int32_t)x - 1, 0);
70*32afb93cSXin Li     uint32_t x2 = x;
71*32afb93cSXin Li     uint32_t x3 = std::min((int32_t)x + 1, width - 1);
72*32afb93cSXin Li     uint32_t x4 = std::min((int32_t)x + 2, width - 1);
73*32afb93cSXin Li 
74*32afb93cSXin Li     ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] +
75*32afb93cSXin Li                          convert<ComputationType>(py0[x1]) * coeff[1] +
76*32afb93cSXin Li                          convert<ComputationType>(py0[x2]) * coeff[2] +
77*32afb93cSXin Li                          convert<ComputationType>(py0[x3]) * coeff[3] +
78*32afb93cSXin Li                          convert<ComputationType>(py0[x4]) * coeff[4] +
79*32afb93cSXin Li 
80*32afb93cSXin Li                          convert<ComputationType>(py1[x0]) * coeff[5] +
81*32afb93cSXin Li                          convert<ComputationType>(py1[x1]) * coeff[6] +
82*32afb93cSXin Li                          convert<ComputationType>(py1[x2]) * coeff[7] +
83*32afb93cSXin Li                          convert<ComputationType>(py1[x3]) * coeff[8] +
84*32afb93cSXin Li                          convert<ComputationType>(py1[x4]) * coeff[9] +
85*32afb93cSXin Li 
86*32afb93cSXin Li                          convert<ComputationType>(py2[x0]) * coeff[10] +
87*32afb93cSXin Li                          convert<ComputationType>(py2[x1]) * coeff[11] +
88*32afb93cSXin Li                          convert<ComputationType>(py2[x2]) * coeff[12] +
89*32afb93cSXin Li                          convert<ComputationType>(py2[x3]) * coeff[13] +
90*32afb93cSXin Li                          convert<ComputationType>(py2[x4]) * coeff[14] +
91*32afb93cSXin Li 
92*32afb93cSXin Li                          convert<ComputationType>(py3[x0]) * coeff[15] +
93*32afb93cSXin Li                          convert<ComputationType>(py3[x1]) * coeff[16] +
94*32afb93cSXin Li                          convert<ComputationType>(py3[x2]) * coeff[17] +
95*32afb93cSXin Li                          convert<ComputationType>(py3[x3]) * coeff[18] +
96*32afb93cSXin Li                          convert<ComputationType>(py3[x4]) * coeff[19] +
97*32afb93cSXin Li 
98*32afb93cSXin Li                          convert<ComputationType>(py4[x0]) * coeff[20] +
99*32afb93cSXin Li                          convert<ComputationType>(py4[x1]) * coeff[21] +
100*32afb93cSXin Li                          convert<ComputationType>(py4[x2]) * coeff[22] +
101*32afb93cSXin Li                          convert<ComputationType>(py4[x3]) * coeff[23] +
102*32afb93cSXin Li                          convert<ComputationType>(py4[x4]) * coeff[24];
103*32afb93cSXin Li     px = clamp(px + 0.5f, 0.f, 255.f);
104*32afb93cSXin Li     *out = convert<InputOutputType>(px);
105*32afb93cSXin Li }
106*32afb93cSXin Li 
107*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Computes one output cell of the 5x5 convolution for floating-point cell types.
 *
 * The five column indices around x are clamped to [0, width - 1]; the caller supplies the five
 * (already clamped) rows. The 25 products are summed in row-major tap order and stored without
 * any rounding or range clamping.
 *
 * @param x Column of the cell to compute.
 * @param out Where to store the result.
 * @param py0 Start of the line two above.
 * @param py1 Start of the line one above.
 * @param py2 Start of the current line.
 * @param py3 Start of the line one below.
 * @param py4 Start of the line two below.
 * @param coeff The 25 coefficients, row-major.
 * @param width Number of cells per line.
 */
template <typename InputOutputType>
static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
                         const InputOutputType* py1, const InputOutputType* py2,
                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
                         int32_t width) {
    const int32_t center = (int32_t)x;
    const int32_t lastColumn = width - 1;
    const uint32_t columns[5] = {(uint32_t)std::max(center - 2, 0),
                                 (uint32_t)std::max(center - 1, 0), x,
                                 (uint32_t)std::min(center + 1, lastColumn),
                                 (uint32_t)std::min(center + 2, lastColumn)};
    const InputOutputType* const lines[5] = {py0, py1, py2, py3, py4};

    // Seed the accumulator with the first tap so the additions happen in the same order as a
    // single left-to-right sum of all 25 products.
    InputOutputType sum = lines[0][columns[0]] * coeff[0];
    for (int col = 1; col < 5; col++) {
        sum = sum + lines[0][columns[col]] * coeff[col];
    }
    for (int row = 1; row < 5; row++) {
        const InputOutputType* line = lines[row];
        const float* weights = coeff + 5 * row;
        for (int col = 0; col < 5; col++) {
            sum = sum + line[columns[col]] * weights[col];
        }
    }
    *out = sum;
}
135*32afb93cSXin Li #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
136*32afb93cSXin Li 
137*32afb93cSXin Li /**
138*32afb93cSXin Li  * This function convolves one line.
139*32afb93cSXin Li  *
140*32afb93cSXin Li  * @param pout Where to place the next output.
141*32afb93cSXin Li  * @param xstart Index in the X direction of where to start.
142*32afb93cSXin Li  * @param xend End index
143*32afb93cSXin Li  * @param ppy0 Points to the start of the line two above.
144*32afb93cSXin Li  * @param ppy1 Points to the start of the line one above.
145*32afb93cSXin Li  * @param ppy2 Points to the start of the current line.
146*32afb93cSXin Li  * @param ppy3 Points to the start of the line one below.
147*32afb93cSXin Li  * @param ppy4 Points to the start of the line two below.
148*32afb93cSXin Li  */
kernelU4(uchar * pout,uint32_t x1,uint32_t x2,const uchar * ppy0,const uchar * ppy1,const uchar * ppy2,const uchar * ppy3,const uchar * ppy4)149*32afb93cSXin Li void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
150*32afb93cSXin Li                                const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
151*32afb93cSXin Li                                const uchar* ppy4) {
152*32afb93cSXin Li     uchar4* out = (uchar4*)pout;
153*32afb93cSXin Li     const uchar4* py0 = (const uchar4*)ppy0;
154*32afb93cSXin Li     const uchar4* py1 = (const uchar4*)ppy1;
155*32afb93cSXin Li     const uchar4* py2 = (const uchar4*)ppy2;
156*32afb93cSXin Li     const uchar4* py3 = (const uchar4*)ppy3;
157*32afb93cSXin Li     const uchar4* py4 = (const uchar4*)ppy4;
158*32afb93cSXin Li 
159*32afb93cSXin Li     while ((x1 < x2) && (x1 < 2)) {
160*32afb93cSXin Li         ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
161*32afb93cSXin Li         out++;
162*32afb93cSXin Li         x1++;
163*32afb93cSXin Li     }
164*32afb93cSXin Li #if defined(ARCH_X86_HAVE_SSSE3)
165*32afb93cSXin Li     // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
166*32afb93cSXin Li     // 3 for end boundary where x may hit the end boundary)
167*32afb93cSXin Li     if (mUsesSimd && ((x1 + 6) < x2)) {
168*32afb93cSXin Li         // subtract 3 for end boundary
169*32afb93cSXin Li         uint32_t len = (x2 - x1 - 3) >> 2;
170*32afb93cSXin Li         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
171*32afb93cSXin Li                                   py4 + x1 - 2, mIp, len);
172*32afb93cSXin Li         out += len << 2;
173*32afb93cSXin Li         x1 += len << 2;
174*32afb93cSXin Li     }
175*32afb93cSXin Li #endif
176*32afb93cSXin Li 
177*32afb93cSXin Li #if defined(ARCH_ARM_USE_INTRINSICS)
178*32afb93cSXin Li     if (mUsesSimd && ((x1 + 3) < x2)) {
179*32afb93cSXin Li         uint32_t len = (x2 - x1 - 3) >> 1;
180*32afb93cSXin Li         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
181*32afb93cSXin Li                                   py4 + x1 - 2, mIp, len);
182*32afb93cSXin Li         out += len << 1;
183*32afb93cSXin Li         x1 += len << 1;
184*32afb93cSXin Li     }
185*32afb93cSXin Li #endif
186*32afb93cSXin Li 
187*32afb93cSXin Li     while (x1 < x2) {
188*32afb93cSXin Li         ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
189*32afb93cSXin Li         out++;
190*32afb93cSXin Li         x1++;
191*32afb93cSXin Li     }
192*32afb93cSXin Li }
193*32afb93cSXin Li 
194*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
195*32afb93cSXin Li // This will need more cleanup before it can be used.
kernelF4(const ConvolveInfo * info,float4 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)196*32afb93cSXin Li void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out,
197*32afb93cSXin Li                                uint32_t xstart, uint32_t xend, uint32_t currentY) {
198*32afb93cSXin Li     const uchar* pin = (const uchar*)info->in;
199*32afb93cSXin Li     const size_t stride = info->stride;
200*32afb93cSXin Li 
201*32afb93cSXin Li     uint32_t y0 = std::max((int32_t)currentY - 2, 0);
202*32afb93cSXin Li     uint32_t y1 = std::max((int32_t)currentY - 1, 0);
203*32afb93cSXin Li     uint32_t y2 = currentY;
204*32afb93cSXin Li     uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
205*32afb93cSXin Li     uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
206*32afb93cSXin Li 
207*32afb93cSXin Li     const float4* py0 = (const float4*)(pin + stride * y0);
208*32afb93cSXin Li     const float4* py1 = (const float4*)(pin + stride * y1);
209*32afb93cSXin Li     const float4* py2 = (const float4*)(pin + stride * y2);
210*32afb93cSXin Li     const float4* py3 = (const float4*)(pin + stride * y3);
211*32afb93cSXin Li     const float4* py4 = (const float4*)(pin + stride * y4);
212*32afb93cSXin Li 
213*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++, out++) {
214*32afb93cSXin Li         ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
215*32afb93cSXin Li     }
216*32afb93cSXin Li }
217*32afb93cSXin Li 
RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo * info,float2 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)218*32afb93cSXin Li void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out,
219*32afb93cSXin Li                                                uint32_t xstart, uint32_t xend, uint32_t currentY) {
220*32afb93cSXin Li     const uchar* pin = (const uchar*)info->in;
221*32afb93cSXin Li     const size_t stride = info->stride;
222*32afb93cSXin Li 
223*32afb93cSXin Li     uint32_t y0 = std::max((int32_t)currentY - 2, 0);
224*32afb93cSXin Li     uint32_t y1 = std::max((int32_t)currentY - 1, 0);
225*32afb93cSXin Li     uint32_t y2 = currentY;
226*32afb93cSXin Li     uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
227*32afb93cSXin Li     uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
228*32afb93cSXin Li 
229*32afb93cSXin Li     const float2* py0 = (const float2*)(pin + stride * y0);
230*32afb93cSXin Li     const float2* py1 = (const float2*)(pin + stride * y1);
231*32afb93cSXin Li     const float2* py2 = (const float2*)(pin + stride * y2);
232*32afb93cSXin Li     const float2* py3 = (const float2*)(pin + stride * y3);
233*32afb93cSXin Li     const float2* py4 = (const float2*)(pin + stride * y4);
234*32afb93cSXin Li 
235*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++, out++) {
236*32afb93cSXin Li         ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
237*32afb93cSXin Li     }
238*32afb93cSXin Li }
239*32afb93cSXin Li 
RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo * info,float * out,uint32_t xstart,uint32_t xend,uint32_t currentY)240*32afb93cSXin Li void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out,
241*32afb93cSXin Li                                                uint32_t xstart, uint32_t xend, uint32_t currentY) {
242*32afb93cSXin Li     const uchar* pin = (const uchar*)info->in;
243*32afb93cSXin Li     const size_t stride = info->stride;
244*32afb93cSXin Li 
245*32afb93cSXin Li     uint32_t y0 = std::max((int32_t)currentY - 2, 0);
246*32afb93cSXin Li     uint32_t y1 = std::max((int32_t)currentY - 1, 0);
247*32afb93cSXin Li     uint32_t y2 = currentY;
248*32afb93cSXin Li     uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
249*32afb93cSXin Li     uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
250*32afb93cSXin Li 
251*32afb93cSXin Li     const float* py0 = (const float*)(pin + stride * y0);
252*32afb93cSXin Li     const float* py1 = (const float*)(pin + stride * y1);
253*32afb93cSXin Li     const float* py2 = (const float*)(pin + stride * y2);
254*32afb93cSXin Li     const float* py3 = (const float*)(pin + stride * y3);
255*32afb93cSXin Li     const float* py4 = (const float*)(pin + stride * y4);
256*32afb93cSXin Li 
257*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++, out++) {
258*32afb93cSXin Li         ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
259*32afb93cSXin Li     }
260*32afb93cSXin Li }
261*32afb93cSXin Li #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
262*32afb93cSXin Li 
263*32afb93cSXin Li template <typename InputOutputType, typename ComputationType>
convolveU(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY,float * mFp)264*32afb93cSXin Li static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
265*32afb93cSXin Li                       size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) {
266*32afb93cSXin Li     const size_t stride = vectorSize * sizeX;
267*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
268*32afb93cSXin Li         uint32_t y0 = std::max((int32_t)y - 2, 0);
269*32afb93cSXin Li         uint32_t y1 = std::max((int32_t)y - 1, 0);
270*32afb93cSXin Li         uint32_t y2 = y;
271*32afb93cSXin Li         uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
272*32afb93cSXin Li         uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
273*32afb93cSXin Li 
274*32afb93cSXin Li         size_t offset = (y * sizeX + startX) * vectorSize;
275*32afb93cSXin Li         InputOutputType* px = (InputOutputType*)(pout + offset);
276*32afb93cSXin Li         InputOutputType* py0 = (InputOutputType*)(pin + stride * y0);
277*32afb93cSXin Li         InputOutputType* py1 = (InputOutputType*)(pin + stride * y1);
278*32afb93cSXin Li         InputOutputType* py2 = (InputOutputType*)(pin + stride * y2);
279*32afb93cSXin Li         InputOutputType* py3 = (InputOutputType*)(pin + stride * y3);
280*32afb93cSXin Li         InputOutputType* py4 = (InputOutputType*)(pin + stride * y4);
281*32afb93cSXin Li         for (uint32_t x = startX; x < endX; x++, px++) {
282*32afb93cSXin Li             ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp,
283*32afb93cSXin Li                                                            sizeX);
284*32afb93cSXin Li         }
285*32afb93cSXin Li     }
286*32afb93cSXin Li }
287*32afb93cSXin Li 
convolveU4(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY)288*32afb93cSXin Li void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
289*32afb93cSXin Li                                  size_t sizeY, size_t startX, size_t startY, size_t endX,
290*32afb93cSXin Li                                  size_t endY) {
291*32afb93cSXin Li     const size_t stride = paddedSize(vectorSize) * sizeX;
292*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
293*32afb93cSXin Li         uint32_t y0 = std::max((int32_t)y - 2, 0);
294*32afb93cSXin Li         uint32_t y1 = std::max((int32_t)y - 1, 0);
295*32afb93cSXin Li         uint32_t y2 = y;
296*32afb93cSXin Li         uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
297*32afb93cSXin Li         uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
298*32afb93cSXin Li 
299*32afb93cSXin Li         size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
300*32afb93cSXin Li         uchar* px = pout + offset;
301*32afb93cSXin Li         const uchar* py0 = pin + stride * y0;
302*32afb93cSXin Li         const uchar* py1 = pin + stride * y1;
303*32afb93cSXin Li         const uchar* py2 = pin + stride * y2;
304*32afb93cSXin Li         const uchar* py3 = pin + stride * y3;
305*32afb93cSXin Li         const uchar* py4 = pin + stride * y4;
306*32afb93cSXin Li         kernelU4(px, startX, endX, py0, py1, py2, py3, py4);
307*32afb93cSXin Li     }
308*32afb93cSXin Li }
309*32afb93cSXin Li 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)310*32afb93cSXin Li void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
311*32afb93cSXin Li                                   size_t endY) {
312*32afb93cSXin Li     // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
313*32afb93cSXin Li     // endX, endY);
314*32afb93cSXin Li     switch (mVectorSize) {
315*32afb93cSXin Li         case 1:
316*32afb93cSXin Li             convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
317*32afb93cSXin Li                                     startX, startY, endX, endY, mFp);
318*32afb93cSXin Li             break;
319*32afb93cSXin Li         case 2:
320*32afb93cSXin Li             convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
321*32afb93cSXin Li                                       startX, startY, endX, endY, mFp);
322*32afb93cSXin Li             break;
323*32afb93cSXin Li         case 3:
324*32afb93cSXin Li         case 4:
325*32afb93cSXin Li             convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
326*32afb93cSXin Li                        endX, endY);
327*32afb93cSXin Li             break;
328*32afb93cSXin Li     }
329*32afb93cSXin Li }
330*32afb93cSXin Li 
convolve5x5(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)331*32afb93cSXin Li void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX,
332*32afb93cSXin Li                                       size_t sizeY, const float* coefficients,
333*32afb93cSXin Li                                       const Restriction* restriction) {
334*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
335*32afb93cSXin Li     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
336*32afb93cSXin Li         return;
337*32afb93cSXin Li     }
338*32afb93cSXin Li     if (vectorSize < 1 || vectorSize > 4) {
339*32afb93cSXin Li         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
340*32afb93cSXin Li         return;
341*32afb93cSXin Li     }
342*32afb93cSXin Li #endif
343*32afb93cSXin Li 
344*32afb93cSXin Li     Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
345*32afb93cSXin Li     processor->doTask(&task);
346*32afb93cSXin Li }
347*32afb93cSXin Li 
348*32afb93cSXin Li }  // namespace renderscript
349