1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <math.h>
18 
19 #include <cstdint>
20 
21 #include "RenderScriptToolkit.h"
22 #include "TaskProcessor.h"
23 #include "Utils.h"
24 
25 #if defined(ARCH_X86_HAVE_AVX2)
26 #include <stdint.h>
27 #include <x86intrin.h>
28 #include <xmmintrin.h>
29 #endif
30 
31 #define LOG_TAG "renderscript.toolkit.Resize"
32 
33 namespace renderscript {
34 
35 class ResizeTask : public Task {
36     const uchar* mIn;
37     uchar* mOut;
38     float mScaleX;
39     float mScaleY;
40     size_t mInputSizeX;
41     size_t mInputSizeY;
42 
43     void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
44     void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
45     void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
46 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
47     void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
48     void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
49     void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
50 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
51 
52     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
53     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
54                      size_t endY) override;
55 
56    public:
ResizeTask(const uchar * input,uchar * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)57     ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
58                size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
59                const Restriction* restriction)
60         : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},
61           mIn{input},
62           mOut{output},
63           mInputSizeX{inputSizeX},
64           mInputSizeY{inputSizeY} {
65         mScaleX = static_cast<float>(inputSizeX) / outputSizeX;
66         mScaleY = static_cast<float>(inputSizeY) / outputSizeY;
67     }
68 };
69 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)70 void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
71                              size_t endY) {
72     typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t);
73 
74     KernelFunction kernel;
75     switch (mVectorSize) {
76         case 4:
77             kernel = &ResizeTask::kernelU4;
78             break;
79         case 3:
80             kernel = &ResizeTask::kernelU4;
81             break;
82         case 2:
83             kernel = &ResizeTask::kernelU2;
84             break;
85         case 1:
86             kernel = &ResizeTask::kernelU1;
87             break;
88         default:
89             ALOGE("Bad vector size %zd", mVectorSize);
90     }
91 
92     for (size_t y = startY; y < endY; y++) {
93         size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
94         uchar* out = mOut + offset;
95         std::invoke(kernel, this, out, startX, endX, y);
96     }
97 }
98 
cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3,float x)99 static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
100     return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
101             + x * (3.f * (p1 - p2) + p3 - p0)));
102 }
103 
cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3,float x)104 static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
105     return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
106             + x * (3.f * (p1 - p2) + p3 - p0)));
107 }
108 
109 
#if defined(ARCH_X86_HAVE_AVX2)
/**
 * Scalar cubic interpolation through four equally spaced samples (the Catmull-Rom cubic).
 * This variant computes two of the sub-terms with fused multiply-add intrinsics.
 *
 * @param p0,p1,p2,p3 Consecutive samples; the result lies on the curve between p1 and p2.
 * @param x           Position between p1 and p2.
 */
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    // 4 * p2 - p3 as a single fused operation.
    const float fourP2MinusP3 =
            _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2), _mm_set1_ps(p3)));
    // 3 * (p1 - p2) + (p3 - p0) as a single fused operation.
    const float cubicTerm = _mm_cvtss_f32(
            _mm_fmadd_ss(_mm_set1_ps(3.f), _mm_set1_ps(p1 - p2), _mm_set1_ps(p3 - p0)));
    const float quadraticTerm = 2.f * p0 - 5.f * p1 + fourP2MinusP3 + x * cubicTerm;
    const float linearTerm = p2 - p0 + x * quadraticTerm;
    return p1 + 0.5f * x * linearTerm;
}
#else
/**
 * Scalar cubic interpolation through four equally spaced samples (the Catmull-Rom cubic).
 *
 * Returns p1 for x = 0 and p2 for x = 1.
 *
 * @param p0,p1,p2,p3 Consecutive samples; the result lies on the curve between p1 and p2.
 * @param x           Position between p1 and p2.
 */
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    // Horner evaluation; each step keeps the operand order of the closed-form expression.
    const float cubicTerm = 3.f * (p1 - p2) + p3 - p0;
    const float quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
    const float linearTerm = p2 - p0 + x * quadraticTerm;
    return p1 + 0.5f * x * linearTerm;
}
#endif
125 
/**
 * Computes one uchar4 output cell from a 4x4 neighborhood of the source.
 *
 * Each of the four rows is interpolated along x, then the four row results are
 * interpolated along y. The result is rounded by adding 0.5 and clamped to [0, 255].
 *
 * @param yp0,yp1,yp2,yp3 Start of the four source rows to sample.
 * @param xf              Source x coordinate; its fractional part is the x weight.
 * @param yf              Interpolation position along y, passed as-is to cubicInterpolate.
 * @param width           Number of cells in a source row.
 */
static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                         float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);
    const int lastColumn = width - 1;
    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto interpolateRow = [=](const uchar4 *row) -> float4 {
        return cubicInterpolate(convert<float4>(row[col0]), convert<float4>(row[col1]),
                                convert<float4>(row[col2]), convert<float4>(row[col3]), fracX);
    };

    const float4 row0 = interpolateRow(yp0);
    const float4 row1 = interpolateRow(yp1);
    const float4 row2 = interpolateRow(yp2);
    const float4 row3 = interpolateRow(yp3);

    // Vertical pass, then round and clamp before converting back to bytes.
    const float4 blended = cubicInterpolate(row0, row1, row2, row3, yf);
    const float4 rounded = clamp(blended + 0.5f, 0.f, 255.f);
    return convert<uchar4>(rounded);
}
160 
/**
 * Computes one uchar2 output cell from a 4x4 neighborhood of the source.
 *
 * Each of the four rows is interpolated along x, then the four row results are
 * interpolated along y. The result is rounded by adding 0.5 and clamped to [0, 255].
 *
 * @param yp0,yp1,yp2,yp3 Start of the four source rows to sample.
 * @param xf              Source x coordinate; its fractional part is the x weight.
 * @param yf              Interpolation position along y, passed as-is to cubicInterpolate.
 * @param width           Number of cells in a source row.
 */
static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                         float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);
    const int lastColumn = width - 1;
    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto interpolateRow = [=](const uchar2 *row) -> float2 {
        return cubicInterpolate(convert<float2>(row[col0]), convert<float2>(row[col1]),
                                convert<float2>(row[col2]), convert<float2>(row[col3]), fracX);
    };

    const float2 row0 = interpolateRow(yp0);
    const float2 row1 = interpolateRow(yp1);
    const float2 row2 = interpolateRow(yp2);
    const float2 row3 = interpolateRow(yp3);

    // Vertical pass, then round and clamp before converting back to bytes.
    const float2 blended = cubicInterpolate(row0, row1, row2, row3, yf);
    const float2 rounded = clamp(blended + 0.5f, 0.f, 255.f);
    return convert<uchar2>(rounded);
}
195 
/**
 * Computes one single-byte output cell from a 4x4 neighborhood of the source.
 *
 * Each of the four rows is interpolated along x, then the four row results are
 * interpolated along y. The result is rounded by adding 0.5 and clamped to [0, 255].
 *
 * @param yp0,yp1,yp2,yp3 Start of the four source rows to sample.
 * @param xf              Source x coordinate; its fractional part is the x weight.
 * @param yf              Interpolation position along y, passed as-is to cubicInterpolate.
 * @param width           Number of cells in a source row.
 */
static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                        float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);
    const int lastColumn = width - 1;
    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto interpolateRow = [=](const uchar *row) -> float {
        return cubicInterpolate(static_cast<float>(row[col0]), static_cast<float>(row[col1]),
                                static_cast<float>(row[col2]), static_cast<float>(row[col3]),
                                fracX);
    };

    const float row0 = interpolateRow(yp0);
    const float row1 = interpolateRow(yp1);
    const float row2 = interpolateRow(yp2);
    const float row3 = interpolateRow(yp3);

    // Vertical pass, then round and clamp before truncating to a byte.
    const float blended = cubicInterpolate(row0, row1, row2, row3, yf);
    const float rounded = clamp(blended + 0.5f, 0.f, 255.f);
    return static_cast<uchar>(rounded);
}
220 
// Externally defined helper (C linkage). The kernels in this file call it with the
// horizontal step in 16.16 fixed point and pass the returned value as the osc_ctl argument
// of the rsdIntrinsicResizeB*_K routines below.
extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);

// Externally defined row kernel for uchar4 cells (C linkage). As called from kernelU4:
//   dst/count          output pointer and number of cells to produce,
//   xf/xinc            fractional start position and step, both 16.16 fixed point,
//   srcn/src0/src1/src2 the four source rows, already advanced to the first column read,
//   xclip/avail        values derived from the first column read and the source width,
//   osc_ctl            the result of rsdIntrinsicResize_oscctl_K(xinc),
//   yr                 four vertical coefficients produced by mkYCoeff.
// NOTE(review): the exact contract of xclip/avail/osc_ctl lives in the implementation,
// which is not part of this file.
extern "C" void rsdIntrinsicResizeB4_K(
            uchar4 *dst,
            size_t count,
            uint32_t xf,
            uint32_t xinc,
            uchar4 const *srcn,
            uchar4 const *src0,
            uchar4 const *src1,
            uchar4 const *src2,
            size_t xclip,
            size_t avail,
            uint64_t osc_ctl,
            int32_t const *yr);

// Same as rsdIntrinsicResizeB4_K, for uchar2 cells (called from kernelU2).
extern "C" void rsdIntrinsicResizeB2_K(
            uchar2 *dst,
            size_t count,
            uint32_t xf,
            uint32_t xinc,
            uchar2 const *srcn,
            uchar2 const *src0,
            uchar2 const *src1,
            uchar2 const *src2,
            size_t xclip,
            size_t avail,
            uint64_t osc_ctl,
            int32_t const *yr);

// Same as rsdIntrinsicResizeB4_K, for single-byte cells (called from kernelU1).
extern "C" void rsdIntrinsicResizeB1_K(
            uchar *dst,
            size_t count,
            uint32_t xf,
            uint32_t xinc,
            uchar const *srcn,
            uchar const *src0,
            uchar const *src1,
            uchar const *src2,
            size_t xclip,
            size_t avail,
            uint64_t osc_ctl,
            int32_t const *yr);
264 
265 #if defined(ARCH_ARM_USE_INTRINSICS)
/**
 * Fills yr[0..3] with the four vertical cubic weights for position yf, in 16.16 fixed point.
 *
 * yr[1] and yr[2] hold the weights of the two middle rows. yr[0] and yr[3] hold the
 * weights of the outer rows with their sign flipped, so for yf in [0, 1) all four values
 * are non-negative.
 *
 * @param yr Output array of at least four elements.
 * @param yf Position between the two middle rows.
 */
static void mkYCoeff(int32_t *yr, float yf) {
    // yf, yf^2 and yf^3 scaled to 16.16 fixed point.
    const int32_t linear = static_cast<int32_t>(rint(yf * 0x10000));
    const int32_t square = static_cast<int32_t>(rint(yf * yf * 0x10000));
    const int32_t cube = static_cast<int32_t>(rint(yf * yf * yf * 0x10000));

    // Each polynomial is doubled here; the final shift by one halves it.
    const int32_t outerBefore = -(2 * square - cube - linear);
    const int32_t innerBefore = 3 * cube - 5 * square + 0x20000;
    const int32_t innerAfter = -3 * cube + 4 * square + linear;
    const int32_t outerAfter = -(cube - square);

    yr[0] = outerBefore >> 1;
    yr[1] = innerBefore >> 1;
    yr[2] = innerAfter >> 1;
    yr[3] = outerAfter >> 1;
}
276 #endif
277 
278 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Computes one float4 output cell from a 4x4 neighborhood of the source.
 *
 * Each of the four rows is interpolated along x, then the four row results are
 * interpolated along y. The result is returned without rounding or clamping.
 *
 * @param yp0,yp1,yp2,yp3 Start of the four source rows to sample.
 * @param xf              Source x coordinate; its fractional part is the x weight.
 * @param yf              Interpolation position along y, passed as-is to cubicInterpolate.
 * @param width           Number of cells in a source row.
 */
static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
                         float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);
    const int lastColumn = width - 1;
    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto interpolateRow = [=](const float4 *row) -> float4 {
        return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], fracX);
    };

    const float4 row0 = interpolateRow(yp0);
    const float4 row1 = interpolateRow(yp1);
    const float4 row2 = interpolateRow(yp2);
    const float4 row3 = interpolateRow(yp3);

    // Vertical pass across the four row results.
    return cubicInterpolate(row0, row1, row2, row3, yf);
}
301 
/**
 * Computes one float2 output cell from a 4x4 neighborhood of the source.
 *
 * Each of the four rows is interpolated along x, then the four row results are
 * interpolated along y. The result is returned without rounding or clamping.
 *
 * @param yp0,yp1,yp2,yp3 Start of the four source rows to sample.
 * @param xf              Source x coordinate; its fractional part is the x weight.
 * @param yf              Interpolation position along y, passed as-is to cubicInterpolate.
 * @param width           Number of cells in a source row.
 */
static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
                         float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);
    const int lastColumn = width - 1;
    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto interpolateRow = [=](const float2 *row) -> float2 {
        return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], fracX);
    };

    const float2 row0 = interpolateRow(yp0);
    const float2 row1 = interpolateRow(yp1);
    const float2 row2 = interpolateRow(yp2);
    const float2 row3 = interpolateRow(yp3);

    // Vertical pass across the four row results.
    return cubicInterpolate(row0, row1, row2, row3, yf);
}
324 
OneBiCubic(const float * yp0,const float * yp1,const float * yp2,const float * yp3,float xf,float yf,int width)325 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
326                         float xf, float yf, int width) {
327     int startx = (int) floor(xf - 1);
328     xf = xf - floor(xf);
329     int maxx = width - 1;
330     int xs0 = std::max(0, startx + 0);
331     int xs1 = std::max(0, startx + 1);
332     int xs2 = std::min(maxx, startx + 2);
333     int xs3 = std::min(maxx, startx + 3);
334 
335     float p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
336                                  yp0[xs2], yp0[xs3], xf);
337     float p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
338                                  yp1[xs2], yp1[xs3], xf);
339     float p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
340                                  yp2[xs2], yp2[xs3], xf);
341     float p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
342                                  yp3[xs2], yp3[xs3], xf);
343 
344     float p  = cubicInterpolate(p0, p1, p2, p3, yf);
345     return p;
346 }
347 #endif
348 
kernelU4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)349 void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
350     const uchar *pin = mIn;
351     const int srcHeight = mInputSizeY;
352     const int srcWidth = mInputSizeX;
353     const size_t stride = mInputSizeX * paddedSize(mVectorSize);
354 
355 
356 #if defined(ARCH_X86_HAVE_AVX2)
357     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
358                                           _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
359 #else
360     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
361 #endif
362 
363 
364     int starty = (int) floor(yf - 1);
365     yf = yf - floor(yf);
366     int maxy = srcHeight - 1;
367     int ys0 = std::max(0, starty + 0);
368     int ys1 = std::max(0, starty + 1);
369     int ys2 = std::min(maxy, starty + 2);
370     int ys3 = std::min(maxy, starty + 3);
371 
372     const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
373     const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
374     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
375     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
376 
377     uchar4 *out = ((uchar4 *)outPtr);
378     uint32_t x1 = xstart;
379     uint32_t x2 = xend;
380 
381 #if defined(ARCH_ARM_USE_INTRINSICS)
382     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
383         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
384         long xf16 = rint(xf * 0x10000);
385         uint32_t xinc16 = rint(mScaleX * 0x10000);
386 
387         int xoff = (xf16 >> 16) - 1;
388         int xclip = std::max(0, xoff) - xoff;
389         int len = x2 - x1;
390 
391         int32_t yr[4];
392         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
393         mkYCoeff(yr, yf);
394 
395         xoff += xclip;
396 
397         rsdIntrinsicResizeB4_K(
398                 out, len,
399                 xf16 & 0xffff, xinc16,
400                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
401                 xclip, srcWidth - xoff + xclip,
402                 osc_ctl, yr);
403         out += len;
404         x1 += len;
405     }
406 #endif
407 
408     while(x1 < x2) {
409 #if defined(ARCH_X86_HAVE_AVX2)
410         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
411                                               _mm_set1_ps(0.5f)));
412 #else
413         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
414 #endif
415         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
416         out++;
417         x1++;
418     }
419 }
420 
kernelU2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)421 void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
422     const uchar *pin = mIn;
423     const int srcHeight = mInputSizeY;
424     const int srcWidth = mInputSizeX;
425     const size_t stride = mInputSizeX * mVectorSize;
426 
427 
428 #if defined(ARCH_X86_HAVE_AVX2)
429     float yf = _mm_cvtss_f32(
430             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
431 #else
432     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
433 #endif
434 
435     int starty = (int) floor(yf - 1);
436     yf = yf - floor(yf);
437     int maxy = srcHeight - 1;
438     int ys0 = std::max(0, starty + 0);
439     int ys1 = std::max(0, starty + 1);
440     int ys2 = std::min(maxy, starty + 2);
441     int ys3 = std::min(maxy, starty + 3);
442 
443     const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
444     const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
445     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
446     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
447 
448     uchar2 *out = ((uchar2 *)outPtr);
449     uint32_t x1 = xstart;
450     uint32_t x2 = xend;
451 
452 #if defined(ARCH_ARM_USE_INTRINSICS)
453     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
454         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
455         long xf16 = rint(xf * 0x10000);
456         uint32_t xinc16 = rint(mScaleX * 0x10000);
457 
458         int xoff = (xf16 >> 16) - 1;
459         int xclip = std::max(0, xoff) - xoff;
460         int len = x2 - x1;
461 
462         int32_t yr[4];
463         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
464         mkYCoeff(yr, yf);
465 
466         xoff += xclip;
467 
468         rsdIntrinsicResizeB2_K(
469                 out, len,
470                 xf16 & 0xffff, xinc16,
471                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
472                 xclip, srcWidth - xoff + xclip,
473                 osc_ctl, yr);
474         out += len;
475         x1 += len;
476     }
477 #endif
478 
479     while(x1 < x2) {
480 
481 #if defined(ARCH_X86_HAVE_AVX2)
482         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
483                                               _mm_set1_ps(0.5f)));
484 #else
485         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
486 #endif
487         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
488         out++;
489         x1++;
490     }
491 }
492 
kernelU1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)493 void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
494     //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
495     const uchar *pin = mIn;
496     const int srcHeight = mInputSizeY;
497     const int srcWidth = mInputSizeX;
498     const size_t stride = mInputSizeX * mVectorSize;
499 
500     // ALOGI("Toolkit   ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
501     // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);
502 
503 #if defined(ARCH_X86_HAVE_AVX2)
504     float yf = _mm_cvtss_f32(
505             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
506 #else
507     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
508 #endif
509 
510     int starty = (int) floor(yf - 1);
511     yf = yf - floor(yf);
512     int maxy = srcHeight - 1;
513     int ys0 = std::max(0, starty + 0);
514     int ys1 = std::min(maxy, std::max(0, starty + 1));
515     int ys2 = std::min(maxy, starty + 2);
516     int ys3 = std::min(maxy, starty + 3);
517 
518     const uchar *yp0 = pin + stride * ys0;
519     const uchar *yp1 = pin + stride * ys1;
520     const uchar *yp2 = pin + stride * ys2;
521     const uchar *yp3 = pin + stride * ys3;
522 
523     uchar *out = ((uchar *)outPtr);
524     uint32_t x1 = xstart;
525     uint32_t x2 = xend;
526 
527 #if defined(ARCH_ARM_USE_INTRINSICS)
528     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
529         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
530         long xf16 = rint(xf * 0x10000);
531         uint32_t xinc16 = rint(mScaleX * 0x10000);
532 
533         int xoff = (xf16 >> 16) - 1;
534         int xclip = std::max(0, xoff) - xoff;
535         int len = x2 - x1;
536 
537         int32_t yr[4];
538         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
539         mkYCoeff(yr, yf);
540 
541         // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
542         // xclip %d, len %d, osc_ctl %lu)",
543         //       ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
544         //       osc_ctl);
545         // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
546         // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
547         // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);
548 
549         xoff += xclip;
550 
551         rsdIntrinsicResizeB1_K(
552                 out, len,
553                 xf16 & 0xffff, xinc16,
554                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
555                 xclip, srcWidth - xoff + xclip,
556                 osc_ctl, yr);
557         out += len;
558         x1 += len;
559     }
560 #endif
561 
562     while(x1 < x2) {
563 
564 #if defined(ARCH_X86_HAVE_AVX2)
565         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
566                                               _mm_set1_ps(0.5f)));
567 #else
568         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
569 #endif
570 
571         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
572         out++;
573         x1++;
574     }
575 }
576 
577 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
kernelF4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)578 void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
579     const uchar *pin = mIn;
580     const int srcHeight = inputSizeY;
581     const int srcWidth = inputSizeX;
582     const size_t stride = sizeX * vectorSize;
583 
584 #if defined(ARCH_X86_HAVE_AVX2)
585     float yf = _mm_cvtss_f32(
586             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
587 #else
588     float yf = (currentY + 0.5f) * scaleY - 0.5f;
589 #endif
590 
591     int starty = (int) floor(yf - 1);
592     yf = yf - floor(yf);
593     int maxy = srcHeight - 1;
594     int ys0 = std::max(0, starty + 0);
595     int ys1 = std::max(0, starty + 1);
596     int ys2 = std::min(maxy, starty + 2);
597     int ys3 = std::min(maxy, starty + 3);
598 
599     const float4 *yp0 = (const float4 *)(pin + stride * ys0);
600     const float4 *yp1 = (const float4 *)(pin + stride * ys1);
601     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
602     const float4 *yp3 = (const float4 *)(pin + stride * ys3);
603 
604     float4 *out = ((float4 *)outPtr);
605     uint32_t x1 = xstart;
606     uint32_t x2 = xend;
607 
608     while(x1 < x2) {
609 
610 #if defined(ARCH_X86_HAVE_AVX2)
611         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
612                                               _mm_set1_ps(0.5f)));
613 #else
614         float xf = (x1 + 0.5f) * scaleX - 0.5f;
615 #endif
616 
617         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
618         out++;
619         x1++;
620     }
621 }
622 
kernelF2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)623 void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
624     const uchar *pin = mIn;
625     const int srcHeight = inputSizeY;
626     const int srcWidth = inputSizeX;
627     const size_t stride = sizeX * vectorSize;
628 
629 
630 #if defined(ARCH_X86_HAVE_AVX2)
631     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
632                                           _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
633 #else
634     float yf = (currentY + 0.5f) * scaleY - 0.5f;
635 #endif
636 
637     int starty = (int) floor(yf - 1);
638     yf = yf - floor(yf);
639     int maxy = srcHeight - 1;
640     int ys0 = std::max(0, starty + 0);
641     int ys1 = std::max(0, starty + 1);
642     int ys2 = std::min(maxy, starty + 2);
643     int ys3 = std::min(maxy, starty + 3);
644 
645     const float2 *yp0 = (const float2 *)(pin + stride * ys0);
646     const float2 *yp1 = (const float2 *)(pin + stride * ys1);
647     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
648     const float2 *yp3 = (const float2 *)(pin + stride * ys3);
649 
650     float2 *out = ((float2 *)outPtr);
651     uint32_t x1 = xstart;
652     uint32_t x2 = xend;
653 
654     while(x1 < x2) {
655 
656 #if defined(ARCH_X86_HAVE_AVX2)
657         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
658                                               _mm_set1_ps(0.5f)));
659 #else
660         float xf = (x1 + 0.5f) * scaleX - 0.5f;
661 #endif
662 
663         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
664         out++;
665         x1++;
666     }
667 }
668 
kernelF1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)669 void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
670     const uchar *pin = mIn;
671     const int srcHeight = inputSizeY;
672     const int srcWidth = inputSizeX;
673     const size_t stride = sizeX * vectorSize;
674 
675 
676 #if defined(ARCH_X86_HAVE_AVX2)
677     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
678                                           _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
679 #else
680     float yf = (currentY + 0.5f) * scaleY - 0.5f;
681 #endif
682 
683     int starty = (int) floor(yf - 1);
684     yf = yf - floor(yf);
685     int maxy = srcHeight - 1;
686     int ys0 = std::max(0, starty + 0);
687     int ys1 = std::max(0, starty + 1);
688     int ys2 = std::min(maxy, starty + 2);
689     int ys3 = std::min(maxy, starty + 3);
690 
691     const float *yp0 = (const float *)(pin + stride * ys0);
692     const float *yp1 = (const float *)(pin + stride * ys1);
693     const float *yp2 = (const float *)(pin + stride * ys2);
694     const float *yp3 = (const float *)(pin + stride * ys3);
695 
696     float *out = ((float *)outPtr);
697     uint32_t x1 = xstart;
698     uint32_t x2 = xend;
699 
700     while(x1 < x2) {
701 
702 #if defined(ARCH_X86_HAVE_AVX2)
703         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
704                                               _mm_set1_ps(0.5f)));
705 #else
706         float xf = (x1 + 0.5f) * scaleX - 0.5f;
707 #endif
708 
709         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
710         out++;
711         x1++;
712     }
713 }
714 
// Selects the row kernel from the element type and vector size of mAlloc and stores it in
// mRootPtr: the U kernels for RS_TYPE_UNSIGNED_8 data, the F kernels for everything else.
// Vector sizes 3 and 4 share the 4-component kernel. Neither parameter is used.
//
// NOTE(review): this function is compiled only when ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
// is defined. The ResizeTask class in this file declares no preLaunch, mAlloc or mRootPtr, and
// the kernels are non-static members, so `&kernelU1` is not a valid pointer-to-member
// expression. This looks like a leftover from an earlier implementation — confirm whether it
// should be removed or ported.
void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
{

    // Check the data type to determine F or U.
    if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
        switch(mAlloc->getType()->getElement()->getVectorSize()) {
        case 1:
            mRootPtr = &kernelU1;
            break;
        case 2:
            mRootPtr = &kernelU2;
            break;
        case 3:
        case 4:
            mRootPtr = &kernelU4;
            break;
        }
    } else {
        switch(mAlloc->getType()->getElement()->getVectorSize()) {
        case 1:
            mRootPtr = &kernelF1;
            break;
        case 2:
            mRootPtr = &kernelF2;
            break;
        case 3:
        case 4:
            mRootPtr = &kernelF4;
            break;
        }
    }
}
747 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
748 
resize(const uint8_t * input,uint8_t * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)749 void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
750                                  size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
751                                  size_t outputSizeY, const Restriction* restriction) {
752 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
753     if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
754         return;
755     }
756     if (vectorSize < 1 || vectorSize > 4) {
757         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
758         return;
759     }
760 #endif
761 
762     ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
763                     outputSizeX, outputSizeY, restriction);
764     processor->doTask(&task);
765 }
766 
767 }  // namespace renderscript
768