1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <math.h>
18
19 #include <cstdint>
20
21 #include "RenderScriptToolkit.h"
22 #include "TaskProcessor.h"
23 #include "Utils.h"
24
25 #if defined(ARCH_X86_HAVE_AVX2)
26 #include <stdint.h>
27 #include <x86intrin.h>
28 #include <xmmintrin.h>
29 #endif
30
31 #define LOG_TAG "renderscript.toolkit.Resize"
32
33 namespace renderscript {
34
35 class ResizeTask : public Task {
36 const uchar* mIn;
37 uchar* mOut;
38 float mScaleX;
39 float mScaleY;
40 size_t mInputSizeX;
41 size_t mInputSizeY;
42
43 void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
44 void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
45 void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
46 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
47 void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
48 void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
49 void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
50 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
51
52 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
53 void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
54 size_t endY) override;
55
56 public:
ResizeTask(const uchar * input,uchar * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)57 ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
58 size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
59 const Restriction* restriction)
60 : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},
61 mIn{input},
62 mOut{output},
63 mInputSizeX{inputSizeX},
64 mInputSizeY{inputSizeY} {
65 mScaleX = static_cast<float>(inputSizeX) / outputSizeX;
66 mScaleY = static_cast<float>(inputSizeY) / outputSizeY;
67 }
68 };
69
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)70 void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
71 size_t endY) {
72 typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t);
73
74 KernelFunction kernel;
75 switch (mVectorSize) {
76 case 4:
77 kernel = &ResizeTask::kernelU4;
78 break;
79 case 3:
80 kernel = &ResizeTask::kernelU4;
81 break;
82 case 2:
83 kernel = &ResizeTask::kernelU2;
84 break;
85 case 1:
86 kernel = &ResizeTask::kernelU1;
87 break;
88 default:
89 ALOGE("Bad vector size %zd", mVectorSize);
90 }
91
92 for (size_t y = startY; y < endY; y++) {
93 size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
94 uchar* out = mOut + offset;
95 std::invoke(kernel, this, out, startX, endX, y);
96 }
97 }
98
cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3,float x)99 static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
100 return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
101 + x * (3.f * (p1 - p2) + p3 - p0)));
102 }
103
cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3,float x)104 static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
105 return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
106 + x * (3.f * (p1 - p2) + p3 - p0)));
107 }
108
109
// Scalar variant: evaluates the cubic through four consecutive samples at fractional position
// x. The result equals p1 at x = 0 and p2 at x = 1. When ARCH_X86_HAVE_AVX2 is defined, two of
// the sub-expressions are computed with fused multiply-add/subtract intrinsics.
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
#if defined(ARCH_X86_HAVE_AVX2)
    // 4 * p2 - p3, fused.
    const float fourP2MinusP3 =
            _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2), _mm_set1_ps(p3)));
    // 3 * (p1 - p2) + (p3 - p0), fused.
    const float cubicTerm = _mm_cvtss_f32(
            _mm_fmadd_ss(_mm_set1_ps(3.f), _mm_set1_ps(p1 - p2), _mm_set1_ps(p3 - p0)));
    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + fourP2MinusP3 + x * cubicTerm));
#else
    // Terms of the polynomial in x, innermost factor last.
    const float linearTerm = p2 - p0;
    const float quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3;
    const float cubicTerm = 3.f * (p1 - p2) + p3 - p0;
    return p1 + 0.5f * x * (linearTerm + x * (quadraticTerm + x * cubicTerm));
#endif
}
125
/**
 * Produces one 4-byte output cell by interpolating a 4x4 neighbourhood of the source.
 *
 * @param yp0..yp3 The four source rows to sample.
 * @param xf       Source-space column coordinate of the output cell.
 * @param yf       Fractional position passed to the vertical interpolation.
 * @param width    Number of cells in each source row.
 * @return The interpolated cell, rounded and clamped to [0, 255] per component.
 */
static uchar4 OneBiCubic(const uchar4* yp0, const uchar4* yp1, const uchar4* yp2,
                         const uchar4* yp3, float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto filterRow = [=](const uchar4* row) -> float4 {
        return cubicInterpolate(convert<float4>(row[col0]), convert<float4>(row[col1]),
                                convert<float4>(row[col2]), convert<float4>(row[col3]), fraction);
    };

    // Vertical pass over the four horizontally filtered rows.
    const float4 blended = cubicInterpolate(filterRow(yp0), filterRow(yp1), filterRow(yp2),
                                            filterRow(yp3), yf);
    // Adding 0.5 before the conversion rounds to nearest instead of truncating.
    return convert<uchar4>(clamp(blended + 0.5f, 0.f, 255.f));
}
160
/**
 * Produces one 2-byte output cell by interpolating a 4x4 neighbourhood of the source.
 *
 * @param yp0..yp3 The four source rows to sample.
 * @param xf       Source-space column coordinate of the output cell.
 * @param yf       Fractional position passed to the vertical interpolation.
 * @param width    Number of cells in each source row.
 * @return The interpolated cell, rounded and clamped to [0, 255] per component.
 */
static uchar2 OneBiCubic(const uchar2* yp0, const uchar2* yp1, const uchar2* yp2,
                         const uchar2* yp3, float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto filterRow = [=](const uchar2* row) -> float2 {
        return cubicInterpolate(convert<float2>(row[col0]), convert<float2>(row[col1]),
                                convert<float2>(row[col2]), convert<float2>(row[col3]), fraction);
    };

    // Vertical pass over the four horizontally filtered rows.
    const float2 blended = cubicInterpolate(filterRow(yp0), filterRow(yp1), filterRow(yp2),
                                            filterRow(yp3), yf);
    // Adding 0.5 before the conversion rounds to nearest instead of truncating.
    return convert<uchar2>(clamp(blended + 0.5f, 0.f, 255.f));
}
195
/**
 * Produces one single-byte output cell by interpolating a 4x4 neighbourhood of the source.
 *
 * @param yp0..yp3 The four source rows to sample.
 * @param xf       Source-space column coordinate of the output cell.
 * @param yf       Fractional position passed to the vertical interpolation.
 * @param width    Number of cells in each source row.
 * @return The interpolated value, rounded and clamped to [0, 255].
 */
static uchar OneBiCubic(const uchar* yp0, const uchar* yp1, const uchar* yp2, const uchar* yp3,
                        float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto filterRow = [=](const uchar* row) -> float {
        return cubicInterpolate(static_cast<float>(row[col0]), static_cast<float>(row[col1]),
                                static_cast<float>(row[col2]), static_cast<float>(row[col3]),
                                fraction);
    };

    // Vertical pass over the four horizontally filtered rows.
    const float blended = cubicInterpolate(filterRow(yp0), filterRow(yp1), filterRow(yp2),
                                           filterRow(yp3), yf);
    // Adding 0.5 before the conversion rounds to nearest instead of truncating.
    const float rounded = clamp(blended + 0.5f, 0.f, 255.f);
    return static_cast<uchar>(rounded);
}
220
// Fast-path resize routines with C linkage. They are only declared here; their definitions are
// not part of this file (presumably hand-written assembly -- TODO confirm). The kernels in
// this file call them only when ARCH_ARM_USE_INTRINSICS is defined.

// Called by the kernels with the 16.16 fixed-point horizontal step; the returned value is
// passed on unchanged as the osc_ctl argument of the rsdIntrinsicResizeB*_K routines.
extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);

// Resizes a run of 4-byte cells. As used by kernelU4:
//   dst      first output cell to write
//   count    number of output cells to produce
//   xf       low 16 bits of the 16.16 fixed-point source x coordinate of the first cell
//   xinc     16.16 fixed-point source step per output cell
//   srcn, src0, src1, src2
//            the four source rows, already advanced to the first column to read
//   xclip    number of columns by which the read window was moved right to start at column 0
//   avail    the callers pass srcWidth - xoff + xclip
//   osc_ctl  value returned by rsdIntrinsicResize_oscctl_K(xinc)
//   yr       four vertical coefficients produced by mkYCoeff
extern "C" void rsdIntrinsicResizeB4_K(
        uchar4 *dst,
        size_t count,
        uint32_t xf,
        uint32_t xinc,
        uchar4 const *srcn,
        uchar4 const *src0,
        uchar4 const *src1,
        uchar4 const *src2,
        size_t xclip,
        size_t avail,
        uint64_t osc_ctl,
        int32_t const *yr);

// Same contract as rsdIntrinsicResizeB4_K, for 2-byte cells (used by kernelU2).
extern "C" void rsdIntrinsicResizeB2_K(
        uchar2 *dst,
        size_t count,
        uint32_t xf,
        uint32_t xinc,
        uchar2 const *srcn,
        uchar2 const *src0,
        uchar2 const *src1,
        uchar2 const *src2,
        size_t xclip,
        size_t avail,
        uint64_t osc_ctl,
        int32_t const *yr);

// Same contract as rsdIntrinsicResizeB4_K, for single-byte cells (used by kernelU1).
extern "C" void rsdIntrinsicResizeB1_K(
        uchar *dst,
        size_t count,
        uint32_t xf,
        uint32_t xinc,
        uchar const *srcn,
        uchar const *src0,
        uchar const *src1,
        uchar const *src2,
        size_t xclip,
        size_t avail,
        uint64_t osc_ctl,
        int32_t const *yr);
264
265 #if defined(ARCH_ARM_USE_INTRINSICS)
// Fills yr[0..3] with the vertical cubic weights for fractional offset yf, as 16.16 fixed-point
// integers. yr[1] and yr[2] are the weights cubicInterpolate applies to p1 and p2; yr[0] and
// yr[3] are the negated weights of p0 and p3 (presumably the sign convention expected by the
// rsdIntrinsicResizeB*_K routines -- TODO confirm).
static void mkYCoeff(int32_t* yr, float yf) {
    // yf, yf^2 and yf^3 in 16.16 fixed point.
    const float ySquared = yf * yf;
    const int32_t fixedY = static_cast<int32_t>(rint(yf * 0x10000));
    const int32_t fixedY2 = static_cast<int32_t>(rint(ySquared * 0x10000));
    const int32_t fixedY3 = static_cast<int32_t>(rint(ySquared * yf * 0x10000));

    // Each weight is built at twice its value and halved by the shift in the loop.
    const int32_t doubledWeights[4] = {
            -(2 * fixedY2 - fixedY3 - fixedY),
            3 * fixedY3 - 5 * fixedY2 + 0x20000,
            -3 * fixedY3 + 4 * fixedY2 + fixedY,
            -(fixedY3 - fixedY2),
    };
    for (int i = 0; i < 4; i++) {
        yr[i] = doubledWeights[i] >> 1;
    }
}
276 #endif
277
278 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Produces one float4 output cell by interpolating a 4x4 neighbourhood of the source.
 * Unlike the 8-bit variants, the result is neither rounded nor clamped.
 *
 * @param yp0..yp3 The four source rows to sample.
 * @param xf       Source-space column coordinate of the output cell.
 * @param yf       Fractional position passed to the vertical interpolation.
 * @param width    Number of cells in each source row.
 */
static float4 OneBiCubic(const float4* yp0, const float4* yp1, const float4* yp2,
                         const float4* yp3, float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto filterRow = [=](const float4* row) -> float4 {
        return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], fraction);
    };

    // Vertical pass over the four horizontally filtered rows.
    return cubicInterpolate(filterRow(yp0), filterRow(yp1), filterRow(yp2), filterRow(yp3), yf);
}
301
/**
 * Produces one float2 output cell by interpolating a 4x4 neighbourhood of the source.
 * Unlike the 8-bit variants, the result is neither rounded nor clamped.
 *
 * @param yp0..yp3 The four source rows to sample.
 * @param xf       Source-space column coordinate of the output cell.
 * @param yf       Fractional position passed to the vertical interpolation.
 * @param width    Number of cells in each source row.
 */
static float2 OneBiCubic(const float2* yp0, const float2* yp1, const float2* yp2,
                         const float2* yp3, float xf, float yf, int width) {
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single source row.
    const auto filterRow = [=](const float2* row) -> float2 {
        return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], fraction);
    };

    // Vertical pass over the four horizontally filtered rows.
    return cubicInterpolate(filterRow(yp0), filterRow(yp1), filterRow(yp2), filterRow(yp3), yf);
}
324
OneBiCubic(const float * yp0,const float * yp1,const float * yp2,const float * yp3,float xf,float yf,int width)325 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
326 float xf, float yf, int width) {
327 int startx = (int) floor(xf - 1);
328 xf = xf - floor(xf);
329 int maxx = width - 1;
330 int xs0 = std::max(0, startx + 0);
331 int xs1 = std::max(0, startx + 1);
332 int xs2 = std::min(maxx, startx + 2);
333 int xs3 = std::min(maxx, startx + 3);
334
335 float p0 = cubicInterpolate(yp0[xs0], yp0[xs1],
336 yp0[xs2], yp0[xs3], xf);
337 float p1 = cubicInterpolate(yp1[xs0], yp1[xs1],
338 yp1[xs2], yp1[xs3], xf);
339 float p2 = cubicInterpolate(yp2[xs0], yp2[xs1],
340 yp2[xs2], yp2[xs3], xf);
341 float p3 = cubicInterpolate(yp3[xs0], yp3[xs1],
342 yp3[xs2], yp3[xs3], xf);
343
344 float p = cubicInterpolate(p0, p1, p2, p3, yf);
345 return p;
346 }
347 #endif
348
kernelU4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)349 void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
350 const uchar *pin = mIn;
351 const int srcHeight = mInputSizeY;
352 const int srcWidth = mInputSizeX;
353 const size_t stride = mInputSizeX * paddedSize(mVectorSize);
354
355
356 #if defined(ARCH_X86_HAVE_AVX2)
357 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
358 _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
359 #else
360 float yf = (currentY + 0.5f) * mScaleY - 0.5f;
361 #endif
362
363
364 int starty = (int) floor(yf - 1);
365 yf = yf - floor(yf);
366 int maxy = srcHeight - 1;
367 int ys0 = std::max(0, starty + 0);
368 int ys1 = std::max(0, starty + 1);
369 int ys2 = std::min(maxy, starty + 2);
370 int ys3 = std::min(maxy, starty + 3);
371
372 const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
373 const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
374 const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
375 const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
376
377 uchar4 *out = ((uchar4 *)outPtr);
378 uint32_t x1 = xstart;
379 uint32_t x2 = xend;
380
381 #if defined(ARCH_ARM_USE_INTRINSICS)
382 if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
383 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
384 long xf16 = rint(xf * 0x10000);
385 uint32_t xinc16 = rint(mScaleX * 0x10000);
386
387 int xoff = (xf16 >> 16) - 1;
388 int xclip = std::max(0, xoff) - xoff;
389 int len = x2 - x1;
390
391 int32_t yr[4];
392 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
393 mkYCoeff(yr, yf);
394
395 xoff += xclip;
396
397 rsdIntrinsicResizeB4_K(
398 out, len,
399 xf16 & 0xffff, xinc16,
400 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
401 xclip, srcWidth - xoff + xclip,
402 osc_ctl, yr);
403 out += len;
404 x1 += len;
405 }
406 #endif
407
408 while(x1 < x2) {
409 #if defined(ARCH_X86_HAVE_AVX2)
410 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
411 _mm_set1_ps(0.5f)));
412 #else
413 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
414 #endif
415 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
416 out++;
417 x1++;
418 }
419 }
420
kernelU2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)421 void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
422 const uchar *pin = mIn;
423 const int srcHeight = mInputSizeY;
424 const int srcWidth = mInputSizeX;
425 const size_t stride = mInputSizeX * mVectorSize;
426
427
428 #if defined(ARCH_X86_HAVE_AVX2)
429 float yf = _mm_cvtss_f32(
430 _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
431 #else
432 float yf = (currentY + 0.5f) * mScaleY - 0.5f;
433 #endif
434
435 int starty = (int) floor(yf - 1);
436 yf = yf - floor(yf);
437 int maxy = srcHeight - 1;
438 int ys0 = std::max(0, starty + 0);
439 int ys1 = std::max(0, starty + 1);
440 int ys2 = std::min(maxy, starty + 2);
441 int ys3 = std::min(maxy, starty + 3);
442
443 const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
444 const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
445 const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
446 const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
447
448 uchar2 *out = ((uchar2 *)outPtr);
449 uint32_t x1 = xstart;
450 uint32_t x2 = xend;
451
452 #if defined(ARCH_ARM_USE_INTRINSICS)
453 if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
454 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
455 long xf16 = rint(xf * 0x10000);
456 uint32_t xinc16 = rint(mScaleX * 0x10000);
457
458 int xoff = (xf16 >> 16) - 1;
459 int xclip = std::max(0, xoff) - xoff;
460 int len = x2 - x1;
461
462 int32_t yr[4];
463 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
464 mkYCoeff(yr, yf);
465
466 xoff += xclip;
467
468 rsdIntrinsicResizeB2_K(
469 out, len,
470 xf16 & 0xffff, xinc16,
471 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
472 xclip, srcWidth - xoff + xclip,
473 osc_ctl, yr);
474 out += len;
475 x1 += len;
476 }
477 #endif
478
479 while(x1 < x2) {
480
481 #if defined(ARCH_X86_HAVE_AVX2)
482 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
483 _mm_set1_ps(0.5f)));
484 #else
485 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
486 #endif
487 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
488 out++;
489 x1++;
490 }
491 }
492
kernelU1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)493 void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
494 //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
495 const uchar *pin = mIn;
496 const int srcHeight = mInputSizeY;
497 const int srcWidth = mInputSizeX;
498 const size_t stride = mInputSizeX * mVectorSize;
499
500 // ALOGI("Toolkit ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
501 // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);
502
503 #if defined(ARCH_X86_HAVE_AVX2)
504 float yf = _mm_cvtss_f32(
505 _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
506 #else
507 float yf = (currentY + 0.5f) * mScaleY - 0.5f;
508 #endif
509
510 int starty = (int) floor(yf - 1);
511 yf = yf - floor(yf);
512 int maxy = srcHeight - 1;
513 int ys0 = std::max(0, starty + 0);
514 int ys1 = std::min(maxy, std::max(0, starty + 1));
515 int ys2 = std::min(maxy, starty + 2);
516 int ys3 = std::min(maxy, starty + 3);
517
518 const uchar *yp0 = pin + stride * ys0;
519 const uchar *yp1 = pin + stride * ys1;
520 const uchar *yp2 = pin + stride * ys2;
521 const uchar *yp3 = pin + stride * ys3;
522
523 uchar *out = ((uchar *)outPtr);
524 uint32_t x1 = xstart;
525 uint32_t x2 = xend;
526
527 #if defined(ARCH_ARM_USE_INTRINSICS)
528 if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
529 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
530 long xf16 = rint(xf * 0x10000);
531 uint32_t xinc16 = rint(mScaleX * 0x10000);
532
533 int xoff = (xf16 >> 16) - 1;
534 int xclip = std::max(0, xoff) - xoff;
535 int len = x2 - x1;
536
537 int32_t yr[4];
538 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
539 mkYCoeff(yr, yf);
540
541 // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
542 // xclip %d, len %d, osc_ctl %lu)",
543 // ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
544 // osc_ctl);
545 // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
546 // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
547 // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);
548
549 xoff += xclip;
550
551 rsdIntrinsicResizeB1_K(
552 out, len,
553 xf16 & 0xffff, xinc16,
554 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
555 xclip, srcWidth - xoff + xclip,
556 osc_ctl, yr);
557 out += len;
558 x1 += len;
559 }
560 #endif
561
562 while(x1 < x2) {
563
564 #if defined(ARCH_X86_HAVE_AVX2)
565 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
566 _mm_set1_ps(0.5f)));
567 #else
568 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
569 #endif
570
571 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
572 out++;
573 x1++;
574 }
575 }
576
577 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
kernelF4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)578 void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
579 const uchar *pin = mIn;
580 const int srcHeight = inputSizeY;
581 const int srcWidth = inputSizeX;
582 const size_t stride = sizeX * vectorSize;
583
584 #if defined(ARCH_X86_HAVE_AVX2)
585 float yf = _mm_cvtss_f32(
586 _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
587 #else
588 float yf = (currentY + 0.5f) * scaleY - 0.5f;
589 #endif
590
591 int starty = (int) floor(yf - 1);
592 yf = yf - floor(yf);
593 int maxy = srcHeight - 1;
594 int ys0 = std::max(0, starty + 0);
595 int ys1 = std::max(0, starty + 1);
596 int ys2 = std::min(maxy, starty + 2);
597 int ys3 = std::min(maxy, starty + 3);
598
599 const float4 *yp0 = (const float4 *)(pin + stride * ys0);
600 const float4 *yp1 = (const float4 *)(pin + stride * ys1);
601 const float4 *yp2 = (const float4 *)(pin + stride * ys2);
602 const float4 *yp3 = (const float4 *)(pin + stride * ys3);
603
604 float4 *out = ((float4 *)outPtr);
605 uint32_t x1 = xstart;
606 uint32_t x2 = xend;
607
608 while(x1 < x2) {
609
610 #if defined(ARCH_X86_HAVE_AVX2)
611 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
612 _mm_set1_ps(0.5f)));
613 #else
614 float xf = (x1 + 0.5f) * scaleX - 0.5f;
615 #endif
616
617 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
618 out++;
619 x1++;
620 }
621 }
622
kernelF2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)623 void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
624 const uchar *pin = mIn;
625 const int srcHeight = inputSizeY;
626 const int srcWidth = inputSizeX;
627 const size_t stride = sizeX * vectorSize;
628
629
630 #if defined(ARCH_X86_HAVE_AVX2)
631 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
632 _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
633 #else
634 float yf = (currentY + 0.5f) * scaleY - 0.5f;
635 #endif
636
637 int starty = (int) floor(yf - 1);
638 yf = yf - floor(yf);
639 int maxy = srcHeight - 1;
640 int ys0 = std::max(0, starty + 0);
641 int ys1 = std::max(0, starty + 1);
642 int ys2 = std::min(maxy, starty + 2);
643 int ys3 = std::min(maxy, starty + 3);
644
645 const float2 *yp0 = (const float2 *)(pin + stride * ys0);
646 const float2 *yp1 = (const float2 *)(pin + stride * ys1);
647 const float2 *yp2 = (const float2 *)(pin + stride * ys2);
648 const float2 *yp3 = (const float2 *)(pin + stride * ys3);
649
650 float2 *out = ((float2 *)outPtr);
651 uint32_t x1 = xstart;
652 uint32_t x2 = xend;
653
654 while(x1 < x2) {
655
656 #if defined(ARCH_X86_HAVE_AVX2)
657 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
658 _mm_set1_ps(0.5f)));
659 #else
660 float xf = (x1 + 0.5f) * scaleX - 0.5f;
661 #endif
662
663 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
664 out++;
665 x1++;
666 }
667 }
668
kernelF1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)669 void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
670 const uchar *pin = mIn;
671 const int srcHeight = inputSizeY;
672 const int srcWidth = inputSizeX;
673 const size_t stride = sizeX * vectorSize;
674
675
676 #if defined(ARCH_X86_HAVE_AVX2)
677 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
678 _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
679 #else
680 float yf = (currentY + 0.5f) * scaleY - 0.5f;
681 #endif
682
683 int starty = (int) floor(yf - 1);
684 yf = yf - floor(yf);
685 int maxy = srcHeight - 1;
686 int ys0 = std::max(0, starty + 0);
687 int ys1 = std::max(0, starty + 1);
688 int ys2 = std::min(maxy, starty + 2);
689 int ys3 = std::min(maxy, starty + 3);
690
691 const float *yp0 = (const float *)(pin + stride * ys0);
692 const float *yp1 = (const float *)(pin + stride * ys1);
693 const float *yp2 = (const float *)(pin + stride * ys2);
694 const float *yp3 = (const float *)(pin + stride * ys3);
695
696 float *out = ((float *)outPtr);
697 uint32_t x1 = xstart;
698 uint32_t x2 = xend;
699
700 while(x1 < x2) {
701
702 #if defined(ARCH_X86_HAVE_AVX2)
703 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
704 _mm_set1_ps(0.5f)));
705 #else
706 float xf = (x1 + 0.5f) * scaleX - 0.5f;
707 #endif
708
709 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
710 out++;
711 x1++;
712 }
713 }
714
// Selects the row kernel from the element type and vector size of mAlloc and stores it in
// mRootPtr: the U kernels for RS_TYPE_UNSIGNED_8 elements, the F kernels otherwise. Vector
// sizes 3 and 4 share the 4-wide kernel; other sizes leave mRootPtr unchanged.
//
// NOTE(review): the ResizeTask class in this file declares neither preLaunch nor the
// mAlloc / mRootPtr members used here, and the slot and sc parameters are unused. This is only
// compiled when ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT is defined; it looks like a
// leftover from an earlier version of this code -- confirm before enabling float support.
void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
{

    // Check the data type to determine whether the F or the U kernels apply.
    if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
        switch(mAlloc->getType()->getElement()->getVectorSize()) {
        case 1:
            mRootPtr = &kernelU1;
            break;
        case 2:
            mRootPtr = &kernelU2;
            break;
        case 3:
        case 4:
            mRootPtr = &kernelU4;
            break;
        }
    } else {
        switch(mAlloc->getType()->getElement()->getVectorSize()) {
        case 1:
            mRootPtr = &kernelF1;
            break;
        case 2:
            mRootPtr = &kernelF2;
            break;
        case 3:
        case 4:
            mRootPtr = &kernelF4;
            break;
        }
    }
}
747 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
748
resize(const uint8_t * input,uint8_t * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)749 void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
750 size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
751 size_t outputSizeY, const Restriction* restriction) {
752 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
753 if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
754 return;
755 }
756 if (vectorSize < 1 || vectorSize > 4) {
757 ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
758 return;
759 }
760 #endif
761
762 ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
763 outputSizeX, outputSizeY, restriction);
764 processor->doTask(&task);
765 }
766
767 } // namespace renderscript
768