1*32afb93cSXin Li /*
2*32afb93cSXin Li  * Copyright (C) 2013 The Android Open Source Project
3*32afb93cSXin Li  *
4*32afb93cSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li  * you may not use this file except in compliance with the License.
6*32afb93cSXin Li  * You may obtain a copy of the License at
7*32afb93cSXin Li  *
8*32afb93cSXin Li  *      http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li  *
10*32afb93cSXin Li  * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li  * See the License for the specific language governing permissions and
14*32afb93cSXin Li  * limitations under the License.
15*32afb93cSXin Li  */
16*32afb93cSXin Li 
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>
#include <functional>
#include <vector>

#include "RenderScriptToolkit.h"
#include "TaskProcessor.h"
#include "Utils.h"
23*32afb93cSXin Li 
24*32afb93cSXin Li #define LOG_TAG "renderscript.toolkit.Histogram"
25*32afb93cSXin Li 
26*32afb93cSXin Li namespace renderscript {
27*32afb93cSXin Li 
28*32afb93cSXin Li class HistogramTask : public Task {
29*32afb93cSXin Li     const uchar* mIn;
30*32afb93cSXin Li     std::vector<int> mSums;
31*32afb93cSXin Li     uint32_t mThreadCount;
32*32afb93cSXin Li 
33*32afb93cSXin Li     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
34*32afb93cSXin Li     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
35*32afb93cSXin Li                      size_t endY) override;
36*32afb93cSXin Li 
37*32afb93cSXin Li     void kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
38*32afb93cSXin Li     void kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
39*32afb93cSXin Li     void kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
40*32afb93cSXin Li     void kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
41*32afb93cSXin Li 
42*32afb93cSXin Li    public:
43*32afb93cSXin Li     HistogramTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
44*32afb93cSXin Li                   uint32_t threadCount, const Restriction* restriction);
45*32afb93cSXin Li     void collateSums(int* out);
46*32afb93cSXin Li };
47*32afb93cSXin Li 
48*32afb93cSXin Li class HistogramDotTask : public Task {
49*32afb93cSXin Li     const uchar* mIn;
50*32afb93cSXin Li     float mDot[4];
51*32afb93cSXin Li     int mDotI[4];
52*32afb93cSXin Li     std::vector<int> mSums;
53*32afb93cSXin Li     uint32_t mThreadCount;
54*32afb93cSXin Li 
55*32afb93cSXin Li     void kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
56*32afb93cSXin Li     void kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
57*32afb93cSXin Li     void kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
58*32afb93cSXin Li     void kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
59*32afb93cSXin Li 
60*32afb93cSXin Li    public:
61*32afb93cSXin Li     HistogramDotTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
62*32afb93cSXin Li                      uint32_t threadCount, const float* coefficients,
63*32afb93cSXin Li                      const Restriction* restriction);
64*32afb93cSXin Li     void collateSums(int* out);
65*32afb93cSXin Li 
66*32afb93cSXin Li     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
67*32afb93cSXin Li                      size_t endY) override;
68*32afb93cSXin Li };
69*32afb93cSXin Li 
HistogramTask(const uchar * in,size_t sizeX,size_t sizeY,size_t vectorSize,uint32_t threadCount,const Restriction * restriction)70*32afb93cSXin Li HistogramTask::HistogramTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
71*32afb93cSXin Li                              uint32_t threadCount, const Restriction* restriction)
72*32afb93cSXin Li     : Task{sizeX, sizeY, vectorSize, true, restriction},
73*32afb93cSXin Li       mIn{in},
74*32afb93cSXin Li       mSums(256 * paddedSize(vectorSize) * threadCount) {
75*32afb93cSXin Li     mThreadCount = threadCount;
76*32afb93cSXin Li }
77*32afb93cSXin Li 
processData(int threadIndex,size_t startX,size_t startY,size_t endX,size_t endY)78*32afb93cSXin Li void HistogramTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
79*32afb93cSXin Li                                 size_t endY) {
80*32afb93cSXin Li     typedef void (HistogramTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
81*32afb93cSXin Li 
82*32afb93cSXin Li     KernelFunction kernel;
83*32afb93cSXin Li     switch (mVectorSize) {
84*32afb93cSXin Li         case 4:
85*32afb93cSXin Li             kernel = &HistogramTask::kernelP1U4;
86*32afb93cSXin Li             break;
87*32afb93cSXin Li         case 3:
88*32afb93cSXin Li             kernel = &HistogramTask::kernelP1U3;
89*32afb93cSXin Li             break;
90*32afb93cSXin Li         case 2:
91*32afb93cSXin Li             kernel = &HistogramTask::kernelP1U2;
92*32afb93cSXin Li             break;
93*32afb93cSXin Li         case 1:
94*32afb93cSXin Li             kernel = &HistogramTask::kernelP1U1;
95*32afb93cSXin Li             break;
96*32afb93cSXin Li         default:
97*32afb93cSXin Li             ALOGE("Bad vector size %zd", mVectorSize);
98*32afb93cSXin Li             return;
99*32afb93cSXin Li     }
100*32afb93cSXin Li 
101*32afb93cSXin Li     int* sums = &mSums[256 * paddedSize(mVectorSize) * threadIndex];
102*32afb93cSXin Li 
103*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
104*32afb93cSXin Li         const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize);
105*32afb93cSXin Li         std::invoke(kernel, this, inPtr, sums, startX, endX);
106*32afb93cSXin Li     }
107*32afb93cSXin Li }
108*32afb93cSXin Li 
kernelP1U4(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)109*32afb93cSXin Li void HistogramTask::kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
110*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
111*32afb93cSXin Li         sums[(in[0] << 2)]++;
112*32afb93cSXin Li         sums[(in[1] << 2) + 1]++;
113*32afb93cSXin Li         sums[(in[2] << 2) + 2]++;
114*32afb93cSXin Li         sums[(in[3] << 2) + 3]++;
115*32afb93cSXin Li         in += 4;
116*32afb93cSXin Li     }
117*32afb93cSXin Li }
118*32afb93cSXin Li 
kernelP1U3(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)119*32afb93cSXin Li void HistogramTask::kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
120*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
121*32afb93cSXin Li         sums[(in[0] << 2)]++;
122*32afb93cSXin Li         sums[(in[1] << 2) + 1]++;
123*32afb93cSXin Li         sums[(in[2] << 2) + 2]++;
124*32afb93cSXin Li         in += 4;
125*32afb93cSXin Li     }
126*32afb93cSXin Li }
127*32afb93cSXin Li 
kernelP1U2(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)128*32afb93cSXin Li void HistogramTask::kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
129*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
130*32afb93cSXin Li         sums[(in[0] << 1)]++;
131*32afb93cSXin Li         sums[(in[1] << 1) + 1]++;
132*32afb93cSXin Li         in += 2;
133*32afb93cSXin Li     }
134*32afb93cSXin Li }
135*32afb93cSXin Li 
kernelP1U1(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)136*32afb93cSXin Li void HistogramTask::kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
137*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
138*32afb93cSXin Li         sums[in[0]]++;
139*32afb93cSXin Li         in++;
140*32afb93cSXin Li     }
141*32afb93cSXin Li }
142*32afb93cSXin Li 
collateSums(int * out)143*32afb93cSXin Li void HistogramTask::collateSums(int* out) {
144*32afb93cSXin Li     for (uint32_t ct = 0; ct < (256 * paddedSize(mVectorSize)); ct++) {
145*32afb93cSXin Li         out[ct] = mSums[ct];
146*32afb93cSXin Li         for (uint32_t t = 1; t < mThreadCount; t++) {
147*32afb93cSXin Li             out[ct] += mSums[ct + (256 * paddedSize(mVectorSize) * t)];
148*32afb93cSXin Li         }
149*32afb93cSXin Li     }
150*32afb93cSXin Li }
151*32afb93cSXin Li 
HistogramDotTask(const uchar * in,size_t sizeX,size_t sizeY,size_t vectorSize,uint32_t threadCount,const float * coefficients,const Restriction * restriction)152*32afb93cSXin Li HistogramDotTask::HistogramDotTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
153*32afb93cSXin Li                                    uint32_t threadCount, const float* coefficients,
154*32afb93cSXin Li                                    const Restriction* restriction)
155*32afb93cSXin Li     : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * threadCount, 0) {
156*32afb93cSXin Li     mThreadCount = threadCount;
157*32afb93cSXin Li 
158*32afb93cSXin Li     if (coefficients == nullptr) {
159*32afb93cSXin Li         mDot[0] = 0.299f;
160*32afb93cSXin Li         mDot[1] = 0.587f;
161*32afb93cSXin Li         mDot[2] = 0.114f;
162*32afb93cSXin Li         mDot[3] = 0;
163*32afb93cSXin Li     } else {
164*32afb93cSXin Li         memcpy(mDot, coefficients, 16);
165*32afb93cSXin Li     }
166*32afb93cSXin Li     mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
167*32afb93cSXin Li     mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
168*32afb93cSXin Li     mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
169*32afb93cSXin Li     mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
170*32afb93cSXin Li }
171*32afb93cSXin Li 
processData(int threadIndex,size_t startX,size_t startY,size_t endX,size_t endY)172*32afb93cSXin Li void HistogramDotTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
173*32afb93cSXin Li                                    size_t endY) {
174*32afb93cSXin Li     typedef void (HistogramDotTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
175*32afb93cSXin Li 
176*32afb93cSXin Li     KernelFunction kernel;
177*32afb93cSXin Li     switch (mVectorSize) {
178*32afb93cSXin Li         case 4:
179*32afb93cSXin Li             kernel = &HistogramDotTask::kernelP1L4;
180*32afb93cSXin Li             break;
181*32afb93cSXin Li         case 3:
182*32afb93cSXin Li             kernel = &HistogramDotTask::kernelP1L3;
183*32afb93cSXin Li             break;
184*32afb93cSXin Li         case 2:
185*32afb93cSXin Li             kernel = &HistogramDotTask::kernelP1L2;
186*32afb93cSXin Li             break;
187*32afb93cSXin Li         case 1:
188*32afb93cSXin Li             kernel = &HistogramDotTask::kernelP1L1;
189*32afb93cSXin Li             break;
190*32afb93cSXin Li         default:
191*32afb93cSXin Li             ALOGI("Bad vector size %zd", mVectorSize);
192*32afb93cSXin Li             return;
193*32afb93cSXin Li     }
194*32afb93cSXin Li 
195*32afb93cSXin Li     int* sums = &mSums[256 * threadIndex];
196*32afb93cSXin Li 
197*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
198*32afb93cSXin Li         const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize);
199*32afb93cSXin Li         std::invoke(kernel, this, inPtr, sums, startX, endX);
200*32afb93cSXin Li     }
201*32afb93cSXin Li }
202*32afb93cSXin Li 
kernelP1L4(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)203*32afb93cSXin Li void HistogramDotTask::kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
204*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
205*32afb93cSXin Li         int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]) + (mDotI[3] * in[3]);
206*32afb93cSXin Li         sums[(t + 0x7f) >> 8]++;
207*32afb93cSXin Li         in += 4;
208*32afb93cSXin Li     }
209*32afb93cSXin Li }
210*32afb93cSXin Li 
kernelP1L3(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)211*32afb93cSXin Li void HistogramDotTask::kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
212*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
213*32afb93cSXin Li         int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]);
214*32afb93cSXin Li         sums[(t + 0x7f) >> 8]++;
215*32afb93cSXin Li         in += 4;
216*32afb93cSXin Li     }
217*32afb93cSXin Li }
218*32afb93cSXin Li 
kernelP1L2(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)219*32afb93cSXin Li void HistogramDotTask::kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
220*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
221*32afb93cSXin Li         int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]);
222*32afb93cSXin Li         sums[(t + 0x7f) >> 8]++;
223*32afb93cSXin Li         in += 2;
224*32afb93cSXin Li     }
225*32afb93cSXin Li }
226*32afb93cSXin Li 
kernelP1L1(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)227*32afb93cSXin Li void HistogramDotTask::kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
228*32afb93cSXin Li     for (uint32_t x = xstart; x < xend; x++) {
229*32afb93cSXin Li         int t = (mDotI[0] * in[0]);
230*32afb93cSXin Li         sums[(t + 0x7f) >> 8]++;
231*32afb93cSXin Li         in++;
232*32afb93cSXin Li     }
233*32afb93cSXin Li }
234*32afb93cSXin Li 
collateSums(int * out)235*32afb93cSXin Li void HistogramDotTask::collateSums(int* out) {
236*32afb93cSXin Li     for (uint32_t ct = 0; ct < 256; ct++) {
237*32afb93cSXin Li         out[ct] = mSums[ct];
238*32afb93cSXin Li         for (uint32_t t = 1; t < mThreadCount; t++) {
239*32afb93cSXin Li             out[ct] += mSums[ct + (256 * t)];
240*32afb93cSXin Li         }
241*32afb93cSXin Li     }
242*32afb93cSXin Li }
243*32afb93cSXin Li 
244*32afb93cSXin Li ////////////////////////////////////////////////////////////////////////////
245*32afb93cSXin Li 
histogram(const uint8_t * in,int32_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,const Restriction * restriction)246*32afb93cSXin Li void RenderScriptToolkit::histogram(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
247*32afb93cSXin Li                                     size_t vectorSize, const Restriction* restriction) {
248*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
249*32afb93cSXin Li     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
250*32afb93cSXin Li         return;
251*32afb93cSXin Li     }
252*32afb93cSXin Li     if (vectorSize < 1 || vectorSize > 4) {
253*32afb93cSXin Li         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
254*32afb93cSXin Li         return;
255*32afb93cSXin Li     }
256*32afb93cSXin Li #endif
257*32afb93cSXin Li 
258*32afb93cSXin Li     HistogramTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), restriction);
259*32afb93cSXin Li     processor->doTask(&task);
260*32afb93cSXin Li     task.collateSums(out);
261*32afb93cSXin Li }
262*32afb93cSXin Li 
histogramDot(const uint8_t * in,int32_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,const float * coefficients,const Restriction * restriction)263*32afb93cSXin Li void RenderScriptToolkit::histogramDot(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
264*32afb93cSXin Li                                        size_t vectorSize, const float* coefficients,
265*32afb93cSXin Li                                        const Restriction* restriction) {
266*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
267*32afb93cSXin Li     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
268*32afb93cSXin Li         return;
269*32afb93cSXin Li     }
270*32afb93cSXin Li     if (vectorSize < 1 || vectorSize > 4) {
271*32afb93cSXin Li         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
272*32afb93cSXin Li         return;
273*32afb93cSXin Li     }
274*32afb93cSXin Li     if (coefficients != nullptr) {
275*32afb93cSXin Li         float sum = 0.0f;
276*32afb93cSXin Li         for (size_t i = 0; i < vectorSize; i++) {
277*32afb93cSXin Li             if (coefficients[i] < 0.0f) {
278*32afb93cSXin Li                 ALOGE("histogramDot coefficients should not be negative. Coefficient %zu was %f.",
279*32afb93cSXin Li                       i, coefficients[i]);
280*32afb93cSXin Li                 return;
281*32afb93cSXin Li             }
282*32afb93cSXin Li             sum += coefficients[i];
283*32afb93cSXin Li         }
284*32afb93cSXin Li         if (sum > 1.0f) {
285*32afb93cSXin Li             ALOGE("histogramDot coefficients should add to 1 or less. Their sum is %f.", sum);
286*32afb93cSXin Li             return;
287*32afb93cSXin Li         }
288*32afb93cSXin Li     }
289*32afb93cSXin Li #endif
290*32afb93cSXin Li 
291*32afb93cSXin Li     HistogramDotTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(),
292*32afb93cSXin Li                           coefficients, restriction);
293*32afb93cSXin Li     processor->doTask(&task);
294*32afb93cSXin Li     task.collateSums(out);
295*32afb93cSXin Li }
296*32afb93cSXin Li 
297*32afb93cSXin Li }  // namespace renderscript
298