1*32afb93cSXin Li /*
2*32afb93cSXin Li * Copyright (C) 2013 The Android Open Source Project
3*32afb93cSXin Li *
4*32afb93cSXin Li * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li * you may not use this file except in compliance with the License.
6*32afb93cSXin Li * You may obtain a copy of the License at
7*32afb93cSXin Li *
8*32afb93cSXin Li * http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li *
10*32afb93cSXin Li * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li * See the License for the specific language governing permissions and
14*32afb93cSXin Li * limitations under the License.
15*32afb93cSXin Li */
16*32afb93cSXin Li
17*32afb93cSXin Li #include <array>
18*32afb93cSXin Li #include <cstdint>
19*32afb93cSXin Li
20*32afb93cSXin Li #include "RenderScriptToolkit.h"
21*32afb93cSXin Li #include "TaskProcessor.h"
22*32afb93cSXin Li #include "Utils.h"
23*32afb93cSXin Li
24*32afb93cSXin Li #define LOG_TAG "renderscript.toolkit.Histogram"
25*32afb93cSXin Li
26*32afb93cSXin Li namespace renderscript {
27*32afb93cSXin Li
28*32afb93cSXin Li class HistogramTask : public Task {
29*32afb93cSXin Li const uchar* mIn;
30*32afb93cSXin Li std::vector<int> mSums;
31*32afb93cSXin Li uint32_t mThreadCount;
32*32afb93cSXin Li
33*32afb93cSXin Li // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
34*32afb93cSXin Li void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
35*32afb93cSXin Li size_t endY) override;
36*32afb93cSXin Li
37*32afb93cSXin Li void kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
38*32afb93cSXin Li void kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
39*32afb93cSXin Li void kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
40*32afb93cSXin Li void kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
41*32afb93cSXin Li
42*32afb93cSXin Li public:
43*32afb93cSXin Li HistogramTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
44*32afb93cSXin Li uint32_t threadCount, const Restriction* restriction);
45*32afb93cSXin Li void collateSums(int* out);
46*32afb93cSXin Li };
47*32afb93cSXin Li
48*32afb93cSXin Li class HistogramDotTask : public Task {
49*32afb93cSXin Li const uchar* mIn;
50*32afb93cSXin Li float mDot[4];
51*32afb93cSXin Li int mDotI[4];
52*32afb93cSXin Li std::vector<int> mSums;
53*32afb93cSXin Li uint32_t mThreadCount;
54*32afb93cSXin Li
55*32afb93cSXin Li void kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
56*32afb93cSXin Li void kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
57*32afb93cSXin Li void kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
58*32afb93cSXin Li void kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
59*32afb93cSXin Li
60*32afb93cSXin Li public:
61*32afb93cSXin Li HistogramDotTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
62*32afb93cSXin Li uint32_t threadCount, const float* coefficients,
63*32afb93cSXin Li const Restriction* restriction);
64*32afb93cSXin Li void collateSums(int* out);
65*32afb93cSXin Li
66*32afb93cSXin Li void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
67*32afb93cSXin Li size_t endY) override;
68*32afb93cSXin Li };
69*32afb93cSXin Li
HistogramTask(const uchar * in,size_t sizeX,size_t sizeY,size_t vectorSize,uint32_t threadCount,const Restriction * restriction)70*32afb93cSXin Li HistogramTask::HistogramTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
71*32afb93cSXin Li uint32_t threadCount, const Restriction* restriction)
72*32afb93cSXin Li : Task{sizeX, sizeY, vectorSize, true, restriction},
73*32afb93cSXin Li mIn{in},
74*32afb93cSXin Li mSums(256 * paddedSize(vectorSize) * threadCount) {
75*32afb93cSXin Li mThreadCount = threadCount;
76*32afb93cSXin Li }
77*32afb93cSXin Li
processData(int threadIndex,size_t startX,size_t startY,size_t endX,size_t endY)78*32afb93cSXin Li void HistogramTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
79*32afb93cSXin Li size_t endY) {
80*32afb93cSXin Li typedef void (HistogramTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
81*32afb93cSXin Li
82*32afb93cSXin Li KernelFunction kernel;
83*32afb93cSXin Li switch (mVectorSize) {
84*32afb93cSXin Li case 4:
85*32afb93cSXin Li kernel = &HistogramTask::kernelP1U4;
86*32afb93cSXin Li break;
87*32afb93cSXin Li case 3:
88*32afb93cSXin Li kernel = &HistogramTask::kernelP1U3;
89*32afb93cSXin Li break;
90*32afb93cSXin Li case 2:
91*32afb93cSXin Li kernel = &HistogramTask::kernelP1U2;
92*32afb93cSXin Li break;
93*32afb93cSXin Li case 1:
94*32afb93cSXin Li kernel = &HistogramTask::kernelP1U1;
95*32afb93cSXin Li break;
96*32afb93cSXin Li default:
97*32afb93cSXin Li ALOGE("Bad vector size %zd", mVectorSize);
98*32afb93cSXin Li return;
99*32afb93cSXin Li }
100*32afb93cSXin Li
101*32afb93cSXin Li int* sums = &mSums[256 * paddedSize(mVectorSize) * threadIndex];
102*32afb93cSXin Li
103*32afb93cSXin Li for (size_t y = startY; y < endY; y++) {
104*32afb93cSXin Li const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize);
105*32afb93cSXin Li std::invoke(kernel, this, inPtr, sums, startX, endX);
106*32afb93cSXin Li }
107*32afb93cSXin Li }
108*32afb93cSXin Li
kernelP1U4(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)109*32afb93cSXin Li void HistogramTask::kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
110*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
111*32afb93cSXin Li sums[(in[0] << 2)]++;
112*32afb93cSXin Li sums[(in[1] << 2) + 1]++;
113*32afb93cSXin Li sums[(in[2] << 2) + 2]++;
114*32afb93cSXin Li sums[(in[3] << 2) + 3]++;
115*32afb93cSXin Li in += 4;
116*32afb93cSXin Li }
117*32afb93cSXin Li }
118*32afb93cSXin Li
kernelP1U3(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)119*32afb93cSXin Li void HistogramTask::kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
120*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
121*32afb93cSXin Li sums[(in[0] << 2)]++;
122*32afb93cSXin Li sums[(in[1] << 2) + 1]++;
123*32afb93cSXin Li sums[(in[2] << 2) + 2]++;
124*32afb93cSXin Li in += 4;
125*32afb93cSXin Li }
126*32afb93cSXin Li }
127*32afb93cSXin Li
kernelP1U2(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)128*32afb93cSXin Li void HistogramTask::kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
129*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
130*32afb93cSXin Li sums[(in[0] << 1)]++;
131*32afb93cSXin Li sums[(in[1] << 1) + 1]++;
132*32afb93cSXin Li in += 2;
133*32afb93cSXin Li }
134*32afb93cSXin Li }
135*32afb93cSXin Li
kernelP1U1(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)136*32afb93cSXin Li void HistogramTask::kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
137*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
138*32afb93cSXin Li sums[in[0]]++;
139*32afb93cSXin Li in++;
140*32afb93cSXin Li }
141*32afb93cSXin Li }
142*32afb93cSXin Li
collateSums(int * out)143*32afb93cSXin Li void HistogramTask::collateSums(int* out) {
144*32afb93cSXin Li for (uint32_t ct = 0; ct < (256 * paddedSize(mVectorSize)); ct++) {
145*32afb93cSXin Li out[ct] = mSums[ct];
146*32afb93cSXin Li for (uint32_t t = 1; t < mThreadCount; t++) {
147*32afb93cSXin Li out[ct] += mSums[ct + (256 * paddedSize(mVectorSize) * t)];
148*32afb93cSXin Li }
149*32afb93cSXin Li }
150*32afb93cSXin Li }
151*32afb93cSXin Li
HistogramDotTask(const uchar * in,size_t sizeX,size_t sizeY,size_t vectorSize,uint32_t threadCount,const float * coefficients,const Restriction * restriction)152*32afb93cSXin Li HistogramDotTask::HistogramDotTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
153*32afb93cSXin Li uint32_t threadCount, const float* coefficients,
154*32afb93cSXin Li const Restriction* restriction)
155*32afb93cSXin Li : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * threadCount, 0) {
156*32afb93cSXin Li mThreadCount = threadCount;
157*32afb93cSXin Li
158*32afb93cSXin Li if (coefficients == nullptr) {
159*32afb93cSXin Li mDot[0] = 0.299f;
160*32afb93cSXin Li mDot[1] = 0.587f;
161*32afb93cSXin Li mDot[2] = 0.114f;
162*32afb93cSXin Li mDot[3] = 0;
163*32afb93cSXin Li } else {
164*32afb93cSXin Li memcpy(mDot, coefficients, 16);
165*32afb93cSXin Li }
166*32afb93cSXin Li mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
167*32afb93cSXin Li mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
168*32afb93cSXin Li mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
169*32afb93cSXin Li mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
170*32afb93cSXin Li }
171*32afb93cSXin Li
processData(int threadIndex,size_t startX,size_t startY,size_t endX,size_t endY)172*32afb93cSXin Li void HistogramDotTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
173*32afb93cSXin Li size_t endY) {
174*32afb93cSXin Li typedef void (HistogramDotTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
175*32afb93cSXin Li
176*32afb93cSXin Li KernelFunction kernel;
177*32afb93cSXin Li switch (mVectorSize) {
178*32afb93cSXin Li case 4:
179*32afb93cSXin Li kernel = &HistogramDotTask::kernelP1L4;
180*32afb93cSXin Li break;
181*32afb93cSXin Li case 3:
182*32afb93cSXin Li kernel = &HistogramDotTask::kernelP1L3;
183*32afb93cSXin Li break;
184*32afb93cSXin Li case 2:
185*32afb93cSXin Li kernel = &HistogramDotTask::kernelP1L2;
186*32afb93cSXin Li break;
187*32afb93cSXin Li case 1:
188*32afb93cSXin Li kernel = &HistogramDotTask::kernelP1L1;
189*32afb93cSXin Li break;
190*32afb93cSXin Li default:
191*32afb93cSXin Li ALOGI("Bad vector size %zd", mVectorSize);
192*32afb93cSXin Li return;
193*32afb93cSXin Li }
194*32afb93cSXin Li
195*32afb93cSXin Li int* sums = &mSums[256 * threadIndex];
196*32afb93cSXin Li
197*32afb93cSXin Li for (size_t y = startY; y < endY; y++) {
198*32afb93cSXin Li const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize);
199*32afb93cSXin Li std::invoke(kernel, this, inPtr, sums, startX, endX);
200*32afb93cSXin Li }
201*32afb93cSXin Li }
202*32afb93cSXin Li
kernelP1L4(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)203*32afb93cSXin Li void HistogramDotTask::kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
204*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
205*32afb93cSXin Li int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]) + (mDotI[3] * in[3]);
206*32afb93cSXin Li sums[(t + 0x7f) >> 8]++;
207*32afb93cSXin Li in += 4;
208*32afb93cSXin Li }
209*32afb93cSXin Li }
210*32afb93cSXin Li
kernelP1L3(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)211*32afb93cSXin Li void HistogramDotTask::kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
212*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
213*32afb93cSXin Li int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]);
214*32afb93cSXin Li sums[(t + 0x7f) >> 8]++;
215*32afb93cSXin Li in += 4;
216*32afb93cSXin Li }
217*32afb93cSXin Li }
218*32afb93cSXin Li
kernelP1L2(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)219*32afb93cSXin Li void HistogramDotTask::kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
220*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
221*32afb93cSXin Li int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]);
222*32afb93cSXin Li sums[(t + 0x7f) >> 8]++;
223*32afb93cSXin Li in += 2;
224*32afb93cSXin Li }
225*32afb93cSXin Li }
226*32afb93cSXin Li
kernelP1L1(const uchar * in,int * sums,uint32_t xstart,uint32_t xend)227*32afb93cSXin Li void HistogramDotTask::kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
228*32afb93cSXin Li for (uint32_t x = xstart; x < xend; x++) {
229*32afb93cSXin Li int t = (mDotI[0] * in[0]);
230*32afb93cSXin Li sums[(t + 0x7f) >> 8]++;
231*32afb93cSXin Li in++;
232*32afb93cSXin Li }
233*32afb93cSXin Li }
234*32afb93cSXin Li
collateSums(int * out)235*32afb93cSXin Li void HistogramDotTask::collateSums(int* out) {
236*32afb93cSXin Li for (uint32_t ct = 0; ct < 256; ct++) {
237*32afb93cSXin Li out[ct] = mSums[ct];
238*32afb93cSXin Li for (uint32_t t = 1; t < mThreadCount; t++) {
239*32afb93cSXin Li out[ct] += mSums[ct + (256 * t)];
240*32afb93cSXin Li }
241*32afb93cSXin Li }
242*32afb93cSXin Li }
243*32afb93cSXin Li
244*32afb93cSXin Li ////////////////////////////////////////////////////////////////////////////
245*32afb93cSXin Li
histogram(const uint8_t * in,int32_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,const Restriction * restriction)246*32afb93cSXin Li void RenderScriptToolkit::histogram(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
247*32afb93cSXin Li size_t vectorSize, const Restriction* restriction) {
248*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
249*32afb93cSXin Li if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
250*32afb93cSXin Li return;
251*32afb93cSXin Li }
252*32afb93cSXin Li if (vectorSize < 1 || vectorSize > 4) {
253*32afb93cSXin Li ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
254*32afb93cSXin Li return;
255*32afb93cSXin Li }
256*32afb93cSXin Li #endif
257*32afb93cSXin Li
258*32afb93cSXin Li HistogramTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), restriction);
259*32afb93cSXin Li processor->doTask(&task);
260*32afb93cSXin Li task.collateSums(out);
261*32afb93cSXin Li }
262*32afb93cSXin Li
histogramDot(const uint8_t * in,int32_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,const float * coefficients,const Restriction * restriction)263*32afb93cSXin Li void RenderScriptToolkit::histogramDot(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
264*32afb93cSXin Li size_t vectorSize, const float* coefficients,
265*32afb93cSXin Li const Restriction* restriction) {
266*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
267*32afb93cSXin Li if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
268*32afb93cSXin Li return;
269*32afb93cSXin Li }
270*32afb93cSXin Li if (vectorSize < 1 || vectorSize > 4) {
271*32afb93cSXin Li ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
272*32afb93cSXin Li return;
273*32afb93cSXin Li }
274*32afb93cSXin Li if (coefficients != nullptr) {
275*32afb93cSXin Li float sum = 0.0f;
276*32afb93cSXin Li for (size_t i = 0; i < vectorSize; i++) {
277*32afb93cSXin Li if (coefficients[i] < 0.0f) {
278*32afb93cSXin Li ALOGE("histogramDot coefficients should not be negative. Coefficient %zu was %f.",
279*32afb93cSXin Li i, coefficients[i]);
280*32afb93cSXin Li return;
281*32afb93cSXin Li }
282*32afb93cSXin Li sum += coefficients[i];
283*32afb93cSXin Li }
284*32afb93cSXin Li if (sum > 1.0f) {
285*32afb93cSXin Li ALOGE("histogramDot coefficients should add to 1 or less. Their sum is %f.", sum);
286*32afb93cSXin Li return;
287*32afb93cSXin Li }
288*32afb93cSXin Li }
289*32afb93cSXin Li #endif
290*32afb93cSXin Li
291*32afb93cSXin Li HistogramDotTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(),
292*32afb93cSXin Li coefficients, restriction);
293*32afb93cSXin Li processor->doTask(&task);
294*32afb93cSXin Li task.collateSums(out);
295*32afb93cSXin Li }
296*32afb93cSXin Li
297*32afb93cSXin Li } // namespace renderscript
298