1 /*
2 * Copyright (C) 2021 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "TaskProcessor.h"
18
19 #include <cassert>
20 #include <sys/prctl.h>
21
22 #include "RenderScriptToolkit.h"
23 #include "Utils.h"
24
25 #define LOG_TAG "renderscript.toolkit.TaskProcessor"
26
27 namespace renderscript {
28
setTiling(unsigned int targetTileSizeInBytes)29 int Task::setTiling(unsigned int targetTileSizeInBytes) {
30 // Empirically, values smaller than 1000 are unlikely to give good performance.
31 targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
32 const size_t cellSizeInBytes =
33 mVectorSize; // If we add float support, vectorSize * 4 for that.
34 const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
35 assert(targetCellsPerTile > 0);
36
37 size_t cellsToProcessY;
38 size_t cellsToProcessX;
39 if (mRestriction == nullptr) {
40 cellsToProcessX = mSizeX;
41 cellsToProcessY = mSizeY;
42 } else {
43 assert(mRestriction->endX > mRestriction->startX);
44 assert(mRestriction->endY > mRestriction->startY);
45 cellsToProcessX = mRestriction->endX - mRestriction->startX;
46 cellsToProcessY = mRestriction->endY - mRestriction->startY;
47 }
48
49 // We want rows as large as possible, as the SIMD code we have is more efficient with
50 // large rows.
51 mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
52 // Once we know the number of tiles per row, we divide that row evenly. We round up to make
53 // sure all cells are included in the last tile of the row.
54 mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
55
56 // We do the same thing for the Y direction.
57 size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
58 mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
59 mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
60
61 return mTilesPerRow * mTilesPerColumn;
62 }
63
processTile(unsigned int threadIndex,size_t tileIndex)64 void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
65 // Figure out the overall boundaries.
66 size_t startWorkX;
67 size_t startWorkY;
68 size_t endWorkX;
69 size_t endWorkY;
70 if (mRestriction == nullptr) {
71 startWorkX = 0;
72 startWorkY = 0;
73 endWorkX = mSizeX;
74 endWorkY = mSizeY;
75 } else {
76 startWorkX = mRestriction->startX;
77 startWorkY = mRestriction->startY;
78 endWorkX = mRestriction->endX;
79 endWorkY = mRestriction->endY;
80 }
81 // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
82 // first the X, Y coordinate of our tile in that grid.
83 size_t tileIndexY = tileIndex / mTilesPerRow;
84 size_t tileIndexX = tileIndex % mTilesPerRow;
85 // Calculate the starting and ending point of that tile.
86 size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
87 size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
88 size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
89 size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
90
91 // Call the derived class to do the specific work.
92 if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
93 // When the tile covers entire rows, we can take advantage that some ops are not 2D.
94 processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
95 } else {
96 processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
97 }
98 }
99
TaskProcessor(unsigned int numThreads)100 TaskProcessor::TaskProcessor(unsigned int numThreads)
101 : mUsesSimd{cpuSupportsSimd()},
102 /* If the requested number of threads is 0, we'll decide based on the number of cores.
103 * Through empirical testing, we've found that using more than 6 threads does not help.
104 * There may be more optimal choices to make depending on the SoC but we'll stick to
105 * this simple heuristic for now.
106 *
107 * We'll re-use the thread that calls the processor doTask method, so we'll spawn one less
108 * worker pool thread than the total number of threads.
109 */
110 mNumberOfPoolThreads{numThreads ? numThreads - 1
111 : std::min(6u, std::thread::hardware_concurrency() - 1)} {
112 for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
113 mPoolThreads.emplace_back(
114 std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
115 }
116 }
117
~TaskProcessor()118 TaskProcessor::~TaskProcessor() {
119 {
120 std::lock_guard<std::mutex> lock(mQueueMutex);
121 mStopThreads = true;
122 mWorkAvailableOrStop.notify_all();
123 }
124
125 for (auto& thread : mPoolThreads) {
126 thread.join();
127 }
128 }
129
processTilesOfWork(int threadIndex,bool returnWhenNoWork)130 void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
131 if (threadIndex != 0) {
132 // Set the name of the thread, except for thread 0, which is not part of the pool.
133 // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
134 char name[16]{"RenderScToolkit"};
135 prctl(PR_SET_NAME, name, 0, 0, 0);
136 // ALOGI("Starting thread%d", threadIndex);
137 }
138
139 std::unique_lock<std::mutex> lock(mQueueMutex);
140 while (true) {
141 mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() /*REQUIRES(mQueueMutex)*/ {
142 return mStopThreads || (mTilesNotYetStarted > 0) ||
143 (returnWhenNoWork && (mTilesNotYetStarted == 0));
144 });
145 // ALOGI("Woke thread%d", threadIndex);
146
147 // This ScopedLockAssertion is to help the compiler when it checks thread annotations
148 // to realize that we have the lock. It's however not completely true; we don't
149 // hold the lock while processing the tile.
150 // TODO Figure out how to fix that.
151 // android::base::ScopedLockAssertion lockAssert(mQueueMutex);
152 if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
153 break;
154 }
155
156 while (mTilesNotYetStarted > 0 && !mStopThreads) {
157 // This picks the tiles in decreasing order but that does not matter.
158 int myTile = --mTilesNotYetStarted;
159 mTilesInProcess++;
160 lock.unlock();
161 {
162 // We won't be executing this code unless the main thread is
163 // holding the mTaskMutex lock, which guards mCurrentTask.
164 // The compiler can't figure this out.
165 // android::base::ScopedLockAssertion lockAssert(mTaskMutex);
166 mCurrentTask->processTile(threadIndex, myTile);
167 }
168 lock.lock();
169 mTilesInProcess--;
170 if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
171 mWorkIsFinished.notify_one();
172 }
173 }
174 }
175 // if (threadIndex != 0) {
176 // ALOGI("Ending thread%d", threadIndex);
177 // }
178 }
179
doTask(Task * task)180 void TaskProcessor::doTask(Task* task) {
181 std::lock_guard<std::mutex> lockGuard(mTaskMutex);
182 task->setUsesSimd(mUsesSimd);
183 mCurrentTask = task;
184 // Notify the thread pool of available work.
185 startWork(task);
186 // Start processing some of the tiles on the calling thread.
187 processTilesOfWork(0, true);
188 // Wait for all the pool workers to complete.
189 waitForPoolWorkersToComplete();
190 mCurrentTask = nullptr;
191 }
192
startWork(Task * task)193 void TaskProcessor::startWork(Task* task) {
194 /**
195 * The size in bytes that we're hoping each tile will be. If this value is too small,
196 * we'll spend too much time in synchronization. If it's too large, some cores may be
197 * idle while others still have a lot of work to do. Ideally, it would depend on the
198 * device we're running. 16k is the same value used by RenderScript and seems reasonable
199 * from ad-hoc tests.
200 */
201 const size_t targetTileSize = 16 * 1024;
202
203 std::lock_guard<std::mutex> lock(mQueueMutex);
204 assert(mTilesInProcess == 0);
205 mTilesNotYetStarted = task->setTiling(targetTileSize);
206 mWorkAvailableOrStop.notify_all();
207 }
208
waitForPoolWorkersToComplete()209 void TaskProcessor::waitForPoolWorkersToComplete() {
210 std::unique_lock<std::mutex> lock(mQueueMutex);
211 // The predicate, i.e. the lambda, will make sure that
212 // we terminate even if the main thread calls this after
213 // mWorkIsFinished is signaled.
214 mWorkIsFinished.wait(lock, [this]() /*REQUIRES(mQueueMutex)*/ {
215 return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
216 });
217 }
218
219 } // namespace renderscript
220