1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "TaskProcessor.h"
18 
19 #include <cassert>
20 #include <sys/prctl.h>
21 
22 #include "RenderScriptToolkit.h"
23 #include "Utils.h"
24 
25 #define LOG_TAG "renderscript.toolkit.TaskProcessor"
26 
27 namespace renderscript {
28 
setTiling(unsigned int targetTileSizeInBytes)29 int Task::setTiling(unsigned int targetTileSizeInBytes) {
30     // Empirically, values smaller than 1000 are unlikely to give good performance.
31     targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
32     const size_t cellSizeInBytes =
33             mVectorSize;  // If we add float support, vectorSize * 4 for that.
34     const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
35     assert(targetCellsPerTile > 0);
36 
37     size_t cellsToProcessY;
38     size_t cellsToProcessX;
39     if (mRestriction == nullptr) {
40         cellsToProcessX = mSizeX;
41         cellsToProcessY = mSizeY;
42     } else {
43         assert(mRestriction->endX > mRestriction->startX);
44         assert(mRestriction->endY > mRestriction->startY);
45         cellsToProcessX = mRestriction->endX - mRestriction->startX;
46         cellsToProcessY = mRestriction->endY - mRestriction->startY;
47     }
48 
49     // We want rows as large as possible, as the SIMD code we have is more efficient with
50     // large rows.
51     mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
52     // Once we know the number of tiles per row, we divide that row evenly. We round up to make
53     // sure all cells are included in the last tile of the row.
54     mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
55 
56     // We do the same thing for the Y direction.
57     size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
58     mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
59     mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
60 
61     return mTilesPerRow * mTilesPerColumn;
62 }
63 
processTile(unsigned int threadIndex,size_t tileIndex)64 void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
65     // Figure out the overall boundaries.
66     size_t startWorkX;
67     size_t startWorkY;
68     size_t endWorkX;
69     size_t endWorkY;
70     if (mRestriction == nullptr) {
71         startWorkX = 0;
72         startWorkY = 0;
73         endWorkX = mSizeX;
74         endWorkY = mSizeY;
75     } else {
76         startWorkX = mRestriction->startX;
77         startWorkY = mRestriction->startY;
78         endWorkX = mRestriction->endX;
79         endWorkY = mRestriction->endY;
80     }
81     // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
82     // first the X, Y coordinate of our tile in that grid.
83     size_t tileIndexY = tileIndex / mTilesPerRow;
84     size_t tileIndexX = tileIndex % mTilesPerRow;
85     // Calculate the starting and ending point of that tile.
86     size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
87     size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
88     size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
89     size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
90 
91     // Call the derived class to do the specific work.
92     if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
93         // When the tile covers entire rows, we can take advantage that some ops are not 2D.
94         processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
95     } else {
96         processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
97     }
98 }
99 
TaskProcessor(unsigned int numThreads)100 TaskProcessor::TaskProcessor(unsigned int numThreads)
101     : mUsesSimd{cpuSupportsSimd()},
102       /* If the requested number of threads is 0, we'll decide based on the number of cores.
103        * Through empirical testing, we've found that using more than 6 threads does not help.
104        * There may be more optimal choices to make depending on the SoC but we'll stick to
105        * this simple heuristic for now.
106        *
107        * We'll re-use the thread that calls the processor doTask method, so we'll spawn one less
108        * worker pool thread than the total number of threads.
109        */
110       mNumberOfPoolThreads{numThreads ? numThreads - 1
111                                       : std::min(6u, std::thread::hardware_concurrency() - 1)} {
112     for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
113         mPoolThreads.emplace_back(
114                 std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
115     }
116 }
117 
~TaskProcessor()118 TaskProcessor::~TaskProcessor() {
119     {
120         std::lock_guard<std::mutex> lock(mQueueMutex);
121         mStopThreads = true;
122         mWorkAvailableOrStop.notify_all();
123     }
124 
125     for (auto& thread : mPoolThreads) {
126         thread.join();
127     }
128 }
129 
processTilesOfWork(int threadIndex,bool returnWhenNoWork)130 void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
131     if (threadIndex != 0) {
132         // Set the name of the thread, except for thread 0, which is not part of the pool.
133         // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
134         char name[16]{"RenderScToolkit"};
135         prctl(PR_SET_NAME, name, 0, 0, 0);
136         // ALOGI("Starting thread%d", threadIndex);
137     }
138 
139     std::unique_lock<std::mutex> lock(mQueueMutex);
140     while (true) {
141         mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() /*REQUIRES(mQueueMutex)*/ {
142             return mStopThreads || (mTilesNotYetStarted > 0) ||
143                    (returnWhenNoWork && (mTilesNotYetStarted == 0));
144         });
145         // ALOGI("Woke thread%d", threadIndex);
146 
147         // This ScopedLockAssertion is to help the compiler when it checks thread annotations
148         // to realize that we have the lock. It's however not completely true; we don't
149         // hold the lock while processing the tile.
150         // TODO Figure out how to fix that.
151         // android::base::ScopedLockAssertion lockAssert(mQueueMutex);
152         if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
153             break;
154         }
155 
156         while (mTilesNotYetStarted > 0 && !mStopThreads) {
157             // This picks the tiles in decreasing order but that does not matter.
158             int myTile = --mTilesNotYetStarted;
159             mTilesInProcess++;
160             lock.unlock();
161             {
162                 // We won't be executing this code unless the main thread is
163                 // holding the mTaskMutex lock, which guards mCurrentTask.
164                 // The compiler can't figure this out.
165                 // android::base::ScopedLockAssertion lockAssert(mTaskMutex);
166                 mCurrentTask->processTile(threadIndex, myTile);
167             }
168             lock.lock();
169             mTilesInProcess--;
170             if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
171                 mWorkIsFinished.notify_one();
172             }
173         }
174     }
175     // if (threadIndex != 0) {
176     //     ALOGI("Ending thread%d", threadIndex);
177     // }
178 }
179 
doTask(Task * task)180 void TaskProcessor::doTask(Task* task) {
181     std::lock_guard<std::mutex> lockGuard(mTaskMutex);
182     task->setUsesSimd(mUsesSimd);
183     mCurrentTask = task;
184     // Notify the thread pool of available work.
185     startWork(task);
186     // Start processing some of the tiles on the calling thread.
187     processTilesOfWork(0, true);
188     // Wait for all the pool workers to complete.
189     waitForPoolWorkersToComplete();
190     mCurrentTask = nullptr;
191 }
192 
startWork(Task * task)193 void TaskProcessor::startWork(Task* task) {
194     /**
195      * The size in bytes that we're hoping each tile will be. If this value is too small,
196      * we'll spend too much time in synchronization. If it's too large, some cores may be
197      * idle while others still have a lot of work to do. Ideally, it would depend on the
198      * device we're running. 16k is the same value used by RenderScript and seems reasonable
199      * from ad-hoc tests.
200      */
201     const size_t targetTileSize = 16 * 1024;
202 
203     std::lock_guard<std::mutex> lock(mQueueMutex);
204     assert(mTilesInProcess == 0);
205     mTilesNotYetStarted = task->setTiling(targetTileSize);
206     mWorkAvailableOrStop.notify_all();
207 }
208 
waitForPoolWorkersToComplete()209 void TaskProcessor::waitForPoolWorkersToComplete() {
210     std::unique_lock<std::mutex> lock(mQueueMutex);
211     // The predicate, i.e. the lambda, will make sure that
212     // we terminate even if the main thread calls this after
213     // mWorkIsFinished is signaled.
214     mWorkIsFinished.wait(lock, [this]() /*REQUIRES(mQueueMutex)*/ {
215         return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
216     });
217 }
218 
219 }  // namespace renderscript
220