xref: /aosp_15_r20/external/libgav1/src/threading_strategy.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop //      http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop 
15*09537850SAkhilesh Sanikop #include "src/threading_strategy.h"
16*09537850SAkhilesh Sanikop 
17*09537850SAkhilesh Sanikop #include <algorithm>
18*09537850SAkhilesh Sanikop #include <cassert>
19*09537850SAkhilesh Sanikop #include <memory>
20*09537850SAkhilesh Sanikop 
21*09537850SAkhilesh Sanikop #include "src/frame_scratch_buffer.h"
22*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
23*09537850SAkhilesh Sanikop #include "src/utils/logging.h"
24*09537850SAkhilesh Sanikop #include "src/utils/vector.h"
25*09537850SAkhilesh Sanikop 
26*09537850SAkhilesh Sanikop namespace libgav1 {
27*09537850SAkhilesh Sanikop namespace {
28*09537850SAkhilesh Sanikop 
29*09537850SAkhilesh Sanikop #if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
30*09537850SAkhilesh Sanikop constexpr int kFrameParallelThresholdMultiplier = 3;
31*09537850SAkhilesh Sanikop #else
32*09537850SAkhilesh Sanikop constexpr int kFrameParallelThresholdMultiplier =
33*09537850SAkhilesh Sanikop     LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
34*09537850SAkhilesh Sanikop #endif
35*09537850SAkhilesh Sanikop 
36*09537850SAkhilesh Sanikop // Computes the number of frame threads to be used based on the following
37*09537850SAkhilesh Sanikop // heuristic:
38*09537850SAkhilesh Sanikop //   * If |thread_count| == 1, return 0.
39*09537850SAkhilesh Sanikop //   * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
40*09537850SAkhilesh Sanikop //     return 0.
41*09537850SAkhilesh Sanikop //   * Otherwise, return the largest value of i which satisfies the following
42*09537850SAkhilesh Sanikop //     condition: i + i * tile_columns <= thread_count. This ensures that there
43*09537850SAkhilesh Sanikop //     are at least |tile_columns| worker threads for each frame thread.
44*09537850SAkhilesh Sanikop //   * This function will never return 1 or a value > |thread_count|.
45*09537850SAkhilesh Sanikop //
46*09537850SAkhilesh Sanikop //  This heuristic is based on empirical performance data. The in-frame
47*09537850SAkhilesh Sanikop //  threading model (combination of tile multithreading, superblock row
48*09537850SAkhilesh Sanikop //  multithreading and post filter multithreading) performs better than the
49*09537850SAkhilesh Sanikop //  frame parallel model until we reach the threshold of |thread_count| >
50*09537850SAkhilesh Sanikop //  |tile_count| * kFrameParallelThresholdMultiplier.
51*09537850SAkhilesh Sanikop //
52*09537850SAkhilesh Sanikop //  It is a function of |tile_count| since tile threading and superblock row
53*09537850SAkhilesh Sanikop //  multithreading will scale only as a factor of |tile_count|. The threshold
54*09537850SAkhilesh Sanikop //  kFrameParallelThresholdMultiplier is arrived at based on empirical data.
55*09537850SAkhilesh Sanikop //  The general idea is that superblock row multithreading plateaus at 4 *
56*09537850SAkhilesh Sanikop //  |tile_count| because in most practical cases there aren't more than that
57*09537850SAkhilesh Sanikop //  many superblock rows and columns available to work on in parallel.
ComputeFrameThreadCount(int thread_count,int tile_count,int tile_columns)58*09537850SAkhilesh Sanikop int ComputeFrameThreadCount(int thread_count, int tile_count,
59*09537850SAkhilesh Sanikop                             int tile_columns) {
60*09537850SAkhilesh Sanikop   assert(thread_count > 0);
61*09537850SAkhilesh Sanikop   if (thread_count == 1) return 0;
62*09537850SAkhilesh Sanikop   return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
63*09537850SAkhilesh Sanikop              ? 0
64*09537850SAkhilesh Sanikop              : std::max(2, thread_count / (1 + tile_columns));
65*09537850SAkhilesh Sanikop }
66*09537850SAkhilesh Sanikop 
67*09537850SAkhilesh Sanikop }  // namespace
68*09537850SAkhilesh Sanikop 
Reset(const ObuFrameHeader & frame_header,int thread_count)69*09537850SAkhilesh Sanikop bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
70*09537850SAkhilesh Sanikop                               int thread_count) {
71*09537850SAkhilesh Sanikop   assert(thread_count > 0);
72*09537850SAkhilesh Sanikop   frame_parallel_ = false;
73*09537850SAkhilesh Sanikop 
74*09537850SAkhilesh Sanikop   if (thread_count == 1) {
75*09537850SAkhilesh Sanikop     thread_pool_.reset(nullptr);
76*09537850SAkhilesh Sanikop     tile_thread_count_ = 0;
77*09537850SAkhilesh Sanikop     max_tile_index_for_row_threads_ = 0;
78*09537850SAkhilesh Sanikop     return true;
79*09537850SAkhilesh Sanikop   }
80*09537850SAkhilesh Sanikop 
81*09537850SAkhilesh Sanikop   // We do work in the current thread, so it is sufficient to create
82*09537850SAkhilesh Sanikop   // |thread_count|-1 threads in the threadpool.
83*09537850SAkhilesh Sanikop   thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
84*09537850SAkhilesh Sanikop 
85*09537850SAkhilesh Sanikop   if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
86*09537850SAkhilesh Sanikop     thread_pool_ = ThreadPool::Create("libgav1", thread_count);
87*09537850SAkhilesh Sanikop     if (thread_pool_ == nullptr) {
88*09537850SAkhilesh Sanikop       LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
89*09537850SAkhilesh Sanikop                    thread_count);
90*09537850SAkhilesh Sanikop       tile_thread_count_ = 0;
91*09537850SAkhilesh Sanikop       max_tile_index_for_row_threads_ = 0;
92*09537850SAkhilesh Sanikop       return false;
93*09537850SAkhilesh Sanikop     }
94*09537850SAkhilesh Sanikop   }
95*09537850SAkhilesh Sanikop 
96*09537850SAkhilesh Sanikop   // Prefer tile threads first (but only if there is more than one tile).
97*09537850SAkhilesh Sanikop   const int tile_count = frame_header.tile_info.tile_count;
98*09537850SAkhilesh Sanikop   if (tile_count > 1) {
99*09537850SAkhilesh Sanikop     // We want 1 + tile_thread_count_ <= tile_count because the current thread
100*09537850SAkhilesh Sanikop     // is also used to decode tiles. This is equivalent to
101*09537850SAkhilesh Sanikop     // tile_thread_count_ <= tile_count - 1.
102*09537850SAkhilesh Sanikop     tile_thread_count_ = std::min(thread_count, tile_count - 1);
103*09537850SAkhilesh Sanikop     thread_count -= tile_thread_count_;
104*09537850SAkhilesh Sanikop     if (thread_count == 0) {
105*09537850SAkhilesh Sanikop       max_tile_index_for_row_threads_ = 0;
106*09537850SAkhilesh Sanikop       return true;
107*09537850SAkhilesh Sanikop     }
108*09537850SAkhilesh Sanikop   } else {
109*09537850SAkhilesh Sanikop     tile_thread_count_ = 0;
110*09537850SAkhilesh Sanikop   }
111*09537850SAkhilesh Sanikop 
112*09537850SAkhilesh Sanikop #if defined(__ANDROID__)
113*09537850SAkhilesh Sanikop   // Assign the remaining threads for each Tile. The heuristic used here is that
114*09537850SAkhilesh Sanikop   // we will assign two threads for each Tile. So for example, if |thread_count|
115*09537850SAkhilesh Sanikop   // is 2, for a stream with 2 tiles the first tile would get both the threads
116*09537850SAkhilesh Sanikop   // and the second tile would have row multi-threading turned off. This
117*09537850SAkhilesh Sanikop   // heuristic is based on the fact that row multi-threading is fast enough only
118*09537850SAkhilesh Sanikop   // when there are at least two threads to do the decoding (since one thread
119*09537850SAkhilesh Sanikop   // always does the parsing).
120*09537850SAkhilesh Sanikop   //
121*09537850SAkhilesh Sanikop   // This heuristic might stop working when SIMD optimizations make the decoding
122*09537850SAkhilesh Sanikop   // much faster and the parsing thread is only as fast as the decoding threads.
123*09537850SAkhilesh Sanikop   // So we will have to revisit this later to make sure that this is still
124*09537850SAkhilesh Sanikop   // optimal.
125*09537850SAkhilesh Sanikop   //
126*09537850SAkhilesh Sanikop   // Note that while this heuristic significantly improves performance on high
127*09537850SAkhilesh Sanikop   // end devices (like the Pixel 3), there are some performance regressions in
128*09537850SAkhilesh Sanikop   // some lower end devices (in some cases) and that needs to be revisited as we
129*09537850SAkhilesh Sanikop   // bring in more optimizations. Overall, the gains because of this heuristic
130*09537850SAkhilesh Sanikop   // seems to be much larger than the regressions.
131*09537850SAkhilesh Sanikop   for (int i = 0; i < tile_count; ++i) {
132*09537850SAkhilesh Sanikop     max_tile_index_for_row_threads_ = i + 1;
133*09537850SAkhilesh Sanikop     thread_count -= 2;
134*09537850SAkhilesh Sanikop     if (thread_count <= 0) break;
135*09537850SAkhilesh Sanikop   }
136*09537850SAkhilesh Sanikop #else   // !defined(__ANDROID__)
137*09537850SAkhilesh Sanikop   // Assign the remaining threads to each Tile.
138*09537850SAkhilesh Sanikop   for (int i = 0; i < tile_count; ++i) {
139*09537850SAkhilesh Sanikop     const int count = thread_count / tile_count +
140*09537850SAkhilesh Sanikop                       static_cast<int>(i < thread_count % tile_count);
141*09537850SAkhilesh Sanikop     if (count == 0) {
142*09537850SAkhilesh Sanikop       // Once we see a 0 value, all subsequent values will be 0 since it is
143*09537850SAkhilesh Sanikop       // supposed to be assigned in a round-robin fashion.
144*09537850SAkhilesh Sanikop       break;
145*09537850SAkhilesh Sanikop     }
146*09537850SAkhilesh Sanikop     max_tile_index_for_row_threads_ = i + 1;
147*09537850SAkhilesh Sanikop   }
148*09537850SAkhilesh Sanikop #endif  // defined(__ANDROID__)
149*09537850SAkhilesh Sanikop   return true;
150*09537850SAkhilesh Sanikop }
151*09537850SAkhilesh Sanikop 
Reset(int thread_count)152*09537850SAkhilesh Sanikop bool ThreadingStrategy::Reset(int thread_count) {
153*09537850SAkhilesh Sanikop   assert(thread_count > 0);
154*09537850SAkhilesh Sanikop   frame_parallel_ = true;
155*09537850SAkhilesh Sanikop 
156*09537850SAkhilesh Sanikop   // In frame parallel mode, we simply access the underlying |thread_pool_|
157*09537850SAkhilesh Sanikop   // directly. So ensure all the other threadpool getter functions return
158*09537850SAkhilesh Sanikop   // nullptr. Also, superblock row multithreading is always disabled in frame
159*09537850SAkhilesh Sanikop   // parallel mode.
160*09537850SAkhilesh Sanikop   tile_thread_count_ = 0;
161*09537850SAkhilesh Sanikop   max_tile_index_for_row_threads_ = 0;
162*09537850SAkhilesh Sanikop 
163*09537850SAkhilesh Sanikop   if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
164*09537850SAkhilesh Sanikop     thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
165*09537850SAkhilesh Sanikop     if (thread_pool_ == nullptr) {
166*09537850SAkhilesh Sanikop       LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
167*09537850SAkhilesh Sanikop                    thread_count);
168*09537850SAkhilesh Sanikop       return false;
169*09537850SAkhilesh Sanikop     }
170*09537850SAkhilesh Sanikop   }
171*09537850SAkhilesh Sanikop   return true;
172*09537850SAkhilesh Sanikop }
173*09537850SAkhilesh Sanikop 
InitializeThreadPoolsForFrameParallel(int thread_count,int tile_count,int tile_columns,std::unique_ptr<ThreadPool> * const frame_thread_pool,FrameScratchBufferPool * const frame_scratch_buffer_pool)174*09537850SAkhilesh Sanikop bool InitializeThreadPoolsForFrameParallel(
175*09537850SAkhilesh Sanikop     int thread_count, int tile_count, int tile_columns,
176*09537850SAkhilesh Sanikop     std::unique_ptr<ThreadPool>* const frame_thread_pool,
177*09537850SAkhilesh Sanikop     FrameScratchBufferPool* const frame_scratch_buffer_pool) {
178*09537850SAkhilesh Sanikop   assert(*frame_thread_pool == nullptr);
179*09537850SAkhilesh Sanikop   thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
180*09537850SAkhilesh Sanikop   const int frame_threads =
181*09537850SAkhilesh Sanikop       ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
182*09537850SAkhilesh Sanikop   if (frame_threads == 0) return true;
183*09537850SAkhilesh Sanikop   *frame_thread_pool = ThreadPool::Create(frame_threads);
184*09537850SAkhilesh Sanikop   if (*frame_thread_pool == nullptr) {
185*09537850SAkhilesh Sanikop     LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
186*09537850SAkhilesh Sanikop                  frame_threads);
187*09537850SAkhilesh Sanikop     return false;
188*09537850SAkhilesh Sanikop   }
189*09537850SAkhilesh Sanikop   int remaining_threads = thread_count - frame_threads;
190*09537850SAkhilesh Sanikop   if (remaining_threads == 0) return true;
191*09537850SAkhilesh Sanikop   int threads_per_frame = remaining_threads / frame_threads;
192*09537850SAkhilesh Sanikop   const int extra_threads = remaining_threads % frame_threads;
193*09537850SAkhilesh Sanikop   Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
194*09537850SAkhilesh Sanikop   if (!frame_scratch_buffers.reserve(frame_threads)) return false;
195*09537850SAkhilesh Sanikop   // Create the tile thread pools.
196*09537850SAkhilesh Sanikop   for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
197*09537850SAkhilesh Sanikop     std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
198*09537850SAkhilesh Sanikop         frame_scratch_buffer_pool->Get();
199*09537850SAkhilesh Sanikop     if (frame_scratch_buffer == nullptr) {
200*09537850SAkhilesh Sanikop       return false;
201*09537850SAkhilesh Sanikop     }
202*09537850SAkhilesh Sanikop     // If the number of tile threads cannot be divided equally amongst all the
203*09537850SAkhilesh Sanikop     // frame threads, assign one extra thread to the first |extra_threads| frame
204*09537850SAkhilesh Sanikop     // threads.
205*09537850SAkhilesh Sanikop     const int current_frame_thread_count =
206*09537850SAkhilesh Sanikop         threads_per_frame + static_cast<int>(i < extra_threads);
207*09537850SAkhilesh Sanikop     if (!frame_scratch_buffer->threading_strategy.Reset(
208*09537850SAkhilesh Sanikop             current_frame_thread_count)) {
209*09537850SAkhilesh Sanikop       return false;
210*09537850SAkhilesh Sanikop     }
211*09537850SAkhilesh Sanikop     remaining_threads -= current_frame_thread_count;
212*09537850SAkhilesh Sanikop     frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
213*09537850SAkhilesh Sanikop   }
214*09537850SAkhilesh Sanikop   // We release the frame scratch buffers in reverse order so that the extra
215*09537850SAkhilesh Sanikop   // threads are allocated to buffers in the top of the stack.
216*09537850SAkhilesh Sanikop   for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
217*09537850SAkhilesh Sanikop        --i) {
218*09537850SAkhilesh Sanikop     frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
219*09537850SAkhilesh Sanikop   }
220*09537850SAkhilesh Sanikop   return true;
221*09537850SAkhilesh Sanikop }
222*09537850SAkhilesh Sanikop 
223*09537850SAkhilesh Sanikop }  // namespace libgav1
224