1*b095b053SXin Li #include <gtest/gtest.h>
2*b095b053SXin Li
3*b095b053SXin Li #include <pthreadpool.h>
4*b095b053SXin Li
5*b095b053SXin Li #include <algorithm>
6*b095b053SXin Li #include <atomic>
7*b095b053SXin Li #include <cstddef>
8*b095b053SXin Li #include <memory>
9*b095b053SXin Li
10*b095b053SXin Li
11*b095b053SXin Li typedef std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> auto_pthreadpool_t;
12*b095b053SXin Li
13*b095b053SXin Li
/*
 * Problem sizes used by the parallelize tests.
 *
 * Naming: kParallelize<Variant>Range<Dim> is an iteration range along one
 * dimension, kParallelize<Variant>Tile<Dim> is a tile size along one dimension.
 * Every value is a prime number, so no range is an exact multiple of its tile
 * (presumably chosen so that partial tiles are exercised -- confirm against
 * the tests that consume them).
 */

/* 1D */
constexpr size_t kParallelize1DRange = 1223;

/* 1D with 1D tiling */
constexpr size_t kParallelize1DTile1DRange = 1303;
constexpr size_t kParallelize1DTile1DTile = 11;

/* 2D */
constexpr size_t kParallelize2DRangeI = 41;
constexpr size_t kParallelize2DRangeJ = 43;

/* 2D with 1D tiling */
constexpr size_t kParallelize2DTile1DRangeI = 43;
constexpr size_t kParallelize2DTile1DRangeJ = 53;
constexpr size_t kParallelize2DTile1DTileJ = 5;

/* 2D with 2D tiling */
constexpr size_t kParallelize2DTile2DRangeI = 53;
constexpr size_t kParallelize2DTile2DRangeJ = 59;
constexpr size_t kParallelize2DTile2DTileI = 5;
constexpr size_t kParallelize2DTile2DTileJ = 7;

/* 3D */
constexpr size_t kParallelize3DRangeI = 13;
constexpr size_t kParallelize3DRangeJ = 17;
constexpr size_t kParallelize3DRangeK = 19;

/* 3D with 1D tiling */
constexpr size_t kParallelize3DTile1DRangeI = 17;
constexpr size_t kParallelize3DTile1DRangeJ = 19;
constexpr size_t kParallelize3DTile1DRangeK = 23;
constexpr size_t kParallelize3DTile1DTileK = 5;

/* 3D with 2D tiling */
constexpr size_t kParallelize3DTile2DRangeI = 19;
constexpr size_t kParallelize3DTile2DRangeJ = 23;
constexpr size_t kParallelize3DTile2DRangeK = 29;
constexpr size_t kParallelize3DTile2DTileJ = 2;
constexpr size_t kParallelize3DTile2DTileK = 3;

/* 4D */
constexpr size_t kParallelize4DRangeI = 11;
constexpr size_t kParallelize4DRangeJ = 13;
constexpr size_t kParallelize4DRangeK = 17;
constexpr size_t kParallelize4DRangeL = 19;

/* 4D with 1D tiling */
constexpr size_t kParallelize4DTile1DRangeI = 13;
constexpr size_t kParallelize4DTile1DRangeJ = 17;
constexpr size_t kParallelize4DTile1DRangeK = 19;
constexpr size_t kParallelize4DTile1DRangeL = 23;
constexpr size_t kParallelize4DTile1DTileL = 5;

/* 4D with 2D tiling */
constexpr size_t kParallelize4DTile2DRangeI = 17;
constexpr size_t kParallelize4DTile2DRangeJ = 19;
constexpr size_t kParallelize4DTile2DRangeK = 23;
constexpr size_t kParallelize4DTile2DRangeL = 29;
constexpr size_t kParallelize4DTile2DTileK = 2;
constexpr size_t kParallelize4DTile2DTileL = 3;

/* 5D */
constexpr size_t kParallelize5DRangeI = 7;
constexpr size_t kParallelize5DRangeJ = 11;
constexpr size_t kParallelize5DRangeK = 13;
constexpr size_t kParallelize5DRangeL = 17;
constexpr size_t kParallelize5DRangeM = 19;

/* 5D with 1D tiling */
constexpr size_t kParallelize5DTile1DRangeI = 11;
constexpr size_t kParallelize5DTile1DRangeJ = 13;
constexpr size_t kParallelize5DTile1DRangeK = 17;
constexpr size_t kParallelize5DTile1DRangeL = 19;
constexpr size_t kParallelize5DTile1DRangeM = 23;
constexpr size_t kParallelize5DTile1DTileM = 5;

/* 5D with 2D tiling */
constexpr size_t kParallelize5DTile2DRangeI = 13;
constexpr size_t kParallelize5DTile2DRangeJ = 17;
constexpr size_t kParallelize5DTile2DRangeK = 19;
constexpr size_t kParallelize5DTile2DRangeL = 23;
constexpr size_t kParallelize5DTile2DRangeM = 29;
constexpr size_t kParallelize5DTile2DTileL = 3;
constexpr size_t kParallelize5DTile2DTileM = 2;

/* 6D */
constexpr size_t kParallelize6DRangeI = 3;
constexpr size_t kParallelize6DRangeJ = 5;
constexpr size_t kParallelize6DRangeK = 7;
constexpr size_t kParallelize6DRangeL = 11;
constexpr size_t kParallelize6DRangeM = 13;
constexpr size_t kParallelize6DRangeN = 17;

/* 6D with 1D tiling */
constexpr size_t kParallelize6DTile1DRangeI = 5;
constexpr size_t kParallelize6DTile1DRangeJ = 7;
constexpr size_t kParallelize6DTile1DRangeK = 11;
constexpr size_t kParallelize6DTile1DRangeL = 13;
constexpr size_t kParallelize6DTile1DRangeM = 17;
constexpr size_t kParallelize6DTile1DRangeN = 19;
constexpr size_t kParallelize6DTile1DTileN = 5;

/* 6D with 2D tiling */
constexpr size_t kParallelize6DTile2DRangeI = 7;
constexpr size_t kParallelize6DTile2DRangeJ = 11;
constexpr size_t kParallelize6DTile2DRangeK = 13;
constexpr size_t kParallelize6DTile2DRangeL = 17;
constexpr size_t kParallelize6DTile2DRangeM = 19;
constexpr size_t kParallelize6DTile2DRangeN = 23;
constexpr size_t kParallelize6DTile2DTileM = 3;
constexpr size_t kParallelize6DTile2DTileN = 2;
92*b095b053SXin Li
/*
 * Number of times the "processed multiple times" tests repeat a parallelize
 * call. The 5D and 6D variants use smaller counts (their consumers are
 * outside this excerpt -- presumably to bound the running time of the
 * higher-dimensional tests).
 */
constexpr size_t kIncrementIterations = 101;
constexpr size_t kIncrementIterations5D = 7;
constexpr size_t kIncrementIterations6D = 3;
96*b095b053SXin Li
/*
 * Microarchitecture indices handed to the *_with_uarch parallelize calls:
 * kDefaultUArchIndex is passed as the default index and kMaxUArchIndex as the
 * maximum index. The default is deliberately larger than the maximum, which
 * lets a task callback tell "default" apart from any in-range index.
 */
constexpr uint32_t kMaxUArchIndex = 0;
constexpr uint32_t kDefaultUArchIndex = 42;
99*b095b053SXin Li
100*b095b053SXin Li
/* Passing a null pool to pthreadpool_destroy must be harmless. */
TEST(CreateAndDestroy, NullThreadPool) {
	pthreadpool* const no_pool = nullptr;
	pthreadpool_destroy(no_pool);
}
105*b095b053SXin Li
/* A pool requested with a thread count of 1 can be created and destroyed. */
TEST(CreateAndDestroy, SingleThreadPool) {
	pthreadpool* const pool = pthreadpool_create(1);
	ASSERT_TRUE(pool != nullptr);
	pthreadpool_destroy(pool);
}
111*b095b053SXin Li
/*
 * A pool requested with a thread count of 0 can be created and destroyed.
 * NOTE(review): 0 presumably asks the library to choose the number of
 * threads itself -- confirm against pthreadpool.h.
 */
TEST(CreateAndDestroy, MultiThreadPool) {
	pthreadpool* const pool = pthreadpool_create(0);
	ASSERT_TRUE(pool != nullptr);
	pthreadpool_destroy(pool);
}
117*b095b053SXin Li
/* 1D task callback that ignores both of its arguments and does no work. */
static void ComputeNothing1D(void* /* unused */, size_t /* unused */)
{
}
120*b095b053SXin Li
/*
 * Smoke test: running a no-op task over kParallelize1DRange items on a pool
 * created with pthreadpool_create(1) must return.
 */
TEST(Parallelize1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	pthreadpool_parallelize_1d(
		pool.get(),
		ComputeNothing1D,
		nullptr /* task argument */,
		kParallelize1DRange,
		0 /* flags */);
}
131*b095b053SXin Li
/*
 * Smoke test: running a no-op task over kParallelize1DRange items on a pool
 * created with pthreadpool_create(0) must return. Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	/* Without at least two threads this would not be a multi-thread test. */
	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		pool.get(),
		ComputeNothing1D,
		nullptr /* task argument */,
		kParallelize1DRange,
		0 /* flags */);
}
147*b095b053SXin Li
CheckBounds1D(void *,size_t i)148*b095b053SXin Li static void CheckBounds1D(void*, size_t i) {
149*b095b053SXin Li EXPECT_LT(i, kParallelize1DRange);
150*b095b053SXin Li }
151*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(1), every index passed to the
 * task must be below kParallelize1DRange (checked inside CheckBounds1D).
 */
TEST(Parallelize1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	pthreadpool_parallelize_1d(
		pool.get(), CheckBounds1D, nullptr /* task argument */,
		kParallelize1DRange, 0 /* flags */);
}
163*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(0), every index passed to the
 * task must be below kParallelize1DRange (checked inside CheckBounds1D).
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		pool.get(), CheckBounds1D, nullptr /* task argument */,
		kParallelize1DRange, 0 /* flags */);
}
179*b095b053SXin Li
/*
 * 1D task callback: marks item i as processed by storing true (relaxed
 * ordering) into the i-th flag of the array passed as the task argument.
 */
static void SetTrue1D(std::atomic_bool* processed_indicators, size_t i) {
	std::atomic_bool& indicator = processed_indicators[i];
	indicator.store(true, std::memory_order_relaxed);
}
183*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(1), every one of the
 * kParallelize1DRange items must have its flag set by SetTrue1D.
 */
TEST(Parallelize1D, SingleThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> processed(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	/* The callback takes a typed pointer; the API expects the generic task type. */
	const auto task = reinterpret_cast<pthreadpool_task_1d_t>(SetTrue1D);
	pthreadpool_parallelize_1d(
		pool.get(),
		task,
		static_cast<void*>(processed.data()),
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_bool& flag : processed) {
		EXPECT_TRUE(flag.load(std::memory_order_relaxed))
			<< "Element " << index << " not processed";
		++index;
	}
}
202*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(0), every one of the
 * kParallelize1DRange items must have its flag set by SetTrue1D.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1D, MultiThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> processed(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	/* The callback takes a typed pointer; the API expects the generic task type. */
	const auto task = reinterpret_cast<pthreadpool_task_1d_t>(SetTrue1D);
	pthreadpool_parallelize_1d(
		pool.get(),
		task,
		static_cast<void*>(processed.data()),
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_bool& flag : processed) {
		EXPECT_TRUE(flag.load(std::memory_order_relaxed))
			<< "Element " << index << " not processed";
		++index;
	}
}
225*b095b053SXin Li
/*
 * 1D task callback: atomically adds 1 (relaxed ordering) to the i-th counter
 * of the array passed as the task argument.
 */
static void Increment1D(std::atomic_int* processed_counters, size_t i) {
	std::atomic_int& counter = processed_counters[i];
	counter.fetch_add(1, std::memory_order_relaxed);
}
229*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(1), a single parallelize call
 * must invoke the task exactly once per item: every counter ends at 1.
 */
TEST(Parallelize1D, SingleThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> hits(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto task = reinterpret_cast<pthreadpool_task_1d_t>(Increment1D);
	pthreadpool_parallelize_1d(
		pool.get(),
		task,
		static_cast<void*>(hits.data()),
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_int& counter : hits) {
		const int times = counter.load(std::memory_order_relaxed);
		EXPECT_EQ(times, 1)
			<< "Element " << index << " was processed " << times << " times (expected: 1)";
		++index;
	}
}
248*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(0), a single parallelize call
 * must invoke the task exactly once per item: every counter ends at 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1D, MultiThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> hits(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_1d_t>(Increment1D);
	pthreadpool_parallelize_1d(
		pool.get(),
		task,
		static_cast<void*>(hits.data()),
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_int& counter : hits) {
		const int times = counter.load(std::memory_order_relaxed);
		EXPECT_EQ(times, 1)
			<< "Element " << index << " was processed " << times << " times (expected: 1)";
		++index;
	}
}
271*b095b053SXin Li
/*
 * Repeats the parallelize call kIncrementIterations times on a pool created
 * with pthreadpool_create(1) and checks that every per-item counter ends up
 * equal to kIncrementIterations, i.e. each call processed each item once.
 */
TEST(Parallelize1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
		pthreadpool_parallelize_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_1d_t>(Increment1D),
			static_cast<void*>(counters.data()),
			kParallelize1DRange,
			0 /* flags */);
	}

	/*
	 * Counters are int while kIncrementIterations is size_t; convert the
	 * expectation once so EXPECT_EQ compares values of the same signedness.
	 */
	const int expected_count = static_cast<int>(kIncrementIterations);
	for (size_t i = 0; i < kParallelize1DRange; i++) {
		/* Load once so the checked value and the reported value agree. */
		const int count = counters[i].load(std::memory_order_relaxed);
		EXPECT_EQ(count, expected_count)
			<< "Element " << i << " was processed " << count << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
293*b095b053SXin Li
/*
 * Repeats the parallelize call kIncrementIterations times on a pool created
 * with pthreadpool_create(0) and checks that every per-item counter ends up
 * equal to kIncrementIterations, i.e. each call processed each item once.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
		pthreadpool_parallelize_1d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_1d_t>(Increment1D),
			static_cast<void*>(counters.data()),
			kParallelize1DRange,
			0 /* flags */);
	}

	/*
	 * Counters are int while kIncrementIterations is size_t; convert the
	 * expectation once so EXPECT_EQ compares values of the same signedness.
	 */
	const int expected_count = static_cast<int>(kIncrementIterations);
	for (size_t i = 0; i < kParallelize1DRange; i++) {
		/* Load once so the checked value and the reported value agree. */
		const int count = counters[i].load(std::memory_order_relaxed);
		EXPECT_EQ(count, expected_count)
			<< "Element " << i << " was processed " << count << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
319*b095b053SXin Li
/*
 * 1D task callback: atomically adds 1 (relaxed ordering) to the single shared
 * counter passed as the task argument, regardless of which item is processed.
 * The item index is not used, so it is left unnamed like the unused
 * parameters of the other callbacks in this file.
 */
static void IncrementSame1D(std::atomic_int* num_processed_items, size_t /* i */) {
	num_processed_items->fetch_add(1, std::memory_order_relaxed);
}
323*b095b053SXin Li
/*
 * Every item increments the same shared atomic counter, so all tasks contend
 * on one variable. After the parallelize call the counter must equal
 * kParallelize1DRange. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1D, MultiThreadPoolHighContention) {
	/* Direct initialization instead of the deprecated ATOMIC_VAR_INIT macro. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(IncrementSame1D),
		static_cast<void*>(&num_processed_items),
		kParallelize1DRange,
		0 /* flags */);

	/* The counter is int, the range is size_t: compare as int explicitly. */
	EXPECT_EQ(
		num_processed_items.load(std::memory_order_relaxed),
		static_cast<int>(kParallelize1DRange));
}
342*b095b053SXin Li
WorkImbalance1D(std::atomic_int * num_processed_items,size_t i)343*b095b053SXin Li static void WorkImbalance1D(std::atomic_int* num_processed_items, size_t i) {
344*b095b053SXin Li num_processed_items->fetch_add(1, std::memory_order_relaxed);
345*b095b053SXin Li if (i == 0) {
346*b095b053SXin Li /* Spin-wait until all items are computed */
347*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) {
348*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
349*b095b053SXin Li }
350*b095b053SXin Li }
351*b095b053SXin Li }
352*b095b053SXin Li
/*
 * Runs WorkImbalance1D, whose item 0 blocks until every item has been
 * counted. The call can therefore only return if the remaining items are
 * completed while item 0 is still spinning (presumably by other threads
 * taking over work from the stalled one, as the test name suggests).
 * Afterwards the counter must equal kParallelize1DRange.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1D, MultiThreadPoolWorkStealing) {
	/* Direct initialization instead of the deprecated ATOMIC_VAR_INIT macro. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_1d_t>(WorkImbalance1D),
		static_cast<void*>(&num_processed_items),
		kParallelize1DRange,
		0 /* flags */);

	/* The counter is int, the range is size_t: compare as int explicitly. */
	EXPECT_EQ(
		num_processed_items.load(std::memory_order_relaxed),
		static_cast<int>(kParallelize1DRange));
}
371*b095b053SXin Li
/* 1D-with-uarch task callback that ignores all of its arguments and does no work. */
static void ComputeNothing1DWithUArch(void* /* unused */, uint32_t /* unused */, size_t /* unused */)
{
}
374*b095b053SXin Li
/*
 * Smoke test: running a no-op uarch-aware task over kParallelize1DRange items
 * on a pool created with pthreadpool_create(1) must return.
 */
TEST(Parallelize1DWithUArch, SingleThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		ComputeNothing1DWithUArch,
		nullptr /* task argument */,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
387*b095b053SXin Li
/*
 * Smoke test: running a no-op uarch-aware task over kParallelize1DRange items
 * on a pool created with pthreadpool_create(0) must return. Skipped when the
 * pool reports at most one thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolCompletes) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		ComputeNothing1DWithUArch,
		nullptr /* task argument */,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
405*b095b053SXin Li
CheckUArch1DWithUArch(void *,uint32_t uarch_index,size_t)406*b095b053SXin Li static void CheckUArch1DWithUArch(void*, uint32_t uarch_index, size_t) {
407*b095b053SXin Li if (uarch_index != kDefaultUArchIndex) {
408*b095b053SXin Li EXPECT_LE(uarch_index, kMaxUArchIndex);
409*b095b053SXin Li }
410*b095b053SXin Li }
411*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(1), every uarch index passed to
 * the task must be the default index or within the maximum (checked inside
 * CheckUArch1DWithUArch).
 */
TEST(Parallelize1DWithUArch, SingleThreadPoolUArchInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		CheckUArch1DWithUArch,
		nullptr /* task argument */,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
424*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(0), every uarch index passed to
 * the task must be the default index or within the maximum (checked inside
 * CheckUArch1DWithUArch). Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolUArchInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		CheckUArch1DWithUArch,
		nullptr /* task argument */,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);
}
442*b095b053SXin Li
CheckBounds1DWithUArch(void *,uint32_t,size_t i)443*b095b053SXin Li static void CheckBounds1DWithUArch(void*, uint32_t, size_t i) {
444*b095b053SXin Li EXPECT_LT(i, kParallelize1DRange);
445*b095b053SXin Li }
446*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(1), every index passed to the
 * uarch-aware task must be below kParallelize1DRange (checked inside
 * CheckBounds1DWithUArch).
 */
TEST(Parallelize1DWithUArch, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	pthreadpool_parallelize_1d_with_uarch(
		pool.get(), CheckBounds1DWithUArch, nullptr /* task argument */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize1DRange, 0 /* flags */);
}
460*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(0), every index passed to the
 * uarch-aware task must be below kParallelize1DRange (checked inside
 * CheckBounds1DWithUArch). Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_1d_with_uarch(
		pool.get(), CheckBounds1DWithUArch, nullptr /* task argument */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize1DRange, 0 /* flags */);
}
478*b095b053SXin Li
/*
 * 1D-with-uarch task callback: marks item i as processed by storing true
 * (relaxed ordering) into the i-th flag of the array passed as the task
 * argument. The uarch index is ignored.
 */
static void SetTrue1DWithUArch(std::atomic_bool* processed_indicators, uint32_t /* unused */, size_t i) {
	std::atomic_bool& indicator = processed_indicators[i];
	indicator.store(true, std::memory_order_relaxed);
}
482*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(1), every one of the
 * kParallelize1DRange items must have its flag set by SetTrue1DWithUArch.
 */
TEST(Parallelize1DWithUArch, SingleThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> processed(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	/* The callback takes a typed pointer; the API expects the generic task type. */
	const auto task = reinterpret_cast<pthreadpool_task_1d_with_id_t>(SetTrue1DWithUArch);
	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		task,
		static_cast<void*>(processed.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_bool& flag : processed) {
		EXPECT_TRUE(flag.load(std::memory_order_relaxed))
			<< "Element " << index << " not processed";
		++index;
	}
}
503*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(0), every one of the
 * kParallelize1DRange items must have its flag set by SetTrue1DWithUArch.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> processed(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	/* The callback takes a typed pointer; the API expects the generic task type. */
	const auto task = reinterpret_cast<pthreadpool_task_1d_with_id_t>(SetTrue1DWithUArch);
	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		task,
		static_cast<void*>(processed.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_bool& flag : processed) {
		EXPECT_TRUE(flag.load(std::memory_order_relaxed))
			<< "Element " << index << " not processed";
		++index;
	}
}
528*b095b053SXin Li
/*
 * 1D-with-uarch task callback: atomically adds 1 (relaxed ordering) to the
 * i-th counter of the array passed as the task argument. The uarch index is
 * ignored.
 */
static void Increment1DWithUArch(std::atomic_int* processed_counters, uint32_t /* unused */, size_t i) {
	std::atomic_int& counter = processed_counters[i];
	counter.fetch_add(1, std::memory_order_relaxed);
}
532*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(1), a single uarch-aware
 * parallelize call must invoke the task exactly once per item: every counter
 * ends at 1.
 */
TEST(Parallelize1DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> hits(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto task = reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch);
	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		task,
		static_cast<void*>(hits.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_int& counter : hits) {
		const int times = counter.load(std::memory_order_relaxed);
		EXPECT_EQ(times, 1)
			<< "Element " << index << " was processed " << times << " times (expected: 1)";
		++index;
	}
}
553*b095b053SXin Li
/*
 * On a pool created with pthreadpool_create(0), a single uarch-aware
 * parallelize call must invoke the task exactly once per item: every counter
 * ends at 1. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> hits(kParallelize1DRange);

	auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(pool != nullptr);

	const auto thread_count = pthreadpool_get_threads_count(pool.get());
	if (thread_count < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch);
	pthreadpool_parallelize_1d_with_uarch(
		pool.get(),
		task,
		static_cast<void*>(hits.data()),
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize1DRange,
		0 /* flags */);

	size_t index = 0;
	for (const std::atomic_int& counter : hits) {
		const int times = counter.load(std::memory_order_relaxed);
		EXPECT_EQ(times, 1)
			<< "Element " << index << " was processed " << times << " times (expected: 1)";
		++index;
	}
}
578*b095b053SXin Li
/*
 * Repeats the uarch-aware parallelize call kIncrementIterations times on a
 * pool created with pthreadpool_create(1) and checks that every per-item
 * counter ends up equal to kIncrementIterations, i.e. each call processed
 * each item once.
 */
TEST(Parallelize1DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize1DRange);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
		pthreadpool_parallelize_1d_with_uarch(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch),
			static_cast<void*>(counters.data()),
			kDefaultUArchIndex,
			kMaxUArchIndex,
			kParallelize1DRange,
			0 /* flags */);
	}

	/*
	 * Counters are int while kIncrementIterations is size_t; convert the
	 * expectation once so EXPECT_EQ compares values of the same signedness.
	 */
	const int expected_count = static_cast<int>(kIncrementIterations);
	for (size_t i = 0; i < kParallelize1DRange; i++) {
		/* Load once so the checked value and the reported value agree. */
		const int count = counters[i].load(std::memory_order_relaxed);
		EXPECT_EQ(count, expected_count)
			<< "Element " << i << " was processed " << count << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
602*b095b053SXin Li
/*
 * Same check as the single-thread variant, but on a pool created with a
 * thread count of 0. The test is skipped when that pool reports at most one
 * thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
  std::vector<std::atomic_int> counters(kParallelize1DRange);

  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  /* Task and context are identical on every pass, so resolve them once. */
  const auto task = reinterpret_cast<pthreadpool_task_1d_with_id_t>(Increment1DWithUArch);
  void* const context = counters.data();
  for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
    pthreadpool_parallelize_1d_with_uarch(
      threadpool.get(), task, context,
      kDefaultUArchIndex, kMaxUArchIndex,
      kParallelize1DRange,
      0 /* flags */);
  }

  for (size_t i = 0; i < counters.size(); ++i) {
    EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations)
      << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
      << "(expected: " << kIncrementIterations << ")";
  }
}
630*b095b053SXin Li
/*
 * Task that adds 1 to a single shared counter for every item it is invoked on.
 *
 * The context is taken as void* and converted back here, so the function's
 * own type does not differ in its first parameter from the void*-based task
 * type the callers cast it to. Calling a function through a pointer of a
 * different function type is undefined behaviour.
 *
 * num_processed_items: points to the shared std::atomic_int counter.
 * The uarch index and the item index are not used.
 */
static void IncrementSame1DWithUArch(void* num_processed_items, uint32_t, size_t /* i */) {
  std::atomic_int* const counter = static_cast<std::atomic_int*>(num_processed_items);
  counter->fetch_add(1, std::memory_order_relaxed);
}
634*b095b053SXin Li
/*
 * Every item of the 1D range increments the same atomic counter; after the
 * parallel-for returns, the counter must equal kParallelize1DRange.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolHighContention) {
  /* Brace initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20. */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_1d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_1d_with_id_t>(IncrementSame1DWithUArch),
    static_cast<void*>(&num_processed_items),
    kDefaultUArchIndex,
    kMaxUArchIndex,
    kParallelize1DRange,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange);
}
655*b095b053SXin Li
WorkImbalance1DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t i)656*b095b053SXin Li static void WorkImbalance1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i) {
657*b095b053SXin Li num_processed_items->fetch_add(1, std::memory_order_relaxed);
658*b095b053SXin Li if (i == 0) {
659*b095b053SXin Li /* Spin-wait until all items are computed */
660*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) {
661*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
662*b095b053SXin Li }
663*b095b053SXin Li }
664*b095b053SXin Li }
665*b095b053SXin Li
/*
 * Runs WorkImbalance1DWithUArch over the 1D range: item 0 spins until the
 * shared counter reaches kParallelize1DRange, so the call can only return if
 * the other items are processed while item 0 is still waiting.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DWithUArch, MultiThreadPoolWorkStealing) {
  /* Brace initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20. */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_1d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_1d_with_id_t>(WorkImbalance1DWithUArch),
    static_cast<void*>(&num_processed_items),
    kDefaultUArchIndex,
    kMaxUArchIndex,
    kParallelize1DRange,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange);
}
686*b095b053SXin Li
/* No-op 1D-tiled task: ignores its context, tile start and tile size. */
static void ComputeNothing1DTile1D(void* /* context */, size_t /* start_i */, size_t /* tile_i */) {}
689*b095b053SXin Li
/*
 * Smoke test: a 1D-tiled parallel-for with a no-op task on a pool created
 * with one thread must simply return.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolCompletes) {
  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    ComputeNothing1DTile1D,
    nullptr /* context */,
    kParallelize1DTile1DRange,
    kParallelize1DTile1DTile,
    0 /* flags */);
}
700*b095b053SXin Li
/*
 * Smoke test: a 1D-tiled parallel-for with a no-op task on a pool created
 * with a thread count of 0 must simply return.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolCompletes) {
  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    ComputeNothing1DTile1D,
    nullptr /* context */,
    kParallelize1DTile1DRange,
    kParallelize1DTile1DTile,
    0 /* flags */);
}
716*b095b053SXin Li
CheckBounds1DTile1D(void *,size_t start_i,size_t tile_i)717*b095b053SXin Li static void CheckBounds1DTile1D(void*, size_t start_i, size_t tile_i) {
718*b095b053SXin Li EXPECT_LT(start_i, kParallelize1DTile1DRange);
719*b095b053SXin Li EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange);
720*b095b053SXin Li }
721*b095b053SXin Li
/*
 * Runs CheckBounds1DTile1D over the tiled 1D range on a pool created with one
 * thread; the expectations live inside the task itself.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    CheckBounds1DTile1D,
    nullptr /* context */,
    kParallelize1DTile1DRange,
    kParallelize1DTile1DTile,
    0 /* flags */);
}
733*b095b053SXin Li
/*
 * Runs CheckBounds1DTile1D over the tiled 1D range on a pool created with a
 * thread count of 0; the expectations live inside the task itself.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    CheckBounds1DTile1D,
    nullptr /* context */,
    kParallelize1DTile1DRange,
    kParallelize1DTile1DTile,
    0 /* flags */);
}
749*b095b053SXin Li
CheckTiling1DTile1D(void *,size_t start_i,size_t tile_i)750*b095b053SXin Li static void CheckTiling1DTile1D(void*, size_t start_i, size_t tile_i) {
751*b095b053SXin Li EXPECT_GT(tile_i, 0);
752*b095b053SXin Li EXPECT_LE(tile_i, kParallelize1DTile1DTile);
753*b095b053SXin Li EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0);
754*b095b053SXin Li EXPECT_EQ(tile_i, std::min<size_t>(kParallelize1DTile1DTile, kParallelize1DTile1DRange - start_i));
755*b095b053SXin Li }
756*b095b053SXin Li
/*
 * Runs CheckTiling1DTile1D over the tiled 1D range on a pool created with one
 * thread; the expectations live inside the task itself.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    CheckTiling1DTile1D,
    nullptr /* context */,
    kParallelize1DTile1DRange,
    kParallelize1DTile1DTile,
    0 /* flags */);
}
768*b095b053SXin Li
/*
 * Runs CheckTiling1DTile1D over the tiled 1D range on a pool created with a
 * thread count of 0; the expectations live inside the task itself.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    CheckTiling1DTile1D,
    nullptr /* context */,
    kParallelize1DTile1DRange,
    kParallelize1DTile1DTile,
    0 /* flags */);
}
784*b095b053SXin Li
/*
 * 1D-tiled task that marks every item of its tile as processed by storing
 * true into the corresponding flag.
 *
 * The context is taken as void* and converted back here, so the function's
 * own type does not differ in its first parameter from the void*-based task
 * type the callers cast it to. Calling a function through a pointer of a
 * different function type is undefined behaviour.
 *
 * processed_indicators: points to the first element of an array of
 *                       std::atomic_bool flags indexed by item.
 * start_i: index of the first item of the tile.
 * tile_i: number of items in the tile.
 */
static void SetTrue1DTile1D(void* processed_indicators, size_t start_i, size_t tile_i) {
  std::atomic_bool* const indicators = static_cast<std::atomic_bool*>(processed_indicators);
  for (size_t i = start_i; i < start_i + tile_i; i++) {
    indicators[i].store(true, std::memory_order_relaxed);
  }
}
790*b095b053SXin Li
/*
 * Runs SetTrue1DTile1D over the tiled 1D range on a pool created with one
 * thread and expects every flag in [0, kParallelize1DTile1DRange) to be set.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsProcessed) {
  std::vector<std::atomic_bool> indicators(kParallelize1DTile1DRange);

  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(SetTrue1DTile1D);
  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(), task, indicators.data(),
    kParallelize1DTile1DRange, kParallelize1DTile1DTile,
    0 /* flags */);

  for (size_t i = 0; i < indicators.size(); ++i) {
    EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
      << "Element " << i << " not processed";
  }
}
809*b095b053SXin Li
/*
 * Runs SetTrue1DTile1D over the tiled 1D range on a pool created with a
 * thread count of 0 and expects every flag in [0, kParallelize1DTile1DRange)
 * to be set. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsProcessed) {
  std::vector<std::atomic_bool> indicators(kParallelize1DTile1DRange);

  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(SetTrue1DTile1D);
  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(), task, indicators.data(),
    kParallelize1DTile1DRange, kParallelize1DTile1DTile,
    0 /* flags */);

  for (size_t i = 0; i < indicators.size(); ++i) {
    EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed))
      << "Element " << i << " not processed";
  }
}
832*b095b053SXin Li
/*
 * 1D-tiled task that adds 1 to the per-item counter of every item in its tile.
 *
 * The context is taken as void* and converted back here, so the function's
 * own type does not differ in its first parameter from the void*-based task
 * type the callers cast it to. Calling a function through a pointer of a
 * different function type is undefined behaviour.
 *
 * processed_counters: points to the first element of an array of
 *                     std::atomic_int counters indexed by item.
 * start_i: index of the first item of the tile.
 * tile_i: number of items in the tile.
 */
static void Increment1DTile1D(void* processed_counters, size_t start_i, size_t tile_i) {
  std::atomic_int* const counters = static_cast<std::atomic_int*>(processed_counters);
  for (size_t i = start_i; i < start_i + tile_i; i++) {
    counters[i].fetch_add(1, std::memory_order_relaxed);
  }
}
838*b095b053SXin Li
/*
 * Runs Increment1DTile1D once over the tiled 1D range on a pool created with
 * one thread and expects every per-item counter to be exactly 1.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedOnce) {
  std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(), task, counters.data(),
    kParallelize1DTile1DRange, kParallelize1DTile1DTile,
    0 /* flags */);

  for (size_t i = 0; i < counters.size(); ++i) {
    EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
      << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
857*b095b053SXin Li
/*
 * Runs Increment1DTile1D once over the tiled 1D range on a pool created with
 * a thread count of 0 and expects every per-item counter to be exactly 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedOnce) {
  std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(), task, counters.data(),
    kParallelize1DTile1DRange, kParallelize1DTile1DTile,
    0 /* flags */);

  for (size_t i = 0; i < counters.size(); ++i) {
    EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1)
      << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
880*b095b053SXin Li
/*
 * Runs Increment1DTile1D kIncrementIterations times over the tiled 1D range
 * on a pool created with one thread and expects every per-item counter to
 * equal kIncrementIterations.
 */
TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
  std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  /* Task and context are identical on every pass, so resolve them once. */
  const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
  void* const context = counters.data();
  for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
    pthreadpool_parallelize_1d_tile_1d(
      threadpool.get(), task, context,
      kParallelize1DTile1DRange, kParallelize1DTile1DTile,
      0 /* flags */);
  }

  for (size_t i = 0; i < counters.size(); ++i) {
    EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations)
      << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
      << "(expected: " << kIncrementIterations << ")";
  }
}
902*b095b053SXin Li
/*
 * Runs Increment1DTile1D kIncrementIterations times over the tiled 1D range
 * on a pool created with a thread count of 0 and expects every per-item
 * counter to equal kIncrementIterations.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
  std::vector<std::atomic_int> counters(kParallelize1DTile1DRange);

  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  /* Task and context are identical on every pass, so resolve them once. */
  const auto task = reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D);
  void* const context = counters.data();
  for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
    pthreadpool_parallelize_1d_tile_1d(
      threadpool.get(), task, context,
      kParallelize1DTile1DRange, kParallelize1DTile1DTile,
      0 /* flags */);
  }

  for (size_t i = 0; i < counters.size(); ++i) {
    EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations)
      << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times "
      << "(expected: " << kIncrementIterations << ")";
  }
}
928*b095b053SXin Li
/*
 * 1D-tiled task that adds 1 to a single shared counter once per item of its
 * tile (tile_i separate atomic increments rather than one add of tile_i).
 *
 * The context is taken as void* and converted back here, so the function's
 * own type does not differ in its first parameter from the void*-based task
 * type the callers cast it to. Calling a function through a pointer of a
 * different function type is undefined behaviour.
 *
 * num_processed_items: points to the shared std::atomic_int counter.
 * start_i: index of the first item of the tile.
 * tile_i: number of items in the tile.
 */
static void IncrementSame1DTile1D(void* num_processed_items, size_t start_i, size_t tile_i) {
  std::atomic_int* const counter = static_cast<std::atomic_int*>(num_processed_items);
  for (size_t i = start_i; i < start_i + tile_i; i++) {
    counter->fetch_add(1, std::memory_order_relaxed);
  }
}
934*b095b053SXin Li
/*
 * Every item of the tiled 1D range increments the same atomic counter; after
 * the parallel-for returns, the counter must equal kParallelize1DTile1DRange.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolHighContention) {
  /* Brace initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20. */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(IncrementSame1DTile1D),
    static_cast<void*>(&num_processed_items),
    kParallelize1DTile1DRange, kParallelize1DTile1DTile,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DTile1DRange);
}
953*b095b053SXin Li
WorkImbalance1DTile1D(std::atomic_int * num_processed_items,size_t start_i,size_t tile_i)954*b095b053SXin Li static void WorkImbalance1DTile1D(std::atomic_int* num_processed_items, size_t start_i, size_t tile_i) {
955*b095b053SXin Li num_processed_items->fetch_add(tile_i, std::memory_order_relaxed);
956*b095b053SXin Li if (start_i == 0) {
957*b095b053SXin Li /* Spin-wait until all items are computed */
958*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DTile1DRange) {
959*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
960*b095b053SXin Li }
961*b095b053SXin Li }
962*b095b053SXin Li }
963*b095b053SXin Li
/*
 * Runs WorkImbalance1DTile1D over the tiled 1D range: the first tile spins
 * until the shared counter reaches kParallelize1DTile1DRange, so the call can
 * only return if the other tiles are processed while it is still waiting.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize1DTile1D, MultiThreadPoolWorkStealing) {
  /* Brace initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20. */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_1d_tile_1d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(WorkImbalance1DTile1D),
    static_cast<void*>(&num_processed_items),
    kParallelize1DTile1DRange, kParallelize1DTile1DTile,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DTile1DRange);
}
982*b095b053SXin Li
/* No-op 2D task: ignores its context and both indices. */
static void ComputeNothing2D(void* /* context */, size_t /* i */, size_t /* j */) {}
985*b095b053SXin Li
/*
 * Smoke test: a 2D parallel-for with a no-op task on a pool created with one
 * thread must simply return.
 */
TEST(Parallelize2D, SingleThreadPoolCompletes) {
  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d(
    threadpool.get(),
    ComputeNothing2D,
    nullptr /* context */,
    kParallelize2DRangeI,
    kParallelize2DRangeJ,
    0 /* flags */);
}
996*b095b053SXin Li
/*
 * Smoke test: a 2D parallel-for with a no-op task on a pool created with a
 * thread count of 0 must simply return.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolCompletes) {
  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d(
    threadpool.get(),
    ComputeNothing2D,
    nullptr /* context */,
    kParallelize2DRangeI,
    kParallelize2DRangeJ,
    0 /* flags */);
}
1012*b095b053SXin Li
CheckBounds2D(void *,size_t i,size_t j)1013*b095b053SXin Li static void CheckBounds2D(void*, size_t i, size_t j) {
1014*b095b053SXin Li EXPECT_LT(i, kParallelize2DRangeI);
1015*b095b053SXin Li EXPECT_LT(j, kParallelize2DRangeJ);
1016*b095b053SXin Li }
1017*b095b053SXin Li
/*
 * Runs CheckBounds2D over the 2D range on a pool created with one thread;
 * the expectations live inside the task itself.
 */
TEST(Parallelize2D, SingleThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d(
    threadpool.get(),
    CheckBounds2D,
    nullptr /* context */,
    kParallelize2DRangeI,
    kParallelize2DRangeJ,
    0 /* flags */);
}
1029*b095b053SXin Li
/*
 * Runs CheckBounds2D over the 2D range on a pool created with a thread count
 * of 0; the expectations live inside the task itself.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d(
    threadpool.get(),
    CheckBounds2D,
    nullptr /* context */,
    kParallelize2DRangeI,
    kParallelize2DRangeJ,
    0 /* flags */);
}
1045*b095b053SXin Li
SetTrue2D(std::atomic_bool * processed_indicators,size_t i,size_t j)1046*b095b053SXin Li static void SetTrue2D(std::atomic_bool* processed_indicators, size_t i, size_t j) {
1047*b095b053SXin Li const size_t linear_idx = i * kParallelize2DRangeJ + j;
1048*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
1049*b095b053SXin Li }
1050*b095b053SXin Li
/*
 * Runs SetTrue2D over the 2D range on a pool created with one thread and
 * expects the flag of every (i, j) pair to be set.
 */
TEST(Parallelize2D, SingleThreadPoolAllItemsProcessed) {
  std::vector<std::atomic_bool> indicators(kParallelize2DRangeI * kParallelize2DRangeJ);

  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto task = reinterpret_cast<pthreadpool_task_2d_t>(SetTrue2D);
  pthreadpool_parallelize_2d(
    threadpool.get(), task, indicators.data(),
    kParallelize2DRangeI, kParallelize2DRangeJ,
    0 /* flags */);

  /* Walk the flags in row-major order and recover (i, j) for the message. */
  for (size_t linear_idx = 0; linear_idx < indicators.size(); ++linear_idx) {
    const size_t i = linear_idx / kParallelize2DRangeJ;
    const size_t j = linear_idx % kParallelize2DRangeJ;
    EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
      << "Element (" << i << ", " << j << ") not processed";
  }
}
1072*b095b053SXin Li
/*
 * Runs SetTrue2D over the 2D range on a pool created with a thread count of 0
 * and expects the flag of every (i, j) pair to be set.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolAllItemsProcessed) {
  std::vector<std::atomic_bool> indicators(kParallelize2DRangeI * kParallelize2DRangeJ);

  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  const auto task = reinterpret_cast<pthreadpool_task_2d_t>(SetTrue2D);
  pthreadpool_parallelize_2d(
    threadpool.get(), task, indicators.data(),
    kParallelize2DRangeI, kParallelize2DRangeJ,
    0 /* flags */);

  /* Walk the flags in row-major order and recover (i, j) for the message. */
  for (size_t linear_idx = 0; linear_idx < indicators.size(); ++linear_idx) {
    const size_t i = linear_idx / kParallelize2DRangeJ;
    const size_t j = linear_idx % kParallelize2DRangeJ;
    EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
      << "Element (" << i << ", " << j << ") not processed";
  }
}
1098*b095b053SXin Li
Increment2D(std::atomic_int * processed_counters,size_t i,size_t j)1099*b095b053SXin Li static void Increment2D(std::atomic_int* processed_counters, size_t i, size_t j) {
1100*b095b053SXin Li const size_t linear_idx = i * kParallelize2DRangeJ + j;
1101*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
1102*b095b053SXin Li }
1103*b095b053SXin Li
/*
 * Runs Increment2D once over the 2D range on a pool created with one thread
 * and expects the counter of every (i, j) pair to be exactly 1.
 */
TEST(Parallelize2D, SingleThreadPoolEachItemProcessedOnce) {
  std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ);

  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
  pthreadpool_parallelize_2d(
    threadpool.get(), task, counters.data(),
    kParallelize2DRangeI, kParallelize2DRangeJ,
    0 /* flags */);

  /* Walk the counters in row-major order and recover (i, j) for the message. */
  for (size_t linear_idx = 0; linear_idx < counters.size(); ++linear_idx) {
    const size_t i = linear_idx / kParallelize2DRangeJ;
    const size_t j = linear_idx % kParallelize2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
1126*b095b053SXin Li
/*
 * Runs Increment2D once over the 2D range on a pool created with a thread
 * count of 0 and expects the counter of every (i, j) pair to be exactly 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2D, MultiThreadPoolEachItemProcessedOnce) {
  std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ);

  auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  const auto task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
  pthreadpool_parallelize_2d(
    threadpool.get(), task, counters.data(),
    kParallelize2DRangeI, kParallelize2DRangeJ,
    0 /* flags */);

  /* Walk the counters in row-major order and recover (i, j) for the message. */
  for (size_t linear_idx = 0; linear_idx < counters.size(); ++linear_idx) {
    const size_t i = linear_idx / kParallelize2DRangeJ;
    const size_t j = linear_idx % kParallelize2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
1153*b095b053SXin Li
/*
 * Runs Increment2D kIncrementIterations times over the 2D range on a pool
 * created with one thread and expects the counter of every (i, j) pair to
 * equal kIncrementIterations.
 */
TEST(Parallelize2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
  std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ);

  auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
  ASSERT_TRUE(threadpool.get());

  /* Task and context are identical on every pass, so resolve them once. */
  const auto task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
  void* const context = counters.data();
  for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
    pthreadpool_parallelize_2d(
      threadpool.get(), task, context,
      kParallelize2DRangeI, kParallelize2DRangeJ,
      0 /* flags */);
  }

  /* Walk the counters in row-major order and recover (i, j) for the message. */
  for (size_t linear_idx = 0; linear_idx < counters.size(); ++linear_idx) {
    const size_t i = linear_idx / kParallelize2DRangeJ;
    const size_t j = linear_idx % kParallelize2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times "
      << "(expected: " << kIncrementIterations << ")";
  }
}
1179*b095b053SXin Li
// Same check as the single-thread variant, but on a pool created with
// pthreadpool_create(0). Skipped when that pool reports at most one thread.
TEST(Parallelize2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize2DRangeI * kParallelize2DRangeJ;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_t task = reinterpret_cast<pthreadpool_task_2d_t>(Increment2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d(
			pool, task, context,
			kParallelize2DRangeI, kParallelize2DRangeJ,
			0 /* flags */);
	}

	/* Each counter must have been bumped exactly once per pass */
	for (size_t i = 0; i < kParallelize2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DRangeJ + j;
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
				<< "(expected: " << kIncrementIterations << ")";
		}
	}
}
1209*b095b053SXin Li
// Task body for the 2D high-contention test: adds 1 to the single shared
// counter on every invocation, whichever (i, j) item is being processed.
//
// num_processed_items: shared counter, incremented with relaxed ordering.
// The two index parameters are deliberately unnamed because they are unused.
static void IncrementSame2D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */) {
	num_processed_items->fetch_add(1, std::memory_order_relaxed);
}
1213*b095b053SXin Li
// Every item of the 2D range increments one shared counter (IncrementSame2D);
// afterwards the counter must equal the number of items in the range.
// Uses a pool created with pthreadpool_create(0) and is skipped when that pool
// reports at most one thread.
TEST(Parallelize2D, MultiThreadPoolHighContention) {
	// Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20.
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_t>(IncrementSame2D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DRangeI * kParallelize2DRangeJ);
}
1232*b095b053SXin Li
WorkImbalance2D(std::atomic_int * num_processed_items,size_t i,size_t j)1233*b095b053SXin Li static void WorkImbalance2D(std::atomic_int* num_processed_items, size_t i, size_t j) {
1234*b095b053SXin Li num_processed_items->fetch_add(1, std::memory_order_relaxed);
1235*b095b053SXin Li if (i == 0 && j == 0) {
1236*b095b053SXin Li /* Spin-wait until all items are computed */
1237*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DRangeI * kParallelize2DRangeJ) {
1238*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
1239*b095b053SXin Li }
1240*b095b053SXin Li }
1241*b095b053SXin Li }
1242*b095b053SXin Li
// Runs WorkImbalance2D over the 2D range, where the (0, 0) item blocks until
// all items have been counted, and then expects the shared counter to equal the
// number of items. Uses a pool created with pthreadpool_create(0) and is
// skipped when that pool reports at most one thread.
TEST(Parallelize2D, MultiThreadPoolWorkStealing) {
	// Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20.
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_t>(WorkImbalance2D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DRangeI, kParallelize2DRangeJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DRangeI * kParallelize2DRangeJ);
}
1261*b095b053SXin Li
// No-op task with the 2D-tile-1D callback shape (context, i, start_j, tile_j).
// Used by the "Completes" tests, which pass it with a null context.
static void ComputeNothing2DTile1D(void* /* context */, size_t /* i */, size_t /* start_j */, size_t /* tile_j */) {
	/* intentionally empty */
}
1264*b095b053SXin Li
// Smoke test: pthreadpool_parallelize_2d_tile_1d with a no-op task must return
// when run on a pool created with pthreadpool_create(1).
TEST(Parallelize2DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_2d_tile_1d(
		pool,
		ComputeNothing2DTile1D,
		nullptr,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1275*b095b053SXin Li
// Smoke test: pthreadpool_parallelize_2d_tile_1d with a no-op task must return
// when run on a pool created with pthreadpool_create(0). Skipped when that pool
// reports at most one thread.
TEST(Parallelize2DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		pool,
		ComputeNothing2DTile1D,
		nullptr,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1291*b095b053SXin Li
CheckBounds2DTile1D(void *,size_t i,size_t start_j,size_t tile_j)1292*b095b053SXin Li static void CheckBounds2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) {
1293*b095b053SXin Li EXPECT_LT(i, kParallelize2DTile1DRangeI);
1294*b095b053SXin Li EXPECT_LT(start_j, kParallelize2DTile1DRangeJ);
1295*b095b053SXin Li EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ);
1296*b095b053SXin Li }
1297*b095b053SXin Li
// Runs CheckBounds2DTile1D for every tile of the range on a pool created with
// pthreadpool_create(1); the task itself reports out-of-range tiles.
TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_2d_tile_1d(
		pool,
		CheckBounds2DTile1D,
		nullptr,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1309*b095b053SXin Li
// Runs CheckBounds2DTile1D for every tile of the range on a pool created with
// pthreadpool_create(0); the task itself reports out-of-range tiles.
// Skipped when that pool reports at most one thread.
TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		pool,
		CheckBounds2DTile1D,
		nullptr,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1325*b095b053SXin Li
CheckTiling2DTile1D(void *,size_t i,size_t start_j,size_t tile_j)1326*b095b053SXin Li static void CheckTiling2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) {
1327*b095b053SXin Li EXPECT_GT(tile_j, 0);
1328*b095b053SXin Li EXPECT_LE(tile_j, kParallelize2DTile1DTileJ);
1329*b095b053SXin Li EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0);
1330*b095b053SXin Li EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile1DTileJ, kParallelize2DTile1DRangeJ - start_j));
1331*b095b053SXin Li }
1332*b095b053SXin Li
// Runs CheckTiling2DTile1D for every tile of the range on a pool created with
// pthreadpool_create(1); the task itself reports badly shaped tiles.
TEST(Parallelize2DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_2d_tile_1d(
		pool,
		CheckTiling2DTile1D,
		nullptr,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1344*b095b053SXin Li
// Runs CheckTiling2DTile1D for every tile of the range on a pool created with
// pthreadpool_create(0); the task itself reports badly shaped tiles.
// Skipped when that pool reports at most one thread.
TEST(Parallelize2DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		pool,
		CheckTiling2DTile1D,
		nullptr,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
}
1360*b095b053SXin Li
SetTrue2DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t start_j,size_t tile_j)1361*b095b053SXin Li static void SetTrue2DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t tile_j) {
1362*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
1363*b095b053SXin Li const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
1364*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
1365*b095b053SXin Li }
1366*b095b053SXin Li }
1367*b095b053SXin Li
// Runs SetTrue2DTile1D over the range on a pool created with
// pthreadpool_create(1) and expects the indicator of every (i, j) element to
// be set afterwards.
TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	const pthreadpool_task_2d_tile_1d_t task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(SetTrue2DTile1D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_2d_tile_1d(
		pool, task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Indicators are laid out row-major */
	for (size_t i = 0; i < kParallelize2DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile1DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
			EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
				<< "Element (" << i << ", " << j << ") not processed";
		}
	}
}
1389*b095b053SXin Li
// Runs SetTrue2DTile1D over the range on a pool created with
// pthreadpool_create(0) and expects the indicator of every (i, j) element to
// be set afterwards. Skipped when that pool reports at most one thread.
TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_1d_t task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(SetTrue2DTile1D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_2d_tile_1d(
		pool, task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Indicators are laid out row-major */
	for (size_t i = 0; i < kParallelize2DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile1DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
			EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
				<< "Element (" << i << ", " << j << ") not processed";
		}
	}
}
1415*b095b053SXin Li
Increment2DTile1D(std::atomic_int * processed_counters,size_t i,size_t start_j,size_t tile_j)1416*b095b053SXin Li static void Increment2DTile1D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t tile_j) {
1417*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
1418*b095b053SXin Li const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
1419*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
1420*b095b053SXin Li }
1421*b095b053SXin Li }
1422*b095b053SXin Li
// Runs Increment2DTile1D once over the range on a pool created with
// pthreadpool_create(1) and expects every per-element counter to be exactly 1.
TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	const pthreadpool_task_2d_tile_1d_t task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_2d_tile_1d(
		pool, task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Counters are laid out row-major */
	for (size_t i = 0; i < kParallelize2DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile1DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
		}
	}
}
1445*b095b053SXin Li
// Runs Increment2DTile1D once over the range on a pool created with
// pthreadpool_create(0) and expects every per-element counter to be exactly 1.
// Skipped when that pool reports at most one thread.
TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_1d_t task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_2d_tile_1d(
		pool, task, context,
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);

	/* Counters are laid out row-major */
	for (size_t i = 0; i < kParallelize2DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile1DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
		}
	}
}
1472*b095b053SXin Li
// Runs Increment2DTile1D kIncrementIterations times over the range on a pool
// created with pthreadpool_create(1), then expects every per-element counter
// to equal kIncrementIterations.
TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	const pthreadpool_task_2d_tile_1d_t task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d_tile_1d(
			pool, task, context,
			kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
			0 /* flags */);
	}

	/* Each counter must have been bumped exactly once per pass */
	for (size_t i = 0; i < kParallelize2DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile1DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
				<< "(expected: " << kIncrementIterations << ")";
		}
	}
}
1498*b095b053SXin Li
// Runs Increment2DTile1D kIncrementIterations times over the range on a pool
// created with pthreadpool_create(0), then expects every per-element counter
// to equal kIncrementIterations. Skipped when that pool reports at most one
// thread.
TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_2d_tile_1d_t task = reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_2d_tile_1d(
			pool, task, context,
			kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
			0 /* flags */);
	}

	/* Each counter must have been bumped exactly once per pass */
	for (size_t i = 0; i < kParallelize2DTile1DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile1DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j;
			EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
				<< "Element (" << i << ", " << j << ") was processed "
				<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
				<< "(expected: " << kIncrementIterations << ")";
		}
	}
}
1528*b095b053SXin Li
// Task body for the 2D-tile-1D high-contention test: adds 1 to the single
// shared counter once per element of the tile, i.e. tile_j separate atomic
// increments rather than one fetch_add(tile_j).
//
// num_processed_items: shared counter, incremented with relaxed ordering.
// tile_j:              number of elements in the tile.
// The position of the tile (i, start_j) does not matter and is left unnamed.
// Counting tile_j directly also avoids evaluating start_j + tile_j, which could
// wrap around and silently drop increments.
static void IncrementSame2DTile1D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* start_j */, size_t tile_j) {
	for (size_t remaining = tile_j; remaining != 0; remaining--) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
	}
}
1534*b095b053SXin Li
// Every element of every tile increments one shared counter
// (IncrementSame2DTile1D); afterwards the counter must equal the number of
// elements in the range. Uses a pool created with pthreadpool_create(0) and is
// skipped when that pool reports at most one thread.
TEST(Parallelize2DTile1D, MultiThreadPoolHighContention) {
	// Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20.
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(IncrementSame2DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);
}
1553*b095b053SXin Li
WorkImbalance2DTile1D(std::atomic_int * num_processed_items,size_t i,size_t start_j,size_t tile_j)1554*b095b053SXin Li static void WorkImbalance2DTile1D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t tile_j) {
1555*b095b053SXin Li num_processed_items->fetch_add(tile_j, std::memory_order_relaxed);
1556*b095b053SXin Li if (i == 0 && start_j == 0) {
1557*b095b053SXin Li /* Spin-wait until all items are computed */
1558*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) {
1559*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
1560*b095b053SXin Li }
1561*b095b053SXin Li }
1562*b095b053SXin Li }
1563*b095b053SXin Li
// Runs WorkImbalance2DTile1D over the range, where the first tile blocks until
// all elements have been counted, and then expects the shared counter to equal
// the number of elements. Uses a pool created with pthreadpool_create(0) and is
// skipped when that pool reports at most one thread.
TEST(Parallelize2DTile1D, MultiThreadPoolWorkStealing) {
	// Direct initialization: ATOMIC_VAR_INIT is deprecated since C++20.
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(WorkImbalance2DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ);
}
1582*b095b053SXin Li
// No-op task with the 2D-tile-2D callback shape
// (context, start_i, start_j, tile_i, tile_j).
// Used by the "Completes" tests, which pass it with a null context.
static void ComputeNothing2DTile2D(void* /* context */, size_t /* start_i */, size_t /* start_j */, size_t /* tile_i */, size_t /* tile_j */) {
	/* intentionally empty */
}
1585*b095b053SXin Li
// Smoke test: pthreadpool_parallelize_2d_tile_2d with a no-op task must return
// when run on a pool created with pthreadpool_create(1).
TEST(Parallelize2DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_2d_tile_2d(
		pool,
		ComputeNothing2DTile2D,
		nullptr,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1597*b095b053SXin Li
// Smoke test: pthreadpool_parallelize_2d_tile_2d with a no-op task must return
// when run on a pool created with pthreadpool_create(0). Skipped when that pool
// reports at most one thread.
TEST(Parallelize2DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_2d(
		pool,
		ComputeNothing2DTile2D,
		nullptr,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1614*b095b053SXin Li
CheckBounds2DTile2D(void *,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1615*b095b053SXin Li static void CheckBounds2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1616*b095b053SXin Li EXPECT_LT(start_i, kParallelize2DTile2DRangeI);
1617*b095b053SXin Li EXPECT_LT(start_j, kParallelize2DTile2DRangeJ);
1618*b095b053SXin Li EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI);
1619*b095b053SXin Li EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ);
1620*b095b053SXin Li }
1621*b095b053SXin Li
// Runs CheckBounds2DTile2D for every tile of the range on a pool created with
// pthreadpool_create(1); the task itself reports out-of-range tiles.
TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_2d_tile_2d(
		pool,
		CheckBounds2DTile2D,
		nullptr,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1634*b095b053SXin Li
// Runs CheckBounds2DTile2D for every tile of the range on a pool created with
// pthreadpool_create(0); the task itself reports out-of-range tiles.
// Skipped when that pool reports at most one thread.
TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_2d(
		pool,
		CheckBounds2DTile2D,
		nullptr,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1651*b095b053SXin Li
CheckTiling2DTile2D(void *,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1652*b095b053SXin Li static void CheckTiling2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1653*b095b053SXin Li EXPECT_GT(tile_i, 0);
1654*b095b053SXin Li EXPECT_LE(tile_i, kParallelize2DTile2DTileI);
1655*b095b053SXin Li EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0);
1656*b095b053SXin Li EXPECT_EQ(tile_i, std::min<size_t>(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i));
1657*b095b053SXin Li
1658*b095b053SXin Li EXPECT_GT(tile_j, 0);
1659*b095b053SXin Li EXPECT_LE(tile_j, kParallelize2DTile2DTileJ);
1660*b095b053SXin Li EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0);
1661*b095b053SXin Li EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j));
1662*b095b053SXin Li }
1663*b095b053SXin Li
// Runs CheckTiling2DTile2D for every tile of the range on a pool created with
// pthreadpool_create(1); the task itself reports badly shaped tiles.
TEST(Parallelize2DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_2d_tile_2d(
		pool,
		CheckTiling2DTile2D,
		nullptr,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1676*b095b053SXin Li
// Runs CheckTiling2DTile2D for every tile of the range on a pool created with
// pthreadpool_create(0); the task itself reports badly shaped tiles.
// Skipped when that pool reports at most one thread.
TEST(Parallelize2DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	if (pthreadpool_get_threads_count(pool) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_2d_tile_2d(
		pool,
		CheckTiling2DTile2D,
		nullptr,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);
}
1693*b095b053SXin Li
SetTrue2DTile2D(std::atomic_bool * processed_indicators,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1694*b095b053SXin Li static void SetTrue2DTile2D(std::atomic_bool* processed_indicators, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1695*b095b053SXin Li for (size_t i = start_i; i < start_i + tile_i; i++) {
1696*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
1697*b095b053SXin Li const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
1698*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
1699*b095b053SXin Li }
1700*b095b053SXin Li }
1701*b095b053SXin Li }
1702*b095b053SXin Li
// Runs SetTrue2DTile2D over the range on a pool created with
// pthreadpool_create(1) and expects the indicator of every (i, j) element to
// be set afterwards.
TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	const pthreadpool_task_2d_tile_2d_t task = reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(SetTrue2DTile2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_2d_tile_2d(
		pool, task, context,
		kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
		kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
		0 /* flags */);

	/* Indicators are laid out row-major */
	for (size_t i = 0; i < kParallelize2DTile2DRangeI; ++i) {
		for (size_t j = 0; j < kParallelize2DTile2DRangeJ; ++j) {
			const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
			EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
				<< "Element (" << i << ", " << j << ") not processed";
		}
	}
}
1725*b095b053SXin Li
/*
 * Runs SetTrue2DTile2D over the whole 2D range on a pool created with
 * pthreadpool_create(0), then expects the indicator of every item to be set.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsProcessed) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_bool> indicators(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(SetTrue2DTile2D),
    static_cast<void*>(indicators.data()),
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Walk the indicators in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
      << "Element (" << i << ", " << j << ") not processed";
  }
}
1752*b095b053SXin Li
Increment2DTile2D(std::atomic_int * processed_counters,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1753*b095b053SXin Li static void Increment2DTile2D(std::atomic_int* processed_counters, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1754*b095b053SXin Li for (size_t i = start_i; i < start_i + tile_i; i++) {
1755*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
1756*b095b053SXin Li const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
1757*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
1758*b095b053SXin Li }
1759*b095b053SXin Li }
1760*b095b053SXin Li }
1761*b095b053SXin Li
/*
 * Runs Increment2DTile2D once over the whole 2D range on a pool created with
 * pthreadpool_create(1), then expects every per-item counter to equal 1.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d_tile_2d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D),
    static_cast<void*>(counters.data()),
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Walk the counters in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
1785*b095b053SXin Li
/*
 * Runs Increment2DTile2D once over the whole 2D range on a pool created with
 * pthreadpool_create(0), then expects every per-item counter to equal 1.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D),
    static_cast<void*>(counters.data()),
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Walk the counters in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
1813*b095b053SXin Li
/*
 * Runs Increment2DTile2D over the whole 2D range kIncrementIterations times
 * on a pool created with pthreadpool_create(1), then expects every per-item
 * counter to equal kIncrementIterations.
 */
TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  /* Each pass is expected to add exactly 1 to every counter. */
  for (size_t pass = 0; pass < kIncrementIterations; pass++) {
    pthreadpool_parallelize_2d_tile_2d(
      threadpool.get(),
      reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D),
      static_cast<void*>(counters.data()),
      kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
      kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
      0 /* flags */);
  }

  /* Walk the counters in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times "
      << "(expected: " << kIncrementIterations << ")";
  }
}
1840*b095b053SXin Li
/*
 * Runs Increment2DTile2D over the whole 2D range kIncrementIterations times
 * on a pool created with pthreadpool_create(0), then expects every per-item
 * counter to equal kIncrementIterations. Skipped when the pool reports one
 * thread or fewer.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  /* Each pass is expected to add exactly 1 to every counter. */
  for (size_t pass = 0; pass < kIncrementIterations; pass++) {
    pthreadpool_parallelize_2d_tile_2d(
      threadpool.get(),
      reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D),
      static_cast<void*>(counters.data()),
      kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
      kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
      0 /* flags */);
  }

  /* Walk the counters in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times "
      << "(expected: " << kIncrementIterations << ")";
  }
}
1871*b095b053SXin Li
/*
 * Task for pthreadpool_parallelize_2d_tile_2d: adds 1 to a single shared
 * counter once per item of the tile, i.e. tile_i * tile_j separate atomic
 * increments. The increments are deliberately issued one at a time rather
 * than as one bulk addition.
 *
 * num_processed_items must point to a std::atomic_int. The tile origin
 * (start_i, start_j) does not influence the result.
 *
 * The context parameter is a void* (converted back inside) so that the
 * function has the void*-context task signature it is cast to at its call
 * sites (see pthreadpool.h); calling a function through a pointer to a
 * different function type is undefined behaviour.
 */
static void IncrementSame2DTile2D(void* num_processed_items, size_t /* start_i */, size_t /* start_j */, size_t tile_i, size_t tile_j) {
  std::atomic_int* const counter = static_cast<std::atomic_int*>(num_processed_items);
  for (size_t di = 0; di < tile_i; di++) {
    for (size_t dj = 0; dj < tile_j; dj++) {
      counter->fetch_add(1, std::memory_order_relaxed);
    }
  }
}
1879*b095b053SXin Li
/*
 * Runs IncrementSame2DTile2D over the whole 2D range on a pool created with
 * pthreadpool_create(0), so every item increments the same atomic counter,
 * then expects the counter to equal the number of items in the range.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolHighContention) {
  /* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
  std::atomic_int num_processed_items(0);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(IncrementSame2DTile2D),
    static_cast<void*>(&num_processed_items),
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Compare int with int to avoid a signed/unsigned comparison inside EXPECT_EQ. */
  const int expected_items = static_cast<int>(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), expected_items);
}
1899*b095b053SXin Li
WorkImbalance2DTile2D(std::atomic_int * num_processed_items,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)1900*b095b053SXin Li static void WorkImbalance2DTile2D(std::atomic_int* num_processed_items, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
1901*b095b053SXin Li num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed);
1902*b095b053SXin Li if (start_i == 0 && start_j == 0) {
1903*b095b053SXin Li /* Spin-wait until all items are computed */
1904*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) {
1905*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
1906*b095b053SXin Li }
1907*b095b053SXin Li }
1908*b095b053SXin Li }
1909*b095b053SXin Li
/*
 * Runs WorkImbalance2DTile2D over the whole 2D range on a pool created with
 * pthreadpool_create(0), then expects the shared counter to equal the number
 * of items in the range. The task for the tile at (0, 0) blocks until every
 * item has been counted, so the call can only finish if other threads process
 * the remaining tiles. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) {
  /* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
  std::atomic_int num_processed_items(0);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(WorkImbalance2DTile2D),
    static_cast<void*>(&num_processed_items),
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Compare int with int to avoid a signed/unsigned comparison inside EXPECT_EQ. */
  const int expected_items = static_cast<int>(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), expected_items);
}
1929*b095b053SXin Li
/*
 * No-op task for pthreadpool_parallelize_2d_tile_2d_with_uarch: accepts the
 * context, uarch index, tile origin and tile size, and ignores all of them.
 */
static void ComputeNothing2DTile2DWithUArch(
  void* /* context */,
  uint32_t /* uarch_index */,
  size_t /* start_i */,
  size_t /* start_j */,
  size_t /* tile_i */,
  size_t /* tile_j */)
{
  /* Intentionally empty. */
}
1932*b095b053SXin Li
/*
 * Runs the no-op task ComputeNothing2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(1). The test has no expectations beyond the call
 * returning.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
1945*b095b053SXin Li
/*
 * Runs the no-op task ComputeNothing2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(0). The test has no expectations beyond the call
 * returning. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
1963*b095b053SXin Li
CheckUArch2DTile2DWithUArch(void *,uint32_t uarch_index,size_t,size_t,size_t,size_t)1964*b095b053SXin Li static void CheckUArch2DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t) {
1965*b095b053SXin Li if (uarch_index != kDefaultUArchIndex) {
1966*b095b053SXin Li EXPECT_LE(uarch_index, kMaxUArchIndex);
1967*b095b053SXin Li }
1968*b095b053SXin Li }
1969*b095b053SXin Li
/*
 * Runs CheckUArch2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(1); the task itself checks each uarch index it is given.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUArchInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
1983*b095b053SXin Li
/*
 * Runs CheckUArch2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(0); the task itself checks each uarch index it is given.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUArchInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
2001*b095b053SXin Li
CheckBounds2DTile2DWithUArch(void *,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2002*b095b053SXin Li static void CheckBounds2DTile2DWithUArch(void*, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2003*b095b053SXin Li EXPECT_LT(start_i, kParallelize2DTile2DRangeI);
2004*b095b053SXin Li EXPECT_LT(start_j, kParallelize2DTile2DRangeJ);
2005*b095b053SXin Li EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI);
2006*b095b053SXin Li EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ);
2007*b095b053SXin Li }
2008*b095b053SXin Li
/*
 * Runs CheckBounds2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(1); the task itself checks each tile against the range.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
2022*b095b053SXin Li
/*
 * Runs CheckBounds2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(0); the task itself checks each tile against the range.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
2040*b095b053SXin Li
CheckTiling2DTile2DWithUArch(void *,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2041*b095b053SXin Li static void CheckTiling2DTile2DWithUArch(void*, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2042*b095b053SXin Li EXPECT_GT(tile_i, 0);
2043*b095b053SXin Li EXPECT_LE(tile_i, kParallelize2DTile2DTileI);
2044*b095b053SXin Li EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0);
2045*b095b053SXin Li EXPECT_EQ(tile_i, std::min<size_t>(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i));
2046*b095b053SXin Li
2047*b095b053SXin Li EXPECT_GT(tile_j, 0);
2048*b095b053SXin Li EXPECT_LE(tile_j, kParallelize2DTile2DTileJ);
2049*b095b053SXin Li EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0);
2050*b095b053SXin Li EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j));
2051*b095b053SXin Li }
2052*b095b053SXin Li
/*
 * Runs CheckTiling2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(1); the task itself checks the shape of each tile.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
2066*b095b053SXin Li
/*
 * Runs CheckTiling2DTile2DWithUArch through
 * pthreadpool_parallelize_2d_tile_2d_with_uarch on a pool created with
 * pthreadpool_create(0); the task itself checks the shape of each tile.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr /* context */,
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
}
2084*b095b053SXin Li
SetTrue2DTile2DWithUArch(std::atomic_bool * processed_indicators,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2085*b095b053SXin Li static void SetTrue2DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2086*b095b053SXin Li for (size_t i = start_i; i < start_i + tile_i; i++) {
2087*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
2088*b095b053SXin Li const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
2089*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
2090*b095b053SXin Li }
2091*b095b053SXin Li }
2092*b095b053SXin Li }
2093*b095b053SXin Li
/*
 * Runs SetTrue2DTile2DWithUArch over the whole 2D range on a pool created
 * with pthreadpool_create(1), then expects the indicator of every item to be
 * set.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_bool> indicators(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(SetTrue2DTile2DWithUArch),
    static_cast<void*>(indicators.data()),
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Walk the indicators in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
      << "Element (" << i << ", " << j << ") not processed";
  }
}
2117*b095b053SXin Li
/*
 * Runs SetTrue2DTile2DWithUArch over the whole 2D range on a pool created
 * with pthreadpool_create(0), then expects the indicator of every item to be
 * set. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_bool> indicators(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(SetTrue2DTile2DWithUArch),
    static_cast<void*>(indicators.data()),
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Walk the indicators in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
      << "Element (" << i << ", " << j << ") not processed";
  }
}
2145*b095b053SXin Li
Increment2DTile2DWithUArch(std::atomic_int * processed_counters,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2146*b095b053SXin Li static void Increment2DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2147*b095b053SXin Li for (size_t i = start_i; i < start_i + tile_i; i++) {
2148*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
2149*b095b053SXin Li const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
2150*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
2151*b095b053SXin Li }
2152*b095b053SXin Li }
2153*b095b053SXin Li }
2154*b095b053SXin Li
/*
 * Runs Increment2DTile2DWithUArch once over the whole 2D range on a pool
 * created with pthreadpool_create(1), then expects every per-item counter to
 * equal 1.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch),
    static_cast<void*>(counters.data()),
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Walk the counters in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
2179*b095b053SXin Li
/*
 * Runs Increment2DTile2DWithUArch once over the whole 2D range on a pool
 * created with pthreadpool_create(0), then expects every per-item counter to
 * equal 1. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
  if (threads_count < 2) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch),
    static_cast<void*>(counters.data()),
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);

  /* Walk the counters in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
  }
}
2208*b095b053SXin Li
/*
 * Runs Increment2DTile2DWithUArch over the whole 2D range
 * kIncrementIterations times on a pool created with pthreadpool_create(1),
 * then expects every per-item counter to equal kIncrementIterations.
 */
TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
  const size_t item_count = kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  /* Each pass is expected to add exactly 1 to every counter. */
  for (size_t pass = 0; pass < kIncrementIterations; pass++) {
    pthreadpool_parallelize_2d_tile_2d_with_uarch(
      threadpool.get(),
      reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch),
      static_cast<void*>(counters.data()),
      kDefaultUArchIndex, kMaxUArchIndex,
      kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
      kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
      0 /* flags */);
  }

  /* Walk the counters in storage order; (i, j) is recovered for the message. */
  for (size_t linear_idx = 0; linear_idx < item_count; linear_idx++) {
    const size_t i = linear_idx / kParallelize2DTile2DRangeJ;
    const size_t j = linear_idx % kParallelize2DTile2DRangeJ;
    EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
      << "Element (" << i << ", " << j << ") was processed "
      << counters[linear_idx].load(std::memory_order_relaxed) << " times "
      << "(expected: " << kIncrementIterations << ")";
  }
}
2236*b095b053SXin Li
/*
 * Same check as the single-thread variant, but on a pool created with a thread count
 * argument of 0. The test is skipped when that pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
  /* One counter per (i, j) element, addressed in row-major order */
  std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
    pthreadpool_parallelize_2d_tile_2d_with_uarch(
      threadpool.get(),
      reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(Increment2DTile2DWithUArch),
      static_cast<void*>(counters.data()),
      kDefaultUArchIndex, kMaxUArchIndex,
      kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
      kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
      0 /* flags */);
  }

  for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) {
    for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) {
      /* Read the counter once so the check and the failure message report the same value */
      const int times_processed =
        counters[i * kParallelize2DTile2DRangeJ + j].load(std::memory_order_relaxed);
      EXPECT_EQ(times_processed, kIncrementIterations)
        << "Element (" << i << ", " << j << ") was processed "
        << times_processed << " times "
        << "(expected: " << kIncrementIterations << ")";
    }
  }
}
2268*b095b053SXin Li
/*
 * Task for the 2D/2D-tiled uarch-aware API: adds 1 to the single shared counter once per
 * element of the tile [start_i, start_i + tile_i) x [start_j, start_j + tile_j).
 * The uarch index argument is ignored. One fetch_add is issued per element (rather than
 * a single add of tile_i * tile_j) so that every element touches the same atomic.
 */
static void IncrementSame2DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
  const size_t end_i = start_i + tile_i;
  const size_t end_j = start_j + tile_j;
  for (size_t i = start_i; i < end_i; i++) {
    for (size_t j = start_j; j < end_j; j++) {
      num_processed_items->fetch_add(1, std::memory_order_relaxed);
    }
  }
}
2276*b095b053SXin Li
/*
 * Has every element of the 2D range increment one shared atomic counter and checks that
 * the final value equals the number of elements. Uses a pool created with a thread count
 * argument of 0 and is skipped when that pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolHighContention) {
  /* Brace-initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20 */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(IncrementSame2DTile2DWithUArch),
    static_cast<void*>(&num_processed_items),
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
}
2297*b095b053SXin Li
WorkImbalance2DTile2DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t start_i,size_t start_j,size_t tile_i,size_t tile_j)2298*b095b053SXin Li static void WorkImbalance2DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) {
2299*b095b053SXin Li num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed);
2300*b095b053SXin Li if (start_i == 0 && start_j == 0) {
2301*b095b053SXin Li /* Spin-wait until all items are computed */
2302*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) {
2303*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
2304*b095b053SXin Li }
2305*b095b053SXin Li }
2306*b095b053SXin Li }
2307*b095b053SXin Li
/*
 * Runs WorkImbalance2DTile2DWithUArch over the 2D range and checks that the shared
 * counter ends up equal to the number of elements. Because the (0, 0) tile blocks until
 * everything is counted, the call can only return if the remaining tiles are executed by
 * other threads. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolWorkStealing) {
  /* Brace-initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20 */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_2d_tile_2d_with_uarch(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_2d_tile_2d_with_id_t>(WorkImbalance2DTile2DWithUArch),
    static_cast<void*>(&num_processed_items),
    kDefaultUArchIndex, kMaxUArchIndex,
    kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
    kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ);
}
2328*b095b053SXin Li
/* No-op task for pthreadpool_parallelize_3d; ignores the context and all three indices. */
static void ComputeNothing3D(void*, size_t, size_t, size_t) {
}
2331*b095b053SXin Li
/*
 * Checks that a 3D parallelization of a no-op task returns when run on a pool created
 * with a thread count of 1.
 */
TEST(Parallelize3D, SingleThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d(
    threadpool.get(),
    ComputeNothing3D,
    nullptr,
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);
}
2342*b095b053SXin Li
/*
 * Checks that a 3D parallelization of a no-op task returns when run on a pool created
 * with a thread count argument of 0. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3D, MultiThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d(
    threadpool.get(),
    ComputeNothing3D,
    nullptr,
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);
}
2358*b095b053SXin Li
CheckBounds3D(void *,size_t i,size_t j,size_t k)2359*b095b053SXin Li static void CheckBounds3D(void*, size_t i, size_t j, size_t k) {
2360*b095b053SXin Li EXPECT_LT(i, kParallelize3DRangeI);
2361*b095b053SXin Li EXPECT_LT(j, kParallelize3DRangeJ);
2362*b095b053SXin Li EXPECT_LT(k, kParallelize3DRangeK);
2363*b095b053SXin Li }
2364*b095b053SXin Li
/*
 * Runs CheckBounds3D over the 3D range on a pool created with a thread count of 1, so
 * any out-of-range index handed to the task is reported as a test failure.
 */
TEST(Parallelize3D, SingleThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d(
    threadpool.get(),
    CheckBounds3D,
    nullptr,
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);
}
2376*b095b053SXin Li
/*
 * Runs CheckBounds3D over the 3D range on a pool created with a thread count argument
 * of 0. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3D, MultiThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d(
    threadpool.get(),
    CheckBounds3D,
    nullptr,
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);
}
2392*b095b053SXin Li
SetTrue3D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k)2393*b095b053SXin Li static void SetTrue3D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k) {
2394*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
2395*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
2396*b095b053SXin Li }
2397*b095b053SXin Li
/*
 * Runs SetTrue3D over the 3D range on a pool created with a thread count of 1 and checks
 * that the indicator of every (i, j, k) element was set.
 */
TEST(Parallelize3D, SingleThreadPoolAllItemsProcessed) {
  /* One flag per (i, j, k) element, addressed in row-major order */
  std::vector<std::atomic_bool> indicators(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_3d_t>(SetTrue3D),
    static_cast<void*>(indicators.data()),
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);

  for (size_t i = 0; i < kParallelize3DRangeI; i++) {
    for (size_t j = 0; j < kParallelize3DRangeJ; j++) {
      const size_t row = i * kParallelize3DRangeJ + j;
      for (size_t k = 0; k < kParallelize3DRangeK; k++) {
        EXPECT_TRUE(indicators[row * kParallelize3DRangeK + k].load(std::memory_order_relaxed))
          << "Element (" << i << ", " << j << ", " << k << ") not processed";
      }
    }
  }
}
2421*b095b053SXin Li
/*
 * Runs SetTrue3D over the 3D range on a pool created with a thread count argument of 0
 * and checks that the indicator of every (i, j, k) element was set. Skipped when the
 * pool reports at most one thread.
 */
TEST(Parallelize3D, MultiThreadPoolAllItemsProcessed) {
  /* One flag per (i, j, k) element, addressed in row-major order */
  std::vector<std::atomic_bool> indicators(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_3d_t>(SetTrue3D),
    static_cast<void*>(indicators.data()),
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);

  for (size_t i = 0; i < kParallelize3DRangeI; i++) {
    for (size_t j = 0; j < kParallelize3DRangeJ; j++) {
      const size_t row = i * kParallelize3DRangeJ + j;
      for (size_t k = 0; k < kParallelize3DRangeK; k++) {
        EXPECT_TRUE(indicators[row * kParallelize3DRangeK + k].load(std::memory_order_relaxed))
          << "Element (" << i << ", " << j << ", " << k << ") not processed";
      }
    }
  }
}
2449*b095b053SXin Li
Increment3D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k)2450*b095b053SXin Li static void Increment3D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k) {
2451*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k;
2452*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
2453*b095b053SXin Li }
2454*b095b053SXin Li
/*
 * Runs Increment3D once over the 3D range on a pool created with a thread count of 1 and
 * checks that every (i, j, k) counter equals exactly 1.
 */
TEST(Parallelize3D, SingleThreadPoolEachItemProcessedOnce) {
  /* One counter per (i, j, k) element, addressed in row-major order */
  std::vector<std::atomic_int> counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_3d_t>(Increment3D),
    static_cast<void*>(counters.data()),
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);

  for (size_t i = 0; i < kParallelize3DRangeI; i++) {
    for (size_t j = 0; j < kParallelize3DRangeJ; j++) {
      const size_t row = i * kParallelize3DRangeJ + j;
      for (size_t k = 0; k < kParallelize3DRangeK; k++) {
        /* Read the counter once so the check and the failure message report the same value */
        const int times_processed =
          counters[row * kParallelize3DRangeK + k].load(std::memory_order_relaxed);
        EXPECT_EQ(times_processed, 1)
          << "Element (" << i << ", " << j << ", " << k << ") was processed "
          << times_processed << " times (expected: 1)";
      }
    }
  }
}
2479*b095b053SXin Li
/*
 * Runs Increment3D once over the 3D range on a pool created with a thread count argument
 * of 0 and checks that every (i, j, k) counter equals exactly 1. Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize3D, MultiThreadPoolEachItemProcessedOnce) {
  /* One counter per (i, j, k) element, addressed in row-major order */
  std::vector<std::atomic_int> counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_3d_t>(Increment3D),
    static_cast<void*>(counters.data()),
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);

  for (size_t i = 0; i < kParallelize3DRangeI; i++) {
    for (size_t j = 0; j < kParallelize3DRangeJ; j++) {
      const size_t row = i * kParallelize3DRangeJ + j;
      for (size_t k = 0; k < kParallelize3DRangeK; k++) {
        /* Read the counter once so the check and the failure message report the same value */
        const int times_processed =
          counters[row * kParallelize3DRangeK + k].load(std::memory_order_relaxed);
        EXPECT_EQ(times_processed, 1)
          << "Element (" << i << ", " << j << ", " << k << ") was processed "
          << times_processed << " times (expected: 1)";
      }
    }
  }
}
2508*b095b053SXin Li
/*
 * Runs Increment3D over the 3D range kIncrementIterations times on a pool created with a
 * thread count of 1 and checks that every (i, j, k) counter equals kIncrementIterations.
 */
TEST(Parallelize3D, SingleThreadPoolEachItemProcessedMultipleTimes) {
  /* One counter per (i, j, k) element, addressed in row-major order */
  std::vector<std::atomic_int> counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
    pthreadpool_parallelize_3d(
      threadpool.get(),
      reinterpret_cast<pthreadpool_task_3d_t>(Increment3D),
      static_cast<void*>(counters.data()),
      kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
      0 /* flags */);
  }

  for (size_t i = 0; i < kParallelize3DRangeI; i++) {
    for (size_t j = 0; j < kParallelize3DRangeJ; j++) {
      const size_t row = i * kParallelize3DRangeJ + j;
      for (size_t k = 0; k < kParallelize3DRangeK; k++) {
        /* Read the counter once so the check and the failure message report the same value */
        const int times_processed =
          counters[row * kParallelize3DRangeK + k].load(std::memory_order_relaxed);
        EXPECT_EQ(times_processed, kIncrementIterations)
          << "Element (" << i << ", " << j << ", " << k << ") was processed "
          << times_processed << " times "
          << "(expected: " << kIncrementIterations << ")";
      }
    }
  }
}
2536*b095b053SXin Li
/*
 * Runs Increment3D over the 3D range kIncrementIterations times on a pool created with a
 * thread count argument of 0 and checks that every (i, j, k) counter equals
 * kIncrementIterations. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3D, MultiThreadPoolEachItemProcessedMultipleTimes) {
  /* One counter per (i, j, k) element, addressed in row-major order */
  std::vector<std::atomic_int> counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) {
    pthreadpool_parallelize_3d(
      threadpool.get(),
      reinterpret_cast<pthreadpool_task_3d_t>(Increment3D),
      static_cast<void*>(counters.data()),
      kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
      0 /* flags */);
  }

  for (size_t i = 0; i < kParallelize3DRangeI; i++) {
    for (size_t j = 0; j < kParallelize3DRangeJ; j++) {
      const size_t row = i * kParallelize3DRangeJ + j;
      for (size_t k = 0; k < kParallelize3DRangeK; k++) {
        /* Read the counter once so the check and the failure message report the same value */
        const int times_processed =
          counters[row * kParallelize3DRangeK + k].load(std::memory_order_relaxed);
        EXPECT_EQ(times_processed, kIncrementIterations)
          << "Element (" << i << ", " << j << ", " << k << ") was processed "
          << times_processed << " times "
          << "(expected: " << kIncrementIterations << ")";
      }
    }
  }
}
2568*b095b053SXin Li
/*
 * Task for pthreadpool_parallelize_3d: adds 1 to the single shared counter regardless of
 * which (i, j, k) element it was invoked for. The indices are intentionally unnamed
 * because they are not used.
 */
static void IncrementSame3D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */, size_t /* k */) {
  num_processed_items->fetch_add(1, std::memory_order_relaxed);
}
2572*b095b053SXin Li
/*
 * Has every element of the 3D range increment one shared atomic counter and checks that
 * the final value equals the number of elements. Uses a pool created with a thread count
 * argument of 0 and is skipped when that pool reports at most one thread.
 */
TEST(Parallelize3D, MultiThreadPoolHighContention) {
  /* Brace-initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20 */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_3d_t>(IncrementSame3D),
    static_cast<void*>(&num_processed_items),
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);
}
2591*b095b053SXin Li
WorkImbalance3D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k)2592*b095b053SXin Li static void WorkImbalance3D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k) {
2593*b095b053SXin Li num_processed_items->fetch_add(1, std::memory_order_relaxed);
2594*b095b053SXin Li if (i == 0 && j == 0 && k == 0) {
2595*b095b053SXin Li /* Spin-wait until all items are computed */
2596*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK) {
2597*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
2598*b095b053SXin Li }
2599*b095b053SXin Li }
2600*b095b053SXin Li }
2601*b095b053SXin Li
/*
 * Runs WorkImbalance3D over the 3D range and checks that the shared counter ends up
 * equal to the number of elements. Because the (0, 0, 0) element blocks until everything
 * is counted, the call can only return if the remaining elements are executed by other
 * threads. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3D, MultiThreadPoolWorkStealing) {
  /* Brace-initialization instead of ATOMIC_VAR_INIT, which is deprecated since C++20 */
  std::atomic_int num_processed_items{0};

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_3d_t>(WorkImbalance3D),
    static_cast<void*>(&num_processed_items),
    kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK);
}
2620*b095b053SXin Li
/*
 * No-op task for pthreadpool_parallelize_3d_tile_1d; ignores the context, both indices,
 * the tile start and the tile size.
 */
static void ComputeNothing3DTile1D(void*, size_t, size_t, size_t, size_t) {
}
2623*b095b053SXin Li
/*
 * Checks that a 3D parallelization with 1D tiling of a no-op task returns when run on a
 * pool created with a thread count of 1.
 */
TEST(Parallelize3DTile1D, SingleThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d_tile_1d(
    threadpool.get(),
    ComputeNothing3DTile1D,
    nullptr,
    kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
    kParallelize3DTile1DTileK,
    0 /* flags */);
}
2635*b095b053SXin Li
/*
 * Checks that a 3D parallelization with 1D tiling of a no-op task returns when run on a
 * pool created with a thread count argument of 0. Skipped when the pool reports at most
 * one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d_tile_1d(
    threadpool.get(),
    ComputeNothing3DTile1D,
    nullptr,
    kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
    kParallelize3DTile1DTileK,
    0 /* flags */);
}
2652*b095b053SXin Li
CheckBounds3DTile1D(void *,size_t i,size_t j,size_t start_k,size_t tile_k)2653*b095b053SXin Li static void CheckBounds3DTile1D(void*, size_t i, size_t j, size_t start_k, size_t tile_k) {
2654*b095b053SXin Li EXPECT_LT(i, kParallelize3DTile1DRangeI);
2655*b095b053SXin Li EXPECT_LT(j, kParallelize3DTile1DRangeJ);
2656*b095b053SXin Li EXPECT_LT(start_k, kParallelize3DTile1DRangeK);
2657*b095b053SXin Li EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK);
2658*b095b053SXin Li }
2659*b095b053SXin Li
/*
 * Runs CheckBounds3DTile1D over the tiled 3D range on a pool created with a thread count
 * of 1, so any out-of-range index or tile handed to the task is reported as a failure.
 */
TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d_tile_1d(
    threadpool.get(),
    CheckBounds3DTile1D,
    nullptr,
    kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
    kParallelize3DTile1DTileK,
    0 /* flags */);
}
2672*b095b053SXin Li
/*
 * Runs CheckBounds3DTile1D over the tiled 3D range on a pool created with a thread count
 * argument of 0. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d_tile_1d(
    threadpool.get(),
    CheckBounds3DTile1D,
    nullptr,
    kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
    kParallelize3DTile1DTileK,
    0 /* flags */);
}
2689*b095b053SXin Li
CheckTiling3DTile1D(void *,size_t i,size_t j,size_t start_k,size_t tile_k)2690*b095b053SXin Li static void CheckTiling3DTile1D(void*, size_t i, size_t j, size_t start_k, size_t tile_k) {
2691*b095b053SXin Li EXPECT_GT(tile_k, 0);
2692*b095b053SXin Li EXPECT_LE(tile_k, kParallelize3DTile1DTileK);
2693*b095b053SXin Li EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0);
2694*b095b053SXin Li EXPECT_EQ(tile_k, std::min<size_t>(kParallelize3DTile1DTileK, kParallelize3DTile1DRangeK - start_k));
2695*b095b053SXin Li }
2696*b095b053SXin Li
/*
 * Runs CheckTiling3DTile1D over the tiled 3D range on a pool created with a thread count
 * of 1, so any tile with an unexpected start or size is reported as a failure.
 */
TEST(Parallelize3DTile1D, SingleThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d_tile_1d(
    threadpool.get(),
    CheckTiling3DTile1D,
    nullptr,
    kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
    kParallelize3DTile1DTileK,
    0 /* flags */);
}
2709*b095b053SXin Li
/*
 * Runs CheckTiling3DTile1D over the tiled 3D range on a pool created with a thread count
 * argument of 0. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_3d_tile_1d(
    threadpool.get(),
    CheckTiling3DTile1D,
    nullptr,
    kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
    kParallelize3DTile1DTileK,
    0 /* flags */);
}
2726*b095b053SXin Li
SetTrue3DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t start_k,size_t tile_k)2727*b095b053SXin Li static void SetTrue3DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t start_k, size_t tile_k) {
2728*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
2729*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
2730*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
2731*b095b053SXin Li }
2732*b095b053SXin Li }
2733*b095b053SXin Li
/*
 * Runs SetTrue3DTile1D over the tiled 3D range on a pool created with a thread count of
 * 1 and checks that the indicator of every (i, j, k) element was set.
 */
TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsProcessed) {
  /* One flag per (i, j, k) element, addressed in row-major order */
  std::vector<std::atomic_bool> indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  pthreadpool_parallelize_3d_tile_1d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(SetTrue3DTile1D),
    static_cast<void*>(indicators.data()),
    kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
    kParallelize3DTile1DTileK,
    0 /* flags */);

  for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) {
    for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) {
      const size_t row = i * kParallelize3DTile1DRangeJ + j;
      for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) {
        EXPECT_TRUE(indicators[row * kParallelize3DTile1DRangeK + k].load(std::memory_order_relaxed))
          << "Element (" << i << ", " << j << ", " << k << ") not processed";
      }
    }
  }
}
2758*b095b053SXin Li
/*
 * Checks that a pool created with pthreadpool_create(0) sets the flag of every element
 * in the 3D range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_1d_t task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(SetTrue3DTile1D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), task, context,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++, linear_idx++) {
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
2787*b095b053SXin Li
Increment3DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t start_k,size_t tile_k)2788*b095b053SXin Li static void Increment3DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t start_k, size_t tile_k) {
2789*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
2790*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k;
2791*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
2792*b095b053SXin Li }
2793*b095b053SXin Li }
2794*b095b053SXin Li
/* Checks that a pool created with pthreadpool_create(1) increments each element's counter exactly once. */
TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_1d_t task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), task, context,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
2820*b095b053SXin Li
/*
 * Checks that a pool created with pthreadpool_create(0) increments each element's
 * counter exactly once. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_1d_t task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(), task, context,
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
2850*b095b053SXin Li
/*
 * Runs the increment task kIncrementIterations times on a pool created with
 * pthreadpool_create(1) and checks every counter equals kIncrementIterations.
 */
TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_1d_t task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_3d_tile_1d(
			threadpool.get(), task, context,
			kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
			kParallelize3DTile1DTileK,
			0 /* flags */);
	}

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
2879*b095b053SXin Li
/*
 * Runs the increment task kIncrementIterations times on a pool created with
 * pthreadpool_create(0) and checks every counter equals kIncrementIterations.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_1d_t task = reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(Increment3DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_3d_tile_1d(
			threadpool.get(), task, context,
			kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
			kParallelize3DTile1DTileK,
			0 /* flags */);
	}

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
2912*b095b053SXin Li
/*
 * Task callback for pthreadpool_parallelize_3d_tile_1d: adds one to a single shared
 * counter for each of the tile_k elements in the tile, one fetch_add per element.
 *
 * The context is taken as void* (and converted back here) so that the function is
 * invoked through a pointer of its own type. Declaring the first parameter as
 * std::atomic_int* and reinterpret_cast-ing the function pointer makes the call
 * undefined behavior, and is rejected by indirect-call sanitizers.
 *
 * num_processed_items points to the shared std::atomic_int counter; the i and j
 * coordinates do not affect the result.
 */
static void IncrementSame3DTile1D(void* num_processed_items, size_t i, size_t j, size_t start_k, size_t tile_k) {
	std::atomic_int* counter = static_cast<std::atomic_int*>(num_processed_items);
	/* Only the tile extent matters here. */
	(void) i;
	(void) j;
	for (size_t k = start_k; k < start_k + tile_k; k++) {
		counter->fetch_add(1, std::memory_order_relaxed);
	}
}
2918*b095b053SXin Li
/*
 * Has every task invocation increment one shared counter and checks that the final
 * count equals the number of elements in the 3D range. Skipped when the pool
 * created with pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolHighContention) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(IncrementSame3DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a signed/unsigned comparison. */
	const size_t expected_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
2938*b095b053SXin Li
WorkImbalance3DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t start_k,size_t tile_k)2939*b095b053SXin Li static void WorkImbalance3DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t tile_k) {
2940*b095b053SXin Li num_processed_items->fetch_add(tile_k, std::memory_order_relaxed);
2941*b095b053SXin Li if (i == 0 && j == 0 && start_k == 0) {
2942*b095b053SXin Li /* Spin-wait until all items are computed */
2943*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK) {
2944*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
2945*b095b053SXin Li }
2946*b095b053SXin Li }
2947*b095b053SXin Li }
2948*b095b053SXin Li
/*
 * Runs WorkImbalance3DTile1D, whose first tile blocks until all elements are counted,
 * and checks that the final count equals the number of elements in the 3D range.
 * Skipped when the pool created with pthreadpool_create(0) reports at most one thread.
 */
TEST(Parallelize3DTile1D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_1d_t>(WorkImbalance3DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK,
		kParallelize3DTile1DTileK,
		0 /* flags */);

	/* Compare as size_t on both sides to avoid a signed/unsigned comparison. */
	const size_t expected_items = kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
2968*b095b053SXin Li
/* No-op task callback for pthreadpool_parallelize_3d_tile_2d; all arguments are ignored. */
static void ComputeNothing3DTile2D(void*, size_t, size_t, size_t, size_t, size_t) {}
2971*b095b053SXin Li
/* Runs a no-op task over the 3D range on a pool created with pthreadpool_create(1); passes if the call returns. */
TEST(Parallelize3DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_3d_tile_2d(
		pool,
		ComputeNothing3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
2983*b095b053SXin Li
/*
 * Runs a no-op task over the 3D range on a pool created with pthreadpool_create(0);
 * passes if the call returns. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		pool,
		ComputeNothing3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3000*b095b053SXin Li
CheckBounds3DTile2D(void *,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3001*b095b053SXin Li static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3002*b095b053SXin Li EXPECT_LT(i, kParallelize3DTile2DRangeI);
3003*b095b053SXin Li EXPECT_LT(start_j, kParallelize3DTile2DRangeJ);
3004*b095b053SXin Li EXPECT_LT(start_k, kParallelize3DTile2DRangeK);
3005*b095b053SXin Li EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ);
3006*b095b053SXin Li EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK);
3007*b095b053SXin Li }
3008*b095b053SXin Li
/* Runs the bounds-checking task over the 3D range on a pool created with pthreadpool_create(1). */
TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_3d_tile_2d(
		pool,
		CheckBounds3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3021*b095b053SXin Li
/*
 * Runs the bounds-checking task over the 3D range on a pool created with
 * pthreadpool_create(0). Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		pool,
		CheckBounds3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3038*b095b053SXin Li
CheckTiling3DTile2D(void *,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3039*b095b053SXin Li static void CheckTiling3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3040*b095b053SXin Li EXPECT_GT(tile_j, 0);
3041*b095b053SXin Li EXPECT_LE(tile_j, kParallelize3DTile2DTileJ);
3042*b095b053SXin Li EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0);
3043*b095b053SXin Li EXPECT_EQ(tile_j, std::min<size_t>(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j));
3044*b095b053SXin Li
3045*b095b053SXin Li EXPECT_GT(tile_k, 0);
3046*b095b053SXin Li EXPECT_LE(tile_k, kParallelize3DTile2DTileK);
3047*b095b053SXin Li EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0);
3048*b095b053SXin Li EXPECT_EQ(tile_k, std::min<size_t>(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k));
3049*b095b053SXin Li }
3050*b095b053SXin Li
/* Runs the tiling-checking task over the 3D range on a pool created with pthreadpool_create(1). */
TEST(Parallelize3DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	pthreadpool_parallelize_3d_tile_2d(
		pool,
		CheckTiling3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3063*b095b053SXin Li
/*
 * Runs the tiling-checking task over the 3D range on a pool created with
 * pthreadpool_create(0). Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	pthreadpool* const pool = threadpool.get();
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		pool,
		CheckTiling3DTile2D,
		nullptr,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3080*b095b053SXin Li
SetTrue3DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3081*b095b053SXin Li static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3082*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
3083*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
3084*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3085*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
3086*b095b053SXin Li }
3087*b095b053SXin Li }
3088*b095b053SXin Li }
3089*b095b053SXin Li
/* Checks that a pool created with pthreadpool_create(1) sets the flag of every element in the 3D range. */
TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(SetTrue3DTile2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, context,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
3114*b095b053SXin Li
/*
 * Checks that a pool created with pthreadpool_create(0) sets the flag of every element
 * in the 3D range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(SetTrue3DTile2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, context,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
					<< "Element (" << i << ", " << j << ", " << k << ") not processed";
			}
		}
	}
}
3143*b095b053SXin Li
Increment3DTile2D(std::atomic_int * processed_counters,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3144*b095b053SXin Li static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3145*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
3146*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
3147*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3148*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
3149*b095b053SXin Li }
3150*b095b053SXin Li }
3151*b095b053SXin Li }
3152*b095b053SXin Li
/* Checks that a pool created with pthreadpool_create(1) increments each element's counter exactly once. */
TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, context,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
3178*b095b053SXin Li
/*
 * Checks that a pool created with pthreadpool_create(0) increments each element's
 * counter exactly once. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(), task, context,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
			}
		}
	}
}
3208*b095b053SXin Li
/*
 * Runs the increment task kIncrementIterations times on a pool created with
 * pthreadpool_create(1) and checks every counter equals kIncrementIterations.
 */
TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_3d_tile_2d(
			threadpool.get(), task, context,
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
	}

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
3237*b095b053SXin Li
/*
 * Runs the increment task kIncrementIterations times on a pool created with
 * pthreadpool_create(0) and checks every counter equals kIncrementIterations.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per (i, j, k) element. */
	const size_t num_items = kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_3d_tile_2d_t task = reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_3d_tile_2d(
			threadpool.get(), task, context,
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
	}

	/* Row-major walk: linear_idx always equals (i * RangeJ + j) * RangeK + k. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++, linear_idx++) {
				EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
					<< "Element (" << i << ", " << j << ", " << k << ") was processed "
					<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
					<< "(expected: " << kIncrementIterations << ")";
			}
		}
	}
}
3270*b095b053SXin Li
/*
 * Task that bumps one shared counter once for every (j, k) element of the
 * tile it is given. The i index is not used.
 */
static void IncrementSame3DTile2D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
	const size_t end_j = start_j + tile_j;
	const size_t end_k = start_k + tile_k;
	for (size_t j = start_j; j < end_j; ++j) {
		for (size_t k = start_k; k < end_k; ++k) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
3278*b095b053SXin Li
/*
 * Runs IncrementSame3DTile2D, which funnels every element into one shared
 * counter, over the whole 3D range on a pool created with
 * pthreadpool_create(0). Expects the counter to end up equal to the number
 * of elements in the range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolHighContention) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(IncrementSame3DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);
}
3298*b095b053SXin Li
WorkImbalance3DTile2D(std::atomic_int * num_processed_items,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3299*b095b053SXin Li static void WorkImbalance3DTile2D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3300*b095b053SXin Li num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed);
3301*b095b053SXin Li if (i == 0 && start_j == 0 && start_k == 0) {
3302*b095b053SXin Li /* Spin-wait until all items are computed */
3303*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK) {
3304*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
3305*b095b053SXin Li }
3306*b095b053SXin Li }
3307*b095b053SXin Li }
3308*b095b053SXin Li
/*
 * Runs WorkImbalance3DTile2D over the whole 3D range on a pool created with
 * pthreadpool_create(0). The task for the first tile blocks until every
 * element has been counted, so the call can only return once the remaining
 * tiles have been processed (presumably by the other threads picking up the
 * blocked thread's share). Expects the final count to equal the number of
 * elements. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(WorkImbalance3DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);
}
3328*b095b053SXin Li
/* No-op task for the 3D-tile-2D-with-uarch tests; every argument is ignored. */
static void ComputeNothing3DTile2DWithUArch(
	void* /* context */,
	uint32_t /* uarch_index */,
	size_t /* i */,
	size_t /* start_j */,
	size_t /* start_k */,
	size_t /* tile_j */,
	size_t /* tile_k */)
{
}
3331*b095b053SXin Li
/*
 * Checks that a no-op task run over the whole 3D range returns when the
 * pool is created with pthreadpool_create(1).
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		ComputeNothing3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3344*b095b053SXin Li
/*
 * Checks that a no-op task run over the whole 3D range returns when the
 * pool is created with pthreadpool_create(0).
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		ComputeNothing3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3362*b095b053SXin Li
CheckUArch3DTile2DWithUArch(void *,uint32_t uarch_index,size_t,size_t,size_t,size_t,size_t)3363*b095b053SXin Li static void CheckUArch3DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t, size_t) {
3364*b095b053SXin Li if (uarch_index != kDefaultUArchIndex) {
3365*b095b053SXin Li EXPECT_LE(uarch_index, kMaxUArchIndex);
3366*b095b053SXin Li }
3367*b095b053SXin Li }
3368*b095b053SXin Li
/*
 * Runs CheckUArch3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(1); the task itself asserts on the uarch index.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		CheckUArch3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3382*b095b053SXin Li
/*
 * Runs CheckUArch3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(0); the task itself asserts on the uarch index.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		CheckUArch3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3400*b095b053SXin Li
CheckBounds3DTile2DWithUArch(void *,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3401*b095b053SXin Li static void CheckBounds3DTile2DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3402*b095b053SXin Li EXPECT_LT(i, kParallelize3DTile2DRangeI);
3403*b095b053SXin Li EXPECT_LT(start_j, kParallelize3DTile2DRangeJ);
3404*b095b053SXin Li EXPECT_LT(start_k, kParallelize3DTile2DRangeK);
3405*b095b053SXin Li EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ);
3406*b095b053SXin Li EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK);
3407*b095b053SXin Li }
3408*b095b053SXin Li
/*
 * Runs CheckBounds3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(1); the task itself asserts on every tile it sees.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		CheckBounds3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3422*b095b053SXin Li
/*
 * Runs CheckBounds3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(0); the task itself asserts on every tile it sees.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		CheckBounds3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3440*b095b053SXin Li
CheckTiling3DTile2DWithUArch(void *,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3441*b095b053SXin Li static void CheckTiling3DTile2DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3442*b095b053SXin Li EXPECT_GT(tile_j, 0);
3443*b095b053SXin Li EXPECT_LE(tile_j, kParallelize3DTile2DTileJ);
3444*b095b053SXin Li EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0);
3445*b095b053SXin Li EXPECT_EQ(tile_j, std::min<size_t>(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j));
3446*b095b053SXin Li
3447*b095b053SXin Li EXPECT_GT(tile_k, 0);
3448*b095b053SXin Li EXPECT_LE(tile_k, kParallelize3DTile2DTileK);
3449*b095b053SXin Li EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0);
3450*b095b053SXin Li EXPECT_EQ(tile_k, std::min<size_t>(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k));
3451*b095b053SXin Li }
3452*b095b053SXin Li
/*
 * Runs CheckTiling3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(1); the task itself asserts on each tile's shape.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		CheckTiling3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3466*b095b053SXin Li
/*
 * Runs CheckTiling3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(0); the task itself asserts on each tile's shape.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool,
		CheckTiling3DTile2DWithUArch,
		nullptr /* context */,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
}
3484*b095b053SXin Li
SetTrue3DTile2DWithUArch(std::atomic_bool * processed_indicators,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3485*b095b053SXin Li static void SetTrue3DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3486*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
3487*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
3488*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3489*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
3490*b095b053SXin Li }
3491*b095b053SXin Li }
3492*b095b053SXin Li }
3493*b095b053SXin Li
/*
 * Runs SetTrue3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(1), then expects every per-element indicator to
 * be true.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(SetTrue3DTile2DWithUArch);
	void* const context = indicators.data();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Walk the indicators in row-major order and recover (i, j, k) for the failure message. */
	const size_t element_count = indicators.size();
	for (size_t linear_idx = 0; linear_idx < element_count; ++linear_idx) {
		const size_t k = linear_idx % kParallelize3DTile2DRangeK;
		const size_t j = (linear_idx / kParallelize3DTile2DRangeK) % kParallelize3DTile2DRangeJ;
		const size_t i = linear_idx / kParallelize3DTile2DRangeK / kParallelize3DTile2DRangeJ;
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ", " << k << ") not processed";
	}
}
3519*b095b053SXin Li
/*
 * Runs SetTrue3DTile2DWithUArch over the whole 3D range on a pool created
 * with pthreadpool_create(0), then expects every per-element indicator to
 * be true. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) {
	std::vector<std::atomic_bool> indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(SetTrue3DTile2DWithUArch);
	void* const context = indicators.data();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool, task, context,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Walk the indicators in row-major order and recover (i, j, k) for the failure message. */
	const size_t element_count = indicators.size();
	for (size_t linear_idx = 0; linear_idx < element_count; ++linear_idx) {
		const size_t k = linear_idx % kParallelize3DTile2DRangeK;
		const size_t j = (linear_idx / kParallelize3DTile2DRangeK) % kParallelize3DTile2DRangeJ;
		const size_t i = linear_idx / kParallelize3DTile2DRangeK / kParallelize3DTile2DRangeJ;
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ", " << k << ") not processed";
	}
}
3549*b095b053SXin Li
Increment3DTile2DWithUArch(std::atomic_int * processed_counters,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3550*b095b053SXin Li static void Increment3DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3551*b095b053SXin Li for (size_t j = start_j; j < start_j + tile_j; j++) {
3552*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
3553*b095b053SXin Li const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k;
3554*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
3555*b095b053SXin Li }
3556*b095b053SXin Li }
3557*b095b053SXin Li }
3558*b095b053SXin Li
/*
 * Runs Increment3DTile2DWithUArch once over the whole 3D range on a pool
 * created with pthreadpool_create(1), then expects every per-element
 * counter to equal 1.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch);
	void* const context = counters.data();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(), task, context,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Walk the counters in row-major order and recover (i, j, k) for the failure message. */
	const size_t element_count = counters.size();
	for (size_t linear_idx = 0; linear_idx < element_count; ++linear_idx) {
		const size_t k = linear_idx % kParallelize3DTile2DRangeK;
		const size_t j = (linear_idx / kParallelize3DTile2DRangeK) % kParallelize3DTile2DRangeJ;
		const size_t i = linear_idx / kParallelize3DTile2DRangeK / kParallelize3DTile2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
3585*b095b053SXin Li
/*
 * Runs Increment3DTile2DWithUArch once over the whole 3D range on a pool
 * created with pthreadpool_create(0), then expects every per-element
 * counter to equal 1. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
	std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch);
	void* const context = counters.data();
	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pool, task, context,
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);

	/* Walk the counters in row-major order and recover (i, j, k) for the failure message. */
	const size_t element_count = counters.size();
	for (size_t linear_idx = 0; linear_idx < element_count; ++linear_idx) {
		const size_t k = linear_idx % kParallelize3DTile2DRangeK;
		const size_t j = (linear_idx / kParallelize3DTile2DRangeK) % kParallelize3DTile2DRangeJ;
		const size_t i = linear_idx / kParallelize3DTile2DRangeK / kParallelize3DTile2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
3616*b095b053SXin Li
/*
 * Runs Increment3DTile2DWithUArch over the whole 3D range
 * kIncrementIterations times on a pool created with pthreadpool_create(1),
 * then expects every per-element counter to equal kIncrementIterations.
 */
TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch);
	void* const context = counters.data();
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_3d_tile_2d_with_uarch(
			threadpool.get(), task, context,
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
	}

	/* Walk the counters in row-major order and recover (i, j, k) for the failure message. */
	const size_t element_count = counters.size();
	for (size_t linear_idx = 0; linear_idx < element_count; ++linear_idx) {
		const size_t k = linear_idx % kParallelize3DTile2DRangeK;
		const size_t j = (linear_idx / kParallelize3DTile2DRangeK) % kParallelize3DTile2DRangeJ;
		const size_t i = linear_idx / kParallelize3DTile2DRangeK / kParallelize3DTile2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ", " << k << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
3646*b095b053SXin Li
/*
 * Runs Increment3DTile2DWithUArch over the whole 3D range
 * kIncrementIterations times on a pool created with pthreadpool_create(0),
 * then expects every per-element counter to equal kIncrementIterations.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
	std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(Increment3DTile2DWithUArch);
	void* const context = counters.data();
	for (size_t pass = 0; pass < kIncrementIterations; ++pass) {
		pthreadpool_parallelize_3d_tile_2d_with_uarch(
			pool, task, context,
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
			kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
			0 /* flags */);
	}

	/* Walk the counters in row-major order and recover (i, j, k) for the failure message. */
	const size_t element_count = counters.size();
	for (size_t linear_idx = 0; linear_idx < element_count; ++linear_idx) {
		const size_t k = linear_idx % kParallelize3DTile2DRangeK;
		const size_t j = (linear_idx / kParallelize3DTile2DRangeK) % kParallelize3DTile2DRangeJ;
		const size_t i = linear_idx / kParallelize3DTile2DRangeK / kParallelize3DTile2DRangeJ;
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ", " << k << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
3680*b095b053SXin Li
/*
 * Task that bumps one shared counter once for every (j, k) element of the
 * tile it is given. The uarch index and the i index are not used.
 */
static void IncrementSame3DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
	const size_t end_j = start_j + tile_j;
	const size_t end_k = start_k + tile_k;
	for (size_t j = start_j; j < end_j; ++j) {
		for (size_t k = start_k; k < end_k; ++k) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
		}
	}
}
3688*b095b053SXin Li
/*
 * Runs IncrementSame3DTile2DWithUArch, which funnels every element into one
 * shared counter, over the whole 3D range on a pool created with
 * pthreadpool_create(0). Expects the counter to end up equal to the number
 * of elements in the range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolHighContention) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(IncrementSame3DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);
}
3709*b095b053SXin Li
WorkImbalance3DTile2DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t i,size_t start_j,size_t start_k,size_t tile_j,size_t tile_k)3710*b095b053SXin Li static void WorkImbalance3DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) {
3711*b095b053SXin Li num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed);
3712*b095b053SXin Li if (i == 0 && start_j == 0 && start_k == 0) {
3713*b095b053SXin Li /* Spin-wait until all items are computed */
3714*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK) {
3715*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
3716*b095b053SXin Li }
3717*b095b053SXin Li }
3718*b095b053SXin Li }
3719*b095b053SXin Li
/*
 * Runs WorkImbalance3DTile2DWithUArch over the whole 3D range on a pool
 * created with pthreadpool_create(0). The task for the first tile blocks
 * until every element has been counted, so the call can only return once
 * the remaining tiles have been processed (presumably by the other threads
 * picking up the blocked thread's share). Expects the final count to equal
 * the number of elements. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_3d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_3d_tile_2d_with_id_t>(WorkImbalance3DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK,
		kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK);
}
3740*b095b053SXin Li
/* No-op task for the 4D tests; every argument is ignored. */
static void ComputeNothing4D(
	void* /* context */,
	size_t /* i */,
	size_t /* j */,
	size_t /* k */,
	size_t /* l */)
{
}
3743*b095b053SXin Li
/*
 * Dispatches the no-op task over the full 4D range on a pool created with
 * pthreadpool_create(1). There is no assertion after the call: the test
 * passes as long as the parallelize call returns.
 */
TEST(Parallelize4D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d(
		pool,
		ComputeNothing4D,
		nullptr /* context */,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3754*b095b053SXin Li
/*
 * Dispatches the no-op task over the full 4D range on a pool created with
 * pthreadpool_create(0). Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		pool,
		ComputeNothing4D,
		nullptr /* context */,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3770*b095b053SXin Li
CheckBounds4D(void *,size_t i,size_t j,size_t k,size_t l)3771*b095b053SXin Li static void CheckBounds4D(void*, size_t i, size_t j, size_t k, size_t l) {
3772*b095b053SXin Li EXPECT_LT(i, kParallelize4DRangeI);
3773*b095b053SXin Li EXPECT_LT(j, kParallelize4DRangeJ);
3774*b095b053SXin Li EXPECT_LT(k, kParallelize4DRangeK);
3775*b095b053SXin Li EXPECT_LT(l, kParallelize4DRangeL);
3776*b095b053SXin Li }
3777*b095b053SXin Li
/*
 * Runs CheckBounds4D over the full 4D range on a pool created with
 * pthreadpool_create(1); the bounds expectations live inside the task.
 */
TEST(Parallelize4D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d(
		pool,
		CheckBounds4D,
		nullptr /* context */,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3789*b095b053SXin Li
/*
 * Runs CheckBounds4D over the full 4D range on a pool created with
 * pthreadpool_create(0). Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		pool,
		CheckBounds4D,
		nullptr /* context */,
		kParallelize4DRangeI,
		kParallelize4DRangeJ,
		kParallelize4DRangeK,
		kParallelize4DRangeL,
		0 /* flags */);
}
3805*b095b053SXin Li
SetTrue4D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l)3806*b095b053SXin Li static void SetTrue4D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l) {
3807*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l;
3808*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
3809*b095b053SXin Li }
3810*b095b053SXin Li
/*
 * Runs SetTrue4D over the full 4D range on a pool created with
 * pthreadpool_create(1), then expects every per-element flag to be set.
 * The flags are never written before the run, so the check depends on the
 * vector constructing them as false.
 */
TEST(Parallelize4D, SingleThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_t>(SetTrue4D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_4d(
		threadpool.get(), task, context,
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DRangeL; l++, linear_idx++) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
3836*b095b053SXin Li
/*
 * Runs SetTrue4D over the full 4D range on a pool created with
 * pthreadpool_create(0), then expects every per-element flag to be set.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4D, MultiThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_t>(SetTrue4D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_4d(
		threadpool.get(), task, context,
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DRangeL; l++, linear_idx++) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
3866*b095b053SXin Li
Increment4D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l)3867*b095b053SXin Li static void Increment4D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l) {
3868*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l;
3869*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
3870*b095b053SXin Li }
3871*b095b053SXin Li
/*
 * Runs Increment4D once over the full 4D range on a pool created with
 * pthreadpool_create(1), then expects every per-element counter to equal 1.
 * The counters are never written before the run, so the check depends on the
 * vector constructing them as zero.
 */
TEST(Parallelize4D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_t>(Increment4D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_4d(
		threadpool.get(), task, context,
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DRangeL; l++, linear_idx++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
				}
			}
		}
	}
}
3898*b095b053SXin Li
/*
 * Runs Increment4D once over the full 4D range on a pool created with
 * pthreadpool_create(0), then expects every per-element counter to equal 1.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_t>(Increment4D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_4d(
		threadpool.get(), task, context,
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DRangeL; l++, linear_idx++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
				}
			}
		}
	}
}
3929*b095b053SXin Li
/*
 * Runs Increment4D over the full 4D range kIncrementIterations times on a pool
 * created with pthreadpool_create(1), then expects every per-element counter
 * to equal kIncrementIterations.
 */
TEST(Parallelize4D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_t>(Increment4D);
	void* const context = static_cast<void*>(counters.data());
	/* The same pool is reused for every pass. */
	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_4d(
			threadpool.get(), task, context,
			kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
			0 /* flags */);
	}

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DRangeL; l++, linear_idx++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
				}
			}
		}
	}
}
3959*b095b053SXin Li
/*
 * Runs Increment4D over the full 4D range kIncrementIterations times on a pool
 * created with pthreadpool_create(0), then expects every per-element counter
 * to equal kIncrementIterations. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_t>(Increment4D);
	void* const context = static_cast<void*>(counters.data());
	/* The same pool is reused for every pass. */
	for (size_t pass = 0; pass < kIncrementIterations; pass++) {
		pthreadpool_parallelize_4d(
			threadpool.get(), task, context,
			kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
			0 /* flags */);
	}

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DRangeL; l++, linear_idx++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
						<< "(expected: " << kIncrementIterations << ")";
				}
			}
		}
	}
}
3993*b095b053SXin Li
/*
 * 4D task that adds one to a single shared counter on every call.
 * The four indices are ignored, so they are left unnamed (as in
 * ComputeNothing4D) to avoid unused-parameter warnings.
 */
static void IncrementSame4D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */) {
	num_processed_items->fetch_add(1, std::memory_order_relaxed);
}
3997*b095b053SXin Li
/*
 * Runs IncrementSame4D over the full 4D range on a pool created with
 * pthreadpool_create(0), so every task invocation updates the same atomic
 * counter, then expects the counter to equal the number of elements in the range.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4D, MultiThreadPoolHighContention) {
	/* Direct initialization: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(IncrementSame4D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL);
}
4016*b095b053SXin Li
WorkImbalance4D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l)4017*b095b053SXin Li static void WorkImbalance4D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l) {
4018*b095b053SXin Li num_processed_items->fetch_add(1, std::memory_order_relaxed);
4019*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && l == 0) {
4020*b095b053SXin Li /* Spin-wait until all items are computed */
4021*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL) {
4022*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
4023*b095b053SXin Li }
4024*b095b053SXin Li }
4025*b095b053SXin Li }
4026*b095b053SXin Li
/*
 * Runs WorkImbalance4D over the full 4D range on a pool created with
 * pthreadpool_create(0). The task for element (0, 0, 0, 0) spins until all
 * elements are counted, so the run only finishes if the remaining elements are
 * processed while that task is stalled. Afterwards the counter must equal the
 * number of elements. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4D, MultiThreadPoolWorkStealing) {
	/* Direct initialization: the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_t>(WorkImbalance4D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL,
		0 /* flags */);
	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL);
}
4045*b095b053SXin Li
/* No-op 4D task with a 1D tile: the context pointer, indices and tile size are ignored. */
static void ComputeNothing4DTile1D(
	void* /* context */,
	size_t /* i */,
	size_t /* j */,
	size_t /* k */,
	size_t /* start_l */,
	size_t /* tile_l */)
{
}
4048*b095b053SXin Li
/*
 * Dispatches the no-op task over the full 4D range, tiled along L, on a pool
 * created with pthreadpool_create(1). There is no assertion after the call:
 * the test passes as long as the parallelize call returns.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d_tile_1d(
		pool,
		ComputeNothing4DTile1D,
		nullptr /* context */,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4060*b095b053SXin Li
/*
 * Dispatches the no-op task over the full 4D range, tiled along L, on a pool
 * created with pthreadpool_create(0). Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		pool,
		ComputeNothing4DTile1D,
		nullptr /* context */,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4077*b095b053SXin Li
CheckBounds4DTile1D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4078*b095b053SXin Li static void CheckBounds4DTile1D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4079*b095b053SXin Li EXPECT_LT(i, kParallelize4DTile1DRangeI);
4080*b095b053SXin Li EXPECT_LT(j, kParallelize4DTile1DRangeJ);
4081*b095b053SXin Li EXPECT_LT(k, kParallelize4DTile1DRangeK);
4082*b095b053SXin Li EXPECT_LT(start_l, kParallelize4DTile1DRangeL);
4083*b095b053SXin Li EXPECT_LE(start_l + tile_l, kParallelize4DTile1DRangeL);
4084*b095b053SXin Li }
4085*b095b053SXin Li
/*
 * Runs CheckBounds4DTile1D over the full tiled 4D range on a pool created with
 * pthreadpool_create(1); the bounds expectations live inside the task.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d_tile_1d(
		pool,
		CheckBounds4DTile1D,
		nullptr /* context */,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4098*b095b053SXin Li
/*
 * Runs CheckBounds4DTile1D over the full tiled 4D range on a pool created with
 * pthreadpool_create(0). Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		pool,
		CheckBounds4DTile1D,
		nullptr /* context */,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4115*b095b053SXin Li
CheckTiling4DTile1D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4116*b095b053SXin Li static void CheckTiling4DTile1D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4117*b095b053SXin Li EXPECT_GT(tile_l, 0);
4118*b095b053SXin Li EXPECT_LE(tile_l, kParallelize4DTile1DTileL);
4119*b095b053SXin Li EXPECT_EQ(start_l % kParallelize4DTile1DTileL, 0);
4120*b095b053SXin Li EXPECT_EQ(tile_l, std::min<size_t>(kParallelize4DTile1DTileL, kParallelize4DTile1DRangeL - start_l));
4121*b095b053SXin Li }
4122*b095b053SXin Li
/*
 * Runs CheckTiling4DTile1D over the full tiled 4D range on a pool created with
 * pthreadpool_create(1); the tile-shape expectations live inside the task.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d_tile_1d(
		pool,
		CheckTiling4DTile1D,
		nullptr /* context */,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4135*b095b053SXin Li
/*
 * Runs CheckTiling4DTile1D over the full tiled 4D range on a pool created with
 * pthreadpool_create(0). Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	if (pthreadpool_get_threads_count(pool) < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_1d(
		pool,
		CheckTiling4DTile1D,
		nullptr /* context */,
		kParallelize4DTile1DRangeI,
		kParallelize4DTile1DRangeJ,
		kParallelize4DTile1DRangeK,
		kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);
}
4152*b095b053SXin Li
SetTrue4DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4153*b095b053SXin Li static void SetTrue4DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4154*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
4155*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l;
4156*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
4157*b095b053SXin Li }
4158*b095b053SXin Li }
4159*b095b053SXin Li
/*
 * Runs SetTrue4DTile1D over the full tiled 4D range on a pool created with
 * pthreadpool_create(1), then expects every per-element flag to be set.
 * The flags are never written before the run, so the check depends on the
 * vector constructing them as false.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(SetTrue4DTile1D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(), task, context,
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++, linear_idx++) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
4186*b095b053SXin Li
/*
 * Runs SetTrue4DTile1D over the full tiled 4D range on a pool created with
 * pthreadpool_create(0), then expects every per-element flag to be set.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(SetTrue4DTile1D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(), task, context,
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++, linear_idx++) {
					EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
				}
			}
		}
	}
}
4217*b095b053SXin Li
Increment4DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4218*b095b053SXin Li static void Increment4DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4219*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
4220*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l;
4221*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
4222*b095b053SXin Li }
4223*b095b053SXin Li }
4224*b095b053SXin Li
/*
 * Runs Increment4DTile1D once over the full tiled 4D range on a pool created
 * with pthreadpool_create(1), then expects every per-element counter to equal 1.
 * The counters are never written before the run, so the check depends on the
 * vector constructing them as zero.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(), task, context,
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++, linear_idx++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
				}
			}
		}
	}
}
4252*b095b053SXin Li
/*
 * Runs Increment4DTile1D once over the full tiled 4D range on a pool created
 * with pthreadpool_create(0), then expects every per-element counter to equal 1.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) < 2) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_4d_tile_1d(
		threadpool.get(), task, context,
		kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
		kParallelize4DTile1DTileL,
		0 /* flags */);

	/* Row-major traversal visits linear indices 0, 1, 2, ... in order. */
	size_t linear_idx = 0;
	for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
			for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
				for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++, linear_idx++) {
					EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
						<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
						<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
				}
			}
		}
	}
}
4284*b095b053SXin Li
/*
 * One-thread pool: runs the Increment4DTile1D pass kIncrementIterations times
 * over the same counters and expects each counter to equal
 * kIncrementIterations afterwards.
 */
TEST(Parallelize4DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
  const size_t item_count =
    kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
    kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const pthreadpool_task_4d_tile_1d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D);
  for (size_t pass = 0; pass < kIncrementIterations; pass++) {
    pthreadpool_parallelize_4d_tile_1d(
      threadpool.get(), task, static_cast<void*>(counters.data()),
      kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ,
      kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
      kParallelize4DTile1DTileL,
      0 /* flags */);
  }

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++, linear_idx++) {
          EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
            << counters[linear_idx].load(std::memory_order_relaxed) << " times "
            << "(expected: " << kIncrementIterations << ")";
        }
      }
    }
  }
}
4315*b095b053SXin Li
/*
 * Pool created with a thread-count argument of 0: runs the Increment4DTile1D
 * pass kIncrementIterations times over the same counters and expects each
 * counter to equal kIncrementIterations afterwards. Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
  const size_t item_count =
    kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ *
    kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const bool single_threaded = pthreadpool_get_threads_count(threadpool.get()) <= 1;
  if (single_threaded) {
    GTEST_SKIP();
  }

  const pthreadpool_task_4d_tile_1d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(Increment4DTile1D);
  for (size_t pass = 0; pass < kIncrementIterations; pass++) {
    pthreadpool_parallelize_4d_tile_1d(
      threadpool.get(), task, static_cast<void*>(counters.data()),
      kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ,
      kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
      kParallelize4DTile1DTileL,
      0 /* flags */);
  }

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++, linear_idx++) {
          EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
            << counters[linear_idx].load(std::memory_order_relaxed) << " times "
            << "(expected: " << kIncrementIterations << ")";
        }
      }
    }
  }
}
4350*b095b053SXin Li
/*
 * Task that adds 1 to the single shared counter once for each l in
 * [start_l, start_l + tile_l), i.e. tile_l separate relaxed atomic
 * increments per invocation.
 *
 * The i, j and k indices do not influence the result, so they are left
 * unnamed, the same way other tasks in this file leave an unused argument
 * unnamed.
 */
static void IncrementSame4DTile1D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */, size_t /* k */, size_t start_l, size_t tile_l) {
  for (size_t l = start_l; l < start_l + tile_l; l++) {
    num_processed_items->fetch_add(1, std::memory_order_relaxed);
  }
}
4356*b095b053SXin Li
/*
 * Every IncrementSame4DTile1D invocation bumps the same atomic counter; after
 * the pass the counter must equal the total number of items in the 4D range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolHighContention) {
  /* Direct initialization instead of ATOMIC_VAR_INIT, which C++20 deprecates. */
  std::atomic_int num_processed_items(0);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_4d_tile_1d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(IncrementSame4DTile1D),
    static_cast<void*>(&num_processed_items),
    kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
    kParallelize4DTile1DTileL,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL);
}
4376*b095b053SXin Li
WorkImbalance4DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t start_l,size_t tile_l)4377*b095b053SXin Li static void WorkImbalance4DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) {
4378*b095b053SXin Li num_processed_items->fetch_add(tile_l, std::memory_order_relaxed);
4379*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && start_l == 0) {
4380*b095b053SXin Li /* Spin-wait until all items are computed */
4381*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL) {
4382*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
4383*b095b053SXin Li }
4384*b095b053SXin Li }
4385*b095b053SXin Li }
4386*b095b053SXin Li
/*
 * Runs WorkImbalance4DTile1D, whose first-tile invocation spins until every
 * item has been counted, and then expects the counter to equal the total
 * number of items in the 4D range. Skipped when the pool reports at most
 * one thread.
 */
TEST(Parallelize4DTile1D, MultiThreadPoolWorkStealing) {
  /* Direct initialization instead of ATOMIC_VAR_INIT, which C++20 deprecates. */
  std::atomic_int num_processed_items(0);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_4d_tile_1d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_4d_tile_1d_t>(WorkImbalance4DTile1D),
    static_cast<void*>(&num_processed_items),
    kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL,
    kParallelize4DTile1DTileL,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL);
}
4406*b095b053SXin Li
/* No-op task: accepts the 4D tile-2D task arguments and ignores all of them. */
static void ComputeNothing4DTile2D(void*, size_t, size_t, size_t, size_t, size_t, size_t) {
  /* Intentionally empty. */
}
4409*b095b053SXin Li
/*
 * One-thread pool: the test passes as long as
 * pthreadpool_parallelize_4d_tile_2d returns when given the no-op task.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const pthreadpool_task_4d_tile_2d_t task = ComputeNothing4DTile2D;
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, nullptr,
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);
}
4421*b095b053SXin Li
/*
 * Pool created with a thread-count argument of 0: the test passes as long as
 * pthreadpool_parallelize_4d_tile_2d returns when given the no-op task.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolCompletes) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const bool single_threaded = pthreadpool_get_threads_count(threadpool.get()) <= 1;
  if (single_threaded) {
    GTEST_SKIP();
  }

  const pthreadpool_task_4d_tile_2d_t task = ComputeNothing4DTile2D;
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, nullptr,
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);
}
4438*b095b053SXin Li
CheckBounds4DTile2D(void *,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4439*b095b053SXin Li static void CheckBounds4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4440*b095b053SXin Li EXPECT_LT(i, kParallelize4DTile2DRangeI);
4441*b095b053SXin Li EXPECT_LT(j, kParallelize4DTile2DRangeJ);
4442*b095b053SXin Li EXPECT_LT(start_k, kParallelize4DTile2DRangeK);
4443*b095b053SXin Li EXPECT_LT(start_l, kParallelize4DTile2DRangeL);
4444*b095b053SXin Li EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK);
4445*b095b053SXin Li EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL);
4446*b095b053SXin Li }
4447*b095b053SXin Li
/*
 * One-thread pool: runs CheckBounds4DTile2D over the whole 4D range so each
 * invocation validates its own arguments.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const pthreadpool_task_4d_tile_2d_t task = CheckBounds4DTile2D;
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, nullptr,
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);
}
4460*b095b053SXin Li
/*
 * Pool created with a thread-count argument of 0: runs CheckBounds4DTile2D
 * over the whole 4D range so each invocation validates its own arguments.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const bool single_threaded = pthreadpool_get_threads_count(threadpool.get()) <= 1;
  if (single_threaded) {
    GTEST_SKIP();
  }

  const pthreadpool_task_4d_tile_2d_t task = CheckBounds4DTile2D;
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, nullptr,
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);
}
4477*b095b053SXin Li
CheckTiling4DTile2D(void *,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4478*b095b053SXin Li static void CheckTiling4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4479*b095b053SXin Li EXPECT_GT(tile_k, 0);
4480*b095b053SXin Li EXPECT_LE(tile_k, kParallelize4DTile2DTileK);
4481*b095b053SXin Li EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0);
4482*b095b053SXin Li EXPECT_EQ(tile_k, std::min<size_t>(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k));
4483*b095b053SXin Li
4484*b095b053SXin Li EXPECT_GT(tile_l, 0);
4485*b095b053SXin Li EXPECT_LE(tile_l, kParallelize4DTile2DTileL);
4486*b095b053SXin Li EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0);
4487*b095b053SXin Li EXPECT_EQ(tile_l, std::min<size_t>(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l));
4488*b095b053SXin Li }
4489*b095b053SXin Li
/*
 * One-thread pool: runs CheckTiling4DTile2D over the whole 4D range so each
 * invocation validates the tile it was handed.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const pthreadpool_task_4d_tile_2d_t task = CheckTiling4DTile2D;
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, nullptr,
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);
}
4502*b095b053SXin Li
/*
 * Pool created with a thread-count argument of 0: runs CheckTiling4DTile2D
 * over the whole 4D range so each invocation validates the tile it was
 * handed. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolUniformTiling) {
  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const bool single_threaded = pthreadpool_get_threads_count(threadpool.get()) <= 1;
  if (single_threaded) {
    GTEST_SKIP();
  }

  const pthreadpool_task_4d_tile_2d_t task = CheckTiling4DTile2D;
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, nullptr,
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);
}
4519*b095b053SXin Li
SetTrue4DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4520*b095b053SXin Li static void SetTrue4DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4521*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
4522*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
4523*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
4524*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
4525*b095b053SXin Li }
4526*b095b053SXin Li }
4527*b095b053SXin Li }
4528*b095b053SXin Li
/*
 * One-thread pool: after one pthreadpool_parallelize_4d_tile_2d pass with
 * SetTrue4DTile2D, the indicator of every element must be true.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsProcessed) {
  const size_t item_count =
    kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
    kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
  std::vector<std::atomic_bool> indicators(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const pthreadpool_task_4d_tile_2d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(SetTrue4DTile2D);
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, static_cast<void*>(indicators.data()),
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++, linear_idx++) {
          EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
        }
      }
    }
  }
}
4555*b095b053SXin Li
/*
 * Pool created with a thread-count argument of 0: after one
 * pthreadpool_parallelize_4d_tile_2d pass with SetTrue4DTile2D, the indicator
 * of every element must be true. Skipped when the pool reports at most one
 * thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsProcessed) {
  const size_t item_count =
    kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
    kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
  std::vector<std::atomic_bool> indicators(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const bool single_threaded = pthreadpool_get_threads_count(threadpool.get()) <= 1;
  if (single_threaded) {
    GTEST_SKIP();
  }

  const pthreadpool_task_4d_tile_2d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(SetTrue4DTile2D);
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, static_cast<void*>(indicators.data()),
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++, linear_idx++) {
          EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
        }
      }
    }
  }
}
4586*b095b053SXin Li
Increment4DTile2D(std::atomic_int * processed_counters,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4587*b095b053SXin Li static void Increment4DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4588*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
4589*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
4590*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
4591*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
4592*b095b053SXin Li }
4593*b095b053SXin Li }
4594*b095b053SXin Li }
4595*b095b053SXin Li
/*
 * One-thread pool: after a single pthreadpool_parallelize_4d_tile_2d pass
 * with Increment4DTile2D, every per-element counter must read exactly 1.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedOnce) {
  const size_t item_count =
    kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
    kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const pthreadpool_task_4d_tile_2d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D);
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, static_cast<void*>(counters.data()),
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++, linear_idx++) {
          EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
            << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
        }
      }
    }
  }
}
4623*b095b053SXin Li
/*
 * Pool created with a thread-count argument of 0: after a single
 * pthreadpool_parallelize_4d_tile_2d pass with Increment4DTile2D, every
 * per-element counter must read exactly 1. Skipped when the pool reports
 * at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedOnce) {
  const size_t item_count =
    kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
    kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const bool single_threaded = pthreadpool_get_threads_count(threadpool.get()) <= 1;
  if (single_threaded) {
    GTEST_SKIP();
  }

  const pthreadpool_task_4d_tile_2d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D);
  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(), task, static_cast<void*>(counters.data()),
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
    kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++, linear_idx++) {
          EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
            << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
        }
      }
    }
  }
}
4655*b095b053SXin Li
/*
 * One-thread pool: runs the Increment4DTile2D pass kIncrementIterations times
 * over the same counters and expects each counter to equal
 * kIncrementIterations afterwards.
 */
TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
  const size_t item_count =
    kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
    kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const pthreadpool_task_4d_tile_2d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D);
  for (size_t pass = 0; pass < kIncrementIterations; pass++) {
    pthreadpool_parallelize_4d_tile_2d(
      threadpool.get(), task, static_cast<void*>(counters.data()),
      kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
      kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
      kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
      0 /* flags */);
  }

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++, linear_idx++) {
          EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
            << counters[linear_idx].load(std::memory_order_relaxed) << " times "
            << "(expected: " << kIncrementIterations << ")";
        }
      }
    }
  }
}
4686*b095b053SXin Li
/*
 * Pool created with a thread-count argument of 0: runs the Increment4DTile2D
 * pass kIncrementIterations times over the same counters and expects each
 * counter to equal kIncrementIterations afterwards. Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
  const size_t item_count =
    kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ *
    kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
  std::vector<std::atomic_int> counters(item_count);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  const bool single_threaded = pthreadpool_get_threads_count(threadpool.get()) <= 1;
  if (single_threaded) {
    GTEST_SKIP();
  }

  const pthreadpool_task_4d_tile_2d_t task =
    reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D);
  for (size_t pass = 0; pass < kIncrementIterations; pass++) {
    pthreadpool_parallelize_4d_tile_2d(
      threadpool.get(), task, static_cast<void*>(counters.data()),
      kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ,
      kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
      kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
      0 /* flags */);
  }

  /*
   * Element (i, j, k, l) lives at ((i * RangeJ + j) * RangeK + k) * RangeL + l,
   * which is exactly what a running index yields in this loop order.
   */
  size_t linear_idx = 0;
  for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) {
    for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) {
        for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++, linear_idx++) {
          EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
            << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
            << counters[linear_idx].load(std::memory_order_relaxed) << " times "
            << "(expected: " << kIncrementIterations << ")";
        }
      }
    }
  }
}
4721*b095b053SXin Li
/*
 * Task that adds 1 to the single shared counter once for each (k, l) pair in
 * the tile [start_k, start_k + tile_k) x [start_l, start_l + tile_l), i.e.
 * tile_k * tile_l separate relaxed atomic increments per invocation.
 *
 * The i and j indices do not influence the result, so they are left unnamed,
 * the same way other tasks in this file leave an unused argument unnamed.
 */
static void IncrementSame4DTile2D(std::atomic_int* num_processed_items, size_t /* i */, size_t /* j */, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
  for (size_t k = start_k; k < start_k + tile_k; k++) {
    for (size_t l = start_l; l < start_l + tile_l; l++) {
      num_processed_items->fetch_add(1, std::memory_order_relaxed);
    }
  }
}
4729*b095b053SXin Li
/*
 * Every IncrementSame4DTile2D invocation bumps the same atomic counter; after
 * the pass the counter must equal the total number of items in the 4D range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolHighContention) {
  /* Direct initialization instead of ATOMIC_VAR_INIT, which C++20 deprecates. */
  std::atomic_int num_processed_items(0);

  auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_4d_tile_2d(
    threadpool.get(),
    reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(IncrementSame4DTile2D),
    static_cast<void*>(&num_processed_items),
    kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
    kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
    0 /* flags */);
  EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL);
}
4749*b095b053SXin Li
WorkImbalance4DTile2D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4750*b095b053SXin Li static void WorkImbalance4DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4751*b095b053SXin Li num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed);
4752*b095b053SXin Li if (i == 0 && j == 0 && start_k == 0 && start_l == 0) {
4753*b095b053SXin Li /* Spin-wait until all items are computed */
4754*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) {
4755*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
4756*b095b053SXin Li }
4757*b095b053SXin Li }
4758*b095b053SXin Li }
4759*b095b053SXin Li
/*
 * Runs WorkImbalance4DTile2D over the whole 4D range (tiled along K and L):
 * the task for the first tile spins until all items are counted, so the call
 * only returns once every tile has been processed. Afterwards the counter must
 * equal the number of items in the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2D, MultiThreadPoolWorkStealing) {
	/* Brace-initialize the counter: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(WorkImbalance4DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Compare as size_t so EXPECT_EQ does not mix signed and unsigned operands */
	const size_t expected_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
4779*b095b053SXin Li
/*
 * No-op task with the signature expected by
 * pthreadpool_parallelize_4d_tile_2d_with_uarch; all arguments are ignored.
 */
static void ComputeNothing4DTile2DWithUArch(
	void*, uint32_t,
	size_t, size_t,
	size_t, size_t,
	size_t, size_t)
{
}
4782*b095b053SXin Li
/*
 * Smoke test: dispatching the no-op task over the full 4D range (tiled along
 * K and L) through the uarch-aware API on a pool created with
 * pthreadpool_create(1) must return.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		ComputeNothing4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4795*b095b053SXin Li
/*
 * Smoke test: dispatching the no-op task over the full 4D range (tiled along
 * K and L) through the uarch-aware API on a pool created with
 * pthreadpool_create(0) must return.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	const auto threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		ComputeNothing4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4813*b095b053SXin Li
CheckUArch4DTile2DWithUArch(void *,uint32_t uarch_index,size_t,size_t,size_t,size_t,size_t,size_t)4814*b095b053SXin Li static void CheckUArch4DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t, size_t, size_t) {
4815*b095b053SXin Li if (uarch_index != kDefaultUArchIndex) {
4816*b095b053SXin Li EXPECT_LE(uarch_index, kMaxUArchIndex);
4817*b095b053SXin Li }
4818*b095b053SXin Li }
4819*b095b053SXin Li
/*
 * Runs CheckUArch4DTile2DWithUArch over the full 4D range on a pool created
 * with pthreadpool_create(1); the task itself asserts that every uarch index
 * it sees is either the default or within the maximum.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		CheckUArch4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4833*b095b053SXin Li
/*
 * Runs CheckUArch4DTile2DWithUArch over the full 4D range on a pool created
 * with pthreadpool_create(0); the task itself asserts that every uarch index
 * it sees is either the default or within the maximum.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolUArchInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	const auto threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		CheckUArch4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4851*b095b053SXin Li
CheckBounds4DTile2DWithUArch(void *,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4852*b095b053SXin Li static void CheckBounds4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4853*b095b053SXin Li EXPECT_LT(i, kParallelize4DTile2DRangeI);
4854*b095b053SXin Li EXPECT_LT(j, kParallelize4DTile2DRangeJ);
4855*b095b053SXin Li EXPECT_LT(start_k, kParallelize4DTile2DRangeK);
4856*b095b053SXin Li EXPECT_LT(start_l, kParallelize4DTile2DRangeL);
4857*b095b053SXin Li EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK);
4858*b095b053SXin Li EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL);
4859*b095b053SXin Li }
4860*b095b053SXin Li
/*
 * Runs CheckBounds4DTile2DWithUArch over the full 4D range on a pool created
 * with pthreadpool_create(1); the task itself asserts that every index and
 * tile it receives lies inside the range.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		CheckBounds4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4874*b095b053SXin Li
/*
 * Runs CheckBounds4DTile2DWithUArch over the full 4D range on a pool created
 * with pthreadpool_create(0); the task itself asserts that every index and
 * tile it receives lies inside the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	const auto threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		CheckBounds4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4892*b095b053SXin Li
CheckTiling4DTile2DWithUArch(void *,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4893*b095b053SXin Li static void CheckTiling4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4894*b095b053SXin Li EXPECT_GT(tile_k, 0);
4895*b095b053SXin Li EXPECT_LE(tile_k, kParallelize4DTile2DTileK);
4896*b095b053SXin Li EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0);
4897*b095b053SXin Li EXPECT_EQ(tile_k, std::min<size_t>(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k));
4898*b095b053SXin Li
4899*b095b053SXin Li EXPECT_GT(tile_l, 0);
4900*b095b053SXin Li EXPECT_LE(tile_l, kParallelize4DTile2DTileL);
4901*b095b053SXin Li EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0);
4902*b095b053SXin Li EXPECT_EQ(tile_l, std::min<size_t>(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l));
4903*b095b053SXin Li }
4904*b095b053SXin Li
/*
 * Runs CheckTiling4DTile2DWithUArch over the full 4D range on a pool created
 * with pthreadpool_create(1); the task itself asserts that every tile it
 * receives is aligned to, and clamped by, the requested tile sizes.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		CheckTiling4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4918*b095b053SXin Li
/*
 * Runs CheckTiling4DTile2DWithUArch over the full 4D range on a pool created
 * with pthreadpool_create(0); the task itself asserts that every tile it
 * receives is aligned to, and clamped by, the requested tile sizes.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	auto* const pool = threadpool.get();
	const auto threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pool,
		CheckTiling4DTile2DWithUArch,
		nullptr,
		kDefaultUArchIndex,
		kMaxUArchIndex,
		kParallelize4DTile2DRangeI,
		kParallelize4DTile2DRangeJ,
		kParallelize4DTile2DRangeK,
		kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK,
		kParallelize4DTile2DTileL,
		0 /* flags */);
}
4936*b095b053SXin Li
SetTrue4DTile2DWithUArch(std::atomic_bool * processed_indicators,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)4937*b095b053SXin Li static void SetTrue4DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
4938*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
4939*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
4940*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
4941*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
4942*b095b053SXin Li }
4943*b095b053SXin Li }
4944*b095b053SXin Li }
4945*b095b053SXin Li
/*
 * Runs SetTrue4DTile2DWithUArch over the full 4D range on a pool created with
 * pthreadpool_create(1) and then checks that the indicator of every item in
 * the range was set.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) {
	const size_t num_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(SetTrue4DTile2DWithUArch);
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		task,
		static_cast<void*>(indicators.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Walk the indicators in row-major order; coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < num_items; linear_idx++) {
		const size_t l = linear_idx % kParallelize4DTile2DRangeL;
		const size_t k = (linear_idx / kParallelize4DTile2DRangeL) % kParallelize4DTile2DRangeK;
		const size_t j = (linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK)) % kParallelize4DTile2DRangeJ;
		const size_t i = linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeJ);
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
	}
}
4973*b095b053SXin Li
/*
 * Runs SetTrue4DTile2DWithUArch over the full 4D range on a pool created with
 * pthreadpool_create(0) and then checks that the indicator of every item in
 * the range was set.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) {
	const size_t num_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(SetTrue4DTile2DWithUArch);
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		task,
		static_cast<void*>(indicators.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Walk the indicators in row-major order; coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < num_items; linear_idx++) {
		const size_t l = linear_idx % kParallelize4DTile2DRangeL;
		const size_t k = (linear_idx / kParallelize4DTile2DRangeL) % kParallelize4DTile2DRangeK;
		const size_t j = (linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK)) % kParallelize4DTile2DRangeJ;
		const size_t i = linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeJ);
		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed";
	}
}
5005*b095b053SXin Li
Increment4DTile2DWithUArch(std::atomic_int * processed_counters,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)5006*b095b053SXin Li static void Increment4DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
5007*b095b053SXin Li for (size_t k = start_k; k < start_k + tile_k; k++) {
5008*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
5009*b095b053SXin Li const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l;
5010*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
5011*b095b053SXin Li }
5012*b095b053SXin Li }
5013*b095b053SXin Li }
5014*b095b053SXin Li
/*
 * Runs Increment4DTile2DWithUArch once over the full 4D range on a pool
 * created with pthreadpool_create(1) and then checks that the counter of
 * every item equals exactly 1.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) {
	const size_t num_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch);
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		task,
		static_cast<void*>(counters.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Walk the counters in row-major order; coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < num_items; linear_idx++) {
		const size_t l = linear_idx % kParallelize4DTile2DRangeL;
		const size_t k = (linear_idx / kParallelize4DTile2DRangeL) % kParallelize4DTile2DRangeK;
		const size_t j = (linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK)) % kParallelize4DTile2DRangeJ;
		const size_t i = linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeJ);
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
5043*b095b053SXin Li
/*
 * Runs Increment4DTile2DWithUArch once over the full 4D range on a pool
 * created with pthreadpool_create(0) and then checks that the counter of
 * every item equals exactly 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) {
	const size_t num_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch);
	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		task,
		static_cast<void*>(counters.data()),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Walk the counters in row-major order; coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < num_items; linear_idx++) {
		const size_t l = linear_idx % kParallelize4DTile2DRangeL;
		const size_t k = (linear_idx / kParallelize4DTile2DRangeL) % kParallelize4DTile2DRangeK;
		const size_t j = (linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK)) % kParallelize4DTile2DRangeJ;
		const size_t i = linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeJ);
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
5076*b095b053SXin Li
/*
 * Runs Increment4DTile2DWithUArch over the full 4D range kIncrementIterations
 * times on a pool created with pthreadpool_create(1), reusing the same pool
 * and counters, and then checks that the counter of every item equals
 * kIncrementIterations.
 */
TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch);
	size_t iteration = 0;
	while (iteration < kIncrementIterations) {
		pthreadpool_parallelize_4d_tile_2d_with_uarch(
			threadpool.get(),
			task,
			static_cast<void*>(counters.data()),
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
			kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
			0 /* flags */);
		iteration++;
	}

	/* Walk the counters in row-major order; coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < num_items; linear_idx++) {
		const size_t l = linear_idx % kParallelize4DTile2DRangeL;
		const size_t k = (linear_idx / kParallelize4DTile2DRangeL) % kParallelize4DTile2DRangeK;
		const size_t j = (linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK)) % kParallelize4DTile2DRangeJ;
		const size_t i = linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeJ);
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
5108*b095b053SXin Li
/*
 * Runs Increment4DTile2DWithUArch over the full 4D range kIncrementIterations
 * times on a pool created with pthreadpool_create(0), reusing the same pool
 * and counters, and then checks that the counter of every item equals
 * kIncrementIterations.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const auto task = reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(Increment4DTile2DWithUArch);
	size_t iteration = 0;
	while (iteration < kIncrementIterations) {
		pthreadpool_parallelize_4d_tile_2d_with_uarch(
			threadpool.get(),
			task,
			static_cast<void*>(counters.data()),
			kDefaultUArchIndex, kMaxUArchIndex,
			kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
			kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
			0 /* flags */);
		iteration++;
	}

	/* Walk the counters in row-major order; coordinates are recovered only for the failure message */
	for (size_t linear_idx = 0; linear_idx < num_items; linear_idx++) {
		const size_t l = linear_idx % kParallelize4DTile2DRangeL;
		const size_t k = (linear_idx / kParallelize4DTile2DRangeL) % kParallelize4DTile2DRangeK;
		const size_t j = (linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK)) % kParallelize4DTile2DRangeJ;
		const size_t i = linear_idx / (kParallelize4DTile2DRangeL * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeJ);
		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations << ")";
	}
}
5144*b095b053SXin Li
/*
 * Task for pthreadpool_parallelize_4d_tile_2d_with_uarch: performs one relaxed
 * increment of the single shared counter for every (k, l) item of the tile it
 * is given. The uarch index and the i/j indices do not influence which counter
 * is updated.
 */
static void IncrementSame4DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
	const size_t end_k = start_k + tile_k;
	const size_t end_l = start_l + tile_l;

	size_t k = start_k;
	while (k < end_k) {
		size_t l = start_l;
		while (l < end_l) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
			l++;
		}
		k++;
	}
}
5152*b095b053SXin Li
/*
 * Runs IncrementSame4DTile2DWithUArch over the whole 4D range (tiled along K
 * and L) so that every item increments the same atomic counter, then checks
 * that the final count equals the number of items in the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolHighContention) {
	/* Brace-initialize the counter: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(IncrementSame4DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Compare as size_t so EXPECT_EQ does not mix signed and unsigned operands */
	const size_t expected_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
5173*b095b053SXin Li
WorkImbalance4DTile2DWithUArch(std::atomic_int * num_processed_items,uint32_t,size_t i,size_t j,size_t start_k,size_t start_l,size_t tile_k,size_t tile_l)5174*b095b053SXin Li static void WorkImbalance4DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) {
5175*b095b053SXin Li num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed);
5176*b095b053SXin Li if (i == 0 && j == 0 && start_k == 0 && start_l == 0) {
5177*b095b053SXin Li /* Spin-wait until all items are computed */
5178*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) {
5179*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
5180*b095b053SXin Li }
5181*b095b053SXin Li }
5182*b095b053SXin Li }
5183*b095b053SXin Li
/*
 * Runs WorkImbalance4DTile2DWithUArch over the whole 4D range (tiled along K
 * and L): the task for the first tile spins until all items are counted, so
 * the call only returns once every tile has been processed. Afterwards the
 * counter must equal the number of items in the range.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolWorkStealing) {
	/* Brace-initialize the counter: ATOMIC_VAR_INIT is deprecated since C++20 */
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_4d_tile_2d_with_uarch(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_4d_tile_2d_with_id_t>(WorkImbalance4DTile2DWithUArch),
		static_cast<void*>(&num_processed_items),
		kDefaultUArchIndex, kMaxUArchIndex,
		kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL,
		kParallelize4DTile2DTileK, kParallelize4DTile2DTileL,
		0 /* flags */);

	/* Compare as size_t so EXPECT_EQ does not mix signed and unsigned operands */
	const size_t expected_items =
		kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
5204*b095b053SXin Li
/*
 * No-op task with the signature expected by pthreadpool_parallelize_5d;
 * all arguments are ignored.
 */
static void ComputeNothing5D(
	void*,
	size_t, size_t, size_t, size_t, size_t)
{
}
5207*b095b053SXin Li
/*
 * Smoke test: pthreadpool_parallelize_5d with the no-op task returns when run
 * on a pool created with pthreadpool_create(1).
 */
TEST(Parallelize5D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d(
		threadpool.get(),
		ComputeNothing5D,
		nullptr /* the no-op task needs no argument */,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5218*b095b053SXin Li
/*
 * Smoke test: pthreadpool_parallelize_5d with the no-op task returns when run
 * on a pool created with pthreadpool_create(0). Skipped when the pool reports
 * at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		threadpool.get(),
		ComputeNothing5D,
		nullptr /* the no-op task needs no argument */,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5234*b095b053SXin Li
CheckBounds5D(void *,size_t i,size_t j,size_t k,size_t l,size_t m)5235*b095b053SXin Li static void CheckBounds5D(void*, size_t i, size_t j, size_t k, size_t l, size_t m) {
5236*b095b053SXin Li EXPECT_LT(i, kParallelize5DRangeI);
5237*b095b053SXin Li EXPECT_LT(j, kParallelize5DRangeJ);
5238*b095b053SXin Li EXPECT_LT(k, kParallelize5DRangeK);
5239*b095b053SXin Li EXPECT_LT(l, kParallelize5DRangeL);
5240*b095b053SXin Li EXPECT_LT(m, kParallelize5DRangeM);
5241*b095b053SXin Li }
5242*b095b053SXin Li
/*
 * Runs CheckBounds5D over the whole 5D range on a pool created with
 * pthreadpool_create(1), so every index passed to the task is bounds-checked.
 */
TEST(Parallelize5D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d(
		threadpool.get(),
		CheckBounds5D,
		nullptr /* the bounds checker needs no argument */,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5254*b095b053SXin Li
/*
 * Runs CheckBounds5D over the whole 5D range on a pool created with
 * pthreadpool_create(0), so every index passed to the task is bounds-checked.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		threadpool.get(),
		CheckBounds5D,
		nullptr /* the bounds checker needs no argument */,
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);
}
5270*b095b053SXin Li
SetTrue5D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t m)5271*b095b053SXin Li static void SetTrue5D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m) {
5272*b095b053SXin Li const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m;
5273*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
5274*b095b053SXin Li }
5275*b095b053SXin Li
/*
 * Runs SetTrue5D over the whole 5D range on a pool created with
 * pthreadpool_create(1), then checks that the flag of every index tuple was
 * set.
 */
TEST(Parallelize5D, SingleThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(SetTrue5D),
		static_cast<void*>(indicators.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			const size_t ij = i * kParallelize5DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DRangeK + k;
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DRangeL + l;
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DRangeM + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
5303*b095b053SXin Li
/*
 * Runs SetTrue5D over the whole 5D range on a pool created with
 * pthreadpool_create(0), then checks that the flag of every index tuple was
 * set. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(SetTrue5D),
		static_cast<void*>(indicators.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			const size_t ij = i * kParallelize5DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DRangeK + k;
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DRangeL + l;
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DRangeM + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
5335*b095b053SXin Li
Increment5D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t m)5336*b095b053SXin Li static void Increment5D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m) {
5337*b095b053SXin Li const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m;
5338*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
5339*b095b053SXin Li }
5340*b095b053SXin Li
/*
 * Runs Increment5D once over the whole 5D range on a pool created with
 * pthreadpool_create(1), then checks that every per-item counter is exactly 1.
 */
TEST(Parallelize5D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
		static_cast<void*>(counters.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			const size_t ij = i * kParallelize5DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DRangeK + k;
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DRangeL + l;
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
5369*b095b053SXin Li
/*
 * Runs Increment5D once over the whole 5D range on a pool created with
 * pthreadpool_create(0), then checks that every per-item counter is exactly 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t num_items = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
		static_cast<void*>(counters.data()),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			const size_t ij = i * kParallelize5DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DRangeK + k;
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DRangeL + l;
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
5402*b095b053SXin Li
/*
 * Runs Increment5D over the whole 5D range kIncrementIterations5D times on a
 * pool created with pthreadpool_create(1), then checks that every per-item
 * counter equals kIncrementIterations5D.
 */
TEST(Parallelize5D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	/* Each pass adds one to every counter. */
	for (size_t pass = 0; pass < kIncrementIterations5D; pass++) {
		pthreadpool_parallelize_5d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
			static_cast<void*>(counters.data()),
			kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
			0 /* flags */);
	}

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			const size_t ij = i * kParallelize5DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DRangeK + k;
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DRangeL + l;
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
					}
				}
			}
		}
	}
}
5434*b095b053SXin Li
/*
 * Runs Increment5D over the whole 5D range kIncrementIterations5D times on a
 * pool created with pthreadpool_create(0), then checks that every per-item
 * counter equals kIncrementIterations5D. Skipped when the pool reports at most
 * one thread.
 */
TEST(Parallelize5D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t num_items = kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM;
	std::vector<std::atomic_int> counters(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* Each pass adds one to every counter. */
	for (size_t pass = 0; pass < kIncrementIterations5D; pass++) {
		pthreadpool_parallelize_5d(
			threadpool.get(),
			reinterpret_cast<pthreadpool_task_5d_t>(Increment5D),
			static_cast<void*>(counters.data()),
			kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
			0 /* flags */);
	}

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DRangeJ; j++) {
			const size_t ij = i * kParallelize5DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DRangeK + k;
				for (size_t l = 0; l < kParallelize5DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DRangeL + l;
					for (size_t m = 0; m < kParallelize5DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
					}
				}
			}
		}
	}
}
5470*b095b053SXin Li
/*
 * 5D task that atomically adds 1 to the single counter it is handed. The five
 * indices are ignored, so every item of the range lands on the same counter.
 */
static void IncrementSame5D(
	std::atomic_int* num_processed_items,
	size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */, size_t /* m */)
{
	std::atomic_int& shared_counter = *num_processed_items;
	shared_counter.fetch_add(1, std::memory_order_relaxed);
}
5474*b095b053SXin Li
/*
 * Runs IncrementSame5D over the whole 5D range so that every item increments
 * one shared counter, then expects the counter to equal the number of items in
 * the range. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolHighContention) {
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const auto task = reinterpret_cast<pthreadpool_task_5d_t>(IncrementSame5D);
	pthreadpool_parallelize_5d(
		threadpool.get(),
		task,
		static_cast<void*>(&num_processed_items),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM);
}
5493*b095b053SXin Li
WorkImbalance5D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t m)5494*b095b053SXin Li static void WorkImbalance5D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m) {
5495*b095b053SXin Li num_processed_items->fetch_add(1, std::memory_order_relaxed);
5496*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0) {
5497*b095b053SXin Li /* Spin-wait until all items are computed */
5498*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM) {
5499*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
5500*b095b053SXin Li }
5501*b095b053SXin Li }
5502*b095b053SXin Li }
5503*b095b053SXin Li
/*
 * Runs WorkImbalance5D over the whole 5D range. The all-zero item blocks until
 * every item has been counted, so the call only returns once the rest of the
 * range has been processed while that item is still spinning. Afterwards the
 * counter must equal the number of items in the range. Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize5D, MultiThreadPoolWorkStealing) {
	std::atomic_int num_processed_items{0};

	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const auto task = reinterpret_cast<pthreadpool_task_5d_t>(WorkImbalance5D);
	pthreadpool_parallelize_5d(
		threadpool.get(),
		task,
		static_cast<void*>(&num_processed_items),
		kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM);
}
5522*b095b053SXin Li
/* No-op 5D/tile-1D task: ignores its argument pointer, the four indices, the tile start and the tile size. */
static void ComputeNothing5DTile1D(
	void* /* unused */,
	size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */,
	size_t /* start_m */, size_t /* tile_m */)
{
}
5525*b095b053SXin Li
/*
 * Smoke test: pthreadpool_parallelize_5d_tile_1d with the no-op task returns
 * when run on a pool created with pthreadpool_create(1).
 */
TEST(Parallelize5DTile1D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		ComputeNothing5DTile1D,
		nullptr /* the no-op task needs no argument */,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5537*b095b053SXin Li
/*
 * Smoke test: pthreadpool_parallelize_5d_tile_1d with the no-op task returns
 * when run on a pool created with pthreadpool_create(0). Skipped when the pool
 * reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		ComputeNothing5DTile1D,
		nullptr /* the no-op task needs no argument */,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5554*b095b053SXin Li
CheckBounds5DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5555*b095b053SXin Li static void CheckBounds5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5556*b095b053SXin Li EXPECT_LT(i, kParallelize5DTile1DRangeI);
5557*b095b053SXin Li EXPECT_LT(j, kParallelize5DTile1DRangeJ);
5558*b095b053SXin Li EXPECT_LT(k, kParallelize5DTile1DRangeK);
5559*b095b053SXin Li EXPECT_LT(l, kParallelize5DTile1DRangeL);
5560*b095b053SXin Li EXPECT_LT(start_m, kParallelize5DTile1DRangeM);
5561*b095b053SXin Li EXPECT_LE(start_m + tile_m, kParallelize5DTile1DRangeM);
5562*b095b053SXin Li }
5563*b095b053SXin Li
/*
 * Runs CheckBounds5DTile1D over the whole tiled 5D range on a pool created
 * with pthreadpool_create(1), so every index and tile passed to the task is
 * bounds-checked.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		CheckBounds5DTile1D,
		nullptr /* the bounds checker needs no argument */,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5576*b095b053SXin Li
/*
 * Runs CheckBounds5DTile1D over the whole tiled 5D range on a pool created
 * with pthreadpool_create(0), so every index and tile passed to the task is
 * bounds-checked. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		CheckBounds5DTile1D,
		nullptr /* the bounds checker needs no argument */,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5593*b095b053SXin Li
CheckTiling5DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5594*b095b053SXin Li static void CheckTiling5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5595*b095b053SXin Li EXPECT_GT(tile_m, 0);
5596*b095b053SXin Li EXPECT_LE(tile_m, kParallelize5DTile1DTileM);
5597*b095b053SXin Li EXPECT_EQ(start_m % kParallelize5DTile1DTileM, 0);
5598*b095b053SXin Li EXPECT_EQ(tile_m, std::min<size_t>(kParallelize5DTile1DTileM, kParallelize5DTile1DRangeM - start_m));
5599*b095b053SXin Li }
5600*b095b053SXin Li
/*
 * Runs CheckTiling5DTile1D over the whole tiled 5D range on a pool created
 * with pthreadpool_create(1), so the shape of every tile passed to the task is
 * verified.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		CheckTiling5DTile1D,
		nullptr /* the tiling checker needs no argument */,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5613*b095b053SXin Li
/*
 * Runs CheckTiling5DTile1D over the whole tiled 5D range on a pool created
 * with pthreadpool_create(0), so the shape of every tile passed to the task is
 * verified. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		CheckTiling5DTile1D,
		nullptr /* the tiling checker needs no argument */,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);
}
5630*b095b053SXin Li
SetTrue5DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5631*b095b053SXin Li static void SetTrue5DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5632*b095b053SXin Li for (size_t m = start_m; m < start_m + tile_m; m++) {
5633*b095b053SXin Li const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m;
5634*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
5635*b095b053SXin Li }
5636*b095b053SXin Li }
5637*b095b053SXin Li
/*
 * Runs SetTrue5DTile1D over the whole tiled 5D range on a pool created with
 * pthreadpool_create(1), then checks that the flag of every index tuple was
 * set.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(1), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(SetTrue5DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) {
			const size_t ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DTile1DRangeM + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
5666*b095b053SXin Li
/*
 * Runs SetTrue5DTile1D over the whole tiled 5D range on a pool created with
 * pthreadpool_create(0), then checks that the flag of every index tuple was
 * set. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolAllItemsProcessed) {
	const size_t num_items = kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_bool> indicators(num_items);

	auto_pthreadpool_t threadpool{pthreadpool_create(0), pthreadpool_destroy};
	ASSERT_TRUE(threadpool.get());

	const size_t threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(),
		reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(SetTrue5DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Walk the range in row-major order, building the flattened index level by level. */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) {
			const size_t ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) {
				const size_t ijk = ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) {
					const size_t ijkl = ijk * kParallelize5DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) {
						const size_t linear_idx = ijkl * kParallelize5DTile1DRangeM + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
5699*b095b053SXin Li
Increment5DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5700*b095b053SXin Li static void Increment5DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5701*b095b053SXin Li for (size_t m = start_m; m < start_m + tile_m; m++) {
5702*b095b053SXin Li const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m;
5703*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
5704*b095b053SXin Li }
5705*b095b053SXin Li }
5706*b095b053SXin Li
/*
 * Runs Increment5DTile1D once over the full 5D range on a pool created with
 * pthreadpool_create(1) and expects every per-element counter to equal 1.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t item_count = kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_1d_t task = reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(), task, context,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile1DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
5736*b095b053SXin Li
/*
 * Runs Increment5DTile1D once over the full 5D range on a pool created with
 * pthreadpool_create(0) and expects every per-element counter to equal 1.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t item_count = kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_1d_t task = reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(), task, context,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile1DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
5770*b095b053SXin Li
/*
 * Runs Increment5DTile1D over the full 5D range kIncrementIterations5D times
 * on a pool created with pthreadpool_create(1) and expects every per-element
 * counter to equal kIncrementIterations5D afterwards.
 */
TEST(Parallelize5DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t item_count = kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Task and context are the same for every pass, so they are set up once. */
	const pthreadpool_task_5d_tile_1d_t task = reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t iteration = 0; iteration < kIncrementIterations5D; ++iteration) {
		pthreadpool_parallelize_5d_tile_1d(
			threadpool.get(), task, context,
			kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
			kParallelize5DTile1DTileM,
			0 /* flags */);
	}

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile1DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
					}
				}
			}
		}
	}
}
5803*b095b053SXin Li
/*
 * Runs Increment5DTile1D over the full 5D range kIncrementIterations5D times
 * on a pool created with pthreadpool_create(0) and expects every per-element
 * counter to equal kIncrementIterations5D afterwards.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t item_count = kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* Task and context are the same for every pass, so they are set up once. */
	const pthreadpool_task_5d_tile_1d_t task = reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(Increment5DTile1D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t iteration = 0; iteration < kIncrementIterations5D; ++iteration) {
		pthreadpool_parallelize_5d_tile_1d(
			threadpool.get(), task, context,
			kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
			kParallelize5DTile1DTileM,
			0 /* flags */);
	}

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile1DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
					}
				}
			}
		}
	}
}
5840*b095b053SXin Li
/*
 * Task for pthreadpool_parallelize_5d_tile_1d: bumps one shared counter once
 * per element of the tile [start_m, start_m + tile_m). The indices i, j, k
 * and l are accepted to match the task signature but are not used.
 */
static void IncrementSame5DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
	/* One atomic increment per element, so all invocations hit the same counter repeatedly. */
	const size_t stop_m = start_m + tile_m;
	for (size_t m = start_m; m < stop_m; ++m) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
	}
}
5846*b095b053SXin Li
/*
 * Runs IncrementSame5DTile1D, which increments a single shared counter once
 * per element, over the full 5D range on a pool created with
 * pthreadpool_create(0), and expects the counter to equal the number of
 * elements. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolHighContention) {
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_1d_t task = reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(IncrementSame5DTile1D);
	void* const context = static_cast<void*>(&num_processed_items);
	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(), task, context,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM);
}
5866*b095b053SXin Li
WorkImbalance5DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t tile_m)5867*b095b053SXin Li static void WorkImbalance5DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) {
5868*b095b053SXin Li num_processed_items->fetch_add(tile_m, std::memory_order_relaxed);
5869*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0) {
5870*b095b053SXin Li /* Spin-wait until all items are computed */
5871*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM) {
5872*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
5873*b095b053SXin Li }
5874*b095b053SXin Li }
5875*b095b053SXin Li }
5876*b095b053SXin Li
/*
 * Runs WorkImbalance5DTile1D over the full 5D range on a pool created with
 * pthreadpool_create(0). The invocation for the first tile spins until the
 * shared counter reaches the total element count, so the parallelize call
 * returns only if all remaining tiles get executed while it is still spinning.
 * Afterwards the counter is expected to equal the number of elements.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile1D, MultiThreadPoolWorkStealing) {
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_1d_t task = reinterpret_cast<pthreadpool_task_5d_tile_1d_t>(WorkImbalance5DTile1D);
	void* const context = static_cast<void*>(&num_processed_items);
	pthreadpool_parallelize_5d_tile_1d(
		threadpool.get(), task, context,
		kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM,
		kParallelize5DTile1DTileM,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM);
}
5896*b095b053SXin Li
/*
 * No-op task with the 5D/tile-2D task signature: ignores the context pointer
 * and all seven size_t arguments and returns immediately.
 */
static void ComputeNothing5DTile2D(
	void* /* context */,
	size_t /* i */, size_t /* j */, size_t /* k */,
	size_t /* start_l */, size_t /* start_m */,
	size_t /* tile_l */, size_t /* tile_m */)
{
}
5899*b095b053SXin Li
/*
 * Smoke test: dispatches the no-op task over the full 5D range with 2D tiling
 * on a pool created with pthreadpool_create(1). The only expectation is that
 * the pool is created and the call returns.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_5d_tile_2d_t task = ComputeNothing5DTile2D;
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, nullptr /* context */,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);
}
5911*b095b053SXin Li
/*
 * Smoke test: dispatches the no-op task over the full 5D range with 2D tiling
 * on a pool created with pthreadpool_create(0). The only expectation is that
 * the pool is created and the call returns.
 * Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_5d_tile_2d_t task = ComputeNothing5DTile2D;
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, nullptr /* context */,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);
}
5928*b095b053SXin Li
CheckBounds5DTile2D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)5929*b095b053SXin Li static void CheckBounds5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
5930*b095b053SXin Li EXPECT_LT(i, kParallelize5DTile2DRangeI);
5931*b095b053SXin Li EXPECT_LT(j, kParallelize5DTile2DRangeJ);
5932*b095b053SXin Li EXPECT_LT(k, kParallelize5DTile2DRangeK);
5933*b095b053SXin Li EXPECT_LT(start_l, kParallelize5DTile2DRangeL);
5934*b095b053SXin Li EXPECT_LT(start_m, kParallelize5DTile2DRangeM);
5935*b095b053SXin Li EXPECT_LE(start_l + tile_l, kParallelize5DTile2DRangeL);
5936*b095b053SXin Li EXPECT_LE(start_m + tile_m, kParallelize5DTile2DRangeM);
5937*b095b053SXin Li }
5938*b095b053SXin Li
/*
 * Dispatches CheckBounds5DTile2D over the full 5D range with 2D tiling on a
 * pool created with pthreadpool_create(1); the bounds assertions live in the
 * task itself.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_5d_tile_2d_t task = CheckBounds5DTile2D;
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, nullptr /* context */,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);
}
5951*b095b053SXin Li
/*
 * Dispatches CheckBounds5DTile2D over the full 5D range with 2D tiling on a
 * pool created with pthreadpool_create(0); the bounds assertions live in the
 * task itself. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_5d_tile_2d_t task = CheckBounds5DTile2D;
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, nullptr /* context */,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);
}
5968*b095b053SXin Li
CheckTiling5DTile2D(void *,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)5969*b095b053SXin Li static void CheckTiling5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
5970*b095b053SXin Li EXPECT_GT(tile_l, 0);
5971*b095b053SXin Li EXPECT_LE(tile_l, kParallelize5DTile2DTileL);
5972*b095b053SXin Li EXPECT_EQ(start_l % kParallelize5DTile2DTileL, 0);
5973*b095b053SXin Li EXPECT_EQ(tile_l, std::min<size_t>(kParallelize5DTile2DTileL, kParallelize5DTile2DRangeL - start_l));
5974*b095b053SXin Li
5975*b095b053SXin Li EXPECT_GT(tile_m, 0);
5976*b095b053SXin Li EXPECT_LE(tile_m, kParallelize5DTile2DTileM);
5977*b095b053SXin Li EXPECT_EQ(start_m % kParallelize5DTile2DTileM, 0);
5978*b095b053SXin Li EXPECT_EQ(tile_m, std::min<size_t>(kParallelize5DTile2DTileM, kParallelize5DTile2DRangeM - start_m));
5979*b095b053SXin Li }
5980*b095b053SXin Li
/*
 * Dispatches CheckTiling5DTile2D over the full 5D range with 2D tiling on a
 * pool created with pthreadpool_create(1); the tiling assertions live in the
 * task itself.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const pthreadpool_task_5d_tile_2d_t task = CheckTiling5DTile2D;
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, nullptr /* context */,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);
}
5993*b095b053SXin Li
/*
 * Dispatches CheckTiling5DTile2D over the full 5D range with 2D tiling on a
 * pool created with pthreadpool_create(0); the tiling assertions live in the
 * task itself. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	const pthreadpool_task_5d_tile_2d_t task = CheckTiling5DTile2D;
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, nullptr /* context */,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);
}
6010*b095b053SXin Li
SetTrue5DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)6011*b095b053SXin Li static void SetTrue5DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
6012*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
6013*b095b053SXin Li for (size_t m = start_m; m < start_m + tile_m; m++) {
6014*b095b053SXin Li const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m;
6015*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
6016*b095b053SXin Li }
6017*b095b053SXin Li }
6018*b095b053SXin Li }
6019*b095b053SXin Li
/*
 * Runs SetTrue5DTile2D once over the full 5D range with 2D tiling on a pool
 * created with pthreadpool_create(1) and expects every per-element indicator
 * to be true afterwards.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsProcessed) {
	const size_t item_count = kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_bool> indicators(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The task takes a typed indicator pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(SetTrue5DTile2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
6048*b095b053SXin Li
/*
 * Runs SetTrue5DTile2D once over the full 5D range with 2D tiling on a pool
 * created with pthreadpool_create(0) and expects every per-element indicator
 * to be true afterwards. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsProcessed) {
	const size_t item_count = kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_bool> indicators(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The task takes a typed indicator pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(SetTrue5DTile2D);
	void* const context = static_cast<void*>(indicators.data());
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed";
					}
				}
			}
		}
	}
}
6081*b095b053SXin Li
Increment5DTile2D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)6082*b095b053SXin Li static void Increment5DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
6083*b095b053SXin Li for (size_t l = start_l; l < start_l + tile_l; l++) {
6084*b095b053SXin Li for (size_t m = start_m; m < start_m + tile_m; m++) {
6085*b095b053SXin Li const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m;
6086*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
6087*b095b053SXin Li }
6088*b095b053SXin Li }
6089*b095b053SXin Li }
6090*b095b053SXin Li
/*
 * Runs Increment5DTile2D once over the full 5D range with 2D tiling on a pool
 * created with pthreadpool_create(1) and expects every per-element counter to
 * equal 1.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t item_count = kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
6120*b095b053SXin Li
/*
 * Runs Increment5DTile2D once over the full 5D range with 2D tiling on a pool
 * created with pthreadpool_create(0) and expects every per-element counter to
 * equal 1. Skipped when the pool reports at most one thread.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t item_count = kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count <= 1) {
		GTEST_SKIP();
	}

	/* The task takes a typed counter pointer, so it is cast to the generic task type. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
	void* const context = static_cast<void*>(counters.data());
	pthreadpool_parallelize_5d_tile_2d(
		threadpool.get(), task, context,
		kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
		kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
		0 /* flags */);

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
					}
				}
			}
		}
	}
}
6154*b095b053SXin Li
/*
 * Runs Increment5DTile2D over the full 5D range with 2D tiling
 * kIncrementIterations5D times on a pool created with pthreadpool_create(1)
 * and expects every per-element counter to equal kIncrementIterations5D
 * afterwards.
 */
TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t item_count = kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
	std::vector<std::atomic_int> counters(item_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* Task and context are the same for every pass, so they are set up once. */
	const pthreadpool_task_5d_tile_2d_t task = reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
	void* const context = static_cast<void*>(counters.data());
	for (size_t iteration = 0; iteration < kIncrementIterations5D; ++iteration) {
		pthreadpool_parallelize_5d_tile_2d(
			threadpool.get(), task, context,
			kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
			kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
			0 /* flags */);
	}

	/* Walk the range in the same flattened order the task uses for indexing. */
	for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) {
		for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) {
			const size_t offset_ij = i * kParallelize5DTile2DRangeJ + j;
			for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) {
				const size_t offset_ijk = offset_ij * kParallelize5DTile2DRangeK + k;
				for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) {
					const size_t offset_ijkl = offset_ijk * kParallelize5DTile2DRangeL + l;
					for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) {
						const size_t linear_idx = offset_ijkl * kParallelize5DTile2DRangeM + m;
						EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D)
							<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
							<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
							<< "(expected: " << kIncrementIterations5D << ")";
					}
				}
			}
		}
	}
}
6187*b095b053SXin Li
/*
 * Same check as the single-thread variant, but on a pool created with a
 * thread-count argument of 0. The test is skipped when that pool reports
 * one thread or fewer.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
  const size_t element_count =
    kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
    kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;
  std::vector<std::atomic_int> counters(element_count);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  const pthreadpool_task_5d_tile_2d_t task =
    reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D);
  void* const context = static_cast<void*>(counters.data());
  for (size_t pass = 0; pass < kIncrementIterations5D; ++pass) {
    pthreadpool_parallelize_5d_tile_2d(
      pool.get(),
      task,
      context,
      kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
      kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
      0 /* flags */);
  }

  /* Build the row-major offset one dimension at a time while walking the index space. */
  for (size_t i = 0; i < kParallelize5DTile2DRangeI; ++i) {
    const size_t offset_i = i * kParallelize5DTile2DRangeJ;
    for (size_t j = 0; j < kParallelize5DTile2DRangeJ; ++j) {
      const size_t offset_ij = (offset_i + j) * kParallelize5DTile2DRangeK;
      for (size_t k = 0; k < kParallelize5DTile2DRangeK; ++k) {
        const size_t offset_ijk = (offset_ij + k) * kParallelize5DTile2DRangeL;
        for (size_t l = 0; l < kParallelize5DTile2DRangeL; ++l) {
          const size_t offset_ijkl = (offset_ijk + l) * kParallelize5DTile2DRangeM;
          for (size_t m = 0; m < kParallelize5DTile2DRangeM; ++m) {
            const size_t linear_idx = offset_ijkl + m;
            const int observed = counters[linear_idx].load(std::memory_order_relaxed);
            EXPECT_EQ(observed, kIncrementIterations5D)
              << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed "
              << observed << " times "
              << "(expected: " << kIncrementIterations5D << ")";
          }
        }
      }
    }
  }
}
6224*b095b053SXin Li
/*
 * Task body: bumps the single shared counter once for every (l, m) element
 * of the tile [start_l, start_l + tile_l) x [start_m, start_m + tile_m).
 * The i, j and k coordinates are accepted but not used.
 */
static void IncrementSame5DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
  const size_t end_l = start_l + tile_l;
  const size_t end_m = start_m + tile_m;
  for (size_t row = start_l; row < end_l; ++row) {
    for (size_t col = start_m; col < end_m; ++col) {
      /* One atomic increment per element, so all elements hit the same counter. */
      num_processed_items->fetch_add(1, std::memory_order_relaxed);
    }
  }
}
6232*b095b053SXin Li
/*
 * Runs IncrementSame5DTile2D, which increments one shared atomic counter
 * per element, and checks that the final count equals the size of the
 * 5D index space. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolHighContention) {
  std::atomic_int processed_total = ATOMIC_VAR_INIT(0);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  const size_t expected_items =
    kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
    kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;

  pthreadpool_parallelize_5d_tile_2d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(IncrementSame5DTile2D),
    static_cast<void*>(&processed_total),
    kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
    kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
    0 /* flags */);

  const int observed = processed_total.load(std::memory_order_relaxed);
  EXPECT_EQ(observed, expected_items);
}
6252*b095b053SXin Li
WorkImbalance5DTile2D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t start_l,size_t start_m,size_t tile_l,size_t tile_m)6253*b095b053SXin Li static void WorkImbalance5DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) {
6254*b095b053SXin Li num_processed_items->fetch_add(tile_l * tile_m, std::memory_order_relaxed);
6255*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && start_l == 0 && start_m == 0) {
6256*b095b053SXin Li /* Spin-wait until all items are computed */
6257*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM) {
6258*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
6259*b095b053SXin Li }
6260*b095b053SXin Li }
6261*b095b053SXin Li }
6262*b095b053SXin Li
/*
 * Runs WorkImbalance5DTile2D, whose origin tile spins until every element
 * has been counted. The call can therefore only return once the remaining
 * tiles have been processed while one invocation is blocked. Afterwards the
 * counter must equal the size of the 5D index space. Skipped when the pool
 * reports one thread or fewer.
 */
TEST(Parallelize5DTile2D, MultiThreadPoolWorkStealing) {
  std::atomic_int processed_total = ATOMIC_VAR_INIT(0);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  const size_t expected_items =
    kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK *
    kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM;

  pthreadpool_parallelize_5d_tile_2d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(WorkImbalance5DTile2D),
    static_cast<void*>(&processed_total),
    kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM,
    kParallelize5DTile2DTileL, kParallelize5DTile2DTileM,
    0 /* flags */);

  const int observed = processed_total.load(std::memory_order_relaxed);
  EXPECT_EQ(observed, expected_items);
}
6282*b095b053SXin Li
/* No-op 6D task: ignores its context pointer and all six indices. */
static void ComputeNothing6D(
  void* /* context */,
  size_t /* i */, size_t /* j */, size_t /* k */,
  size_t /* l */, size_t /* m */, size_t /* n */)
{
}
6285*b095b053SXin Li
/*
 * Smoke test: a 6D parallelization of the no-op task on a pool created
 * with a single thread must return.
 */
TEST(Parallelize6D, SingleThreadPoolCompletes) {
  auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  pthreadpool_parallelize_6d(
    pool.get(),
    ComputeNothing6D,
    nullptr,
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);
}
6296*b095b053SXin Li
/*
 * Smoke test: a 6D parallelization of the no-op task on a pool created
 * with a thread-count argument of 0 must return. Skipped when the pool
 * reports one thread or fewer.
 */
TEST(Parallelize6D, MultiThreadPoolCompletes) {
  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_6d(
    pool.get(),
    ComputeNothing6D,
    nullptr,
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);
}
6312*b095b053SXin Li
CheckBounds6D(void *,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6313*b095b053SXin Li static void CheckBounds6D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6314*b095b053SXin Li EXPECT_LT(i, kParallelize6DRangeI);
6315*b095b053SXin Li EXPECT_LT(j, kParallelize6DRangeJ);
6316*b095b053SXin Li EXPECT_LT(k, kParallelize6DRangeK);
6317*b095b053SXin Li EXPECT_LT(l, kParallelize6DRangeL);
6318*b095b053SXin Li EXPECT_LT(m, kParallelize6DRangeM);
6319*b095b053SXin Li EXPECT_LT(n, kParallelize6DRangeN);
6320*b095b053SXin Li }
6321*b095b053SXin Li
/*
 * Runs CheckBounds6D over the 6D range on a pool created with a single
 * thread. Any out-of-range index is reported by the task itself.
 */
TEST(Parallelize6D, SingleThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  pthreadpool_parallelize_6d(
    pool.get(),
    CheckBounds6D,
    nullptr,
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);
}
6333*b095b053SXin Li
/*
 * Runs CheckBounds6D over the 6D range on a pool created with a
 * thread-count argument of 0. Any out-of-range index is reported by the
 * task itself. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize6D, MultiThreadPoolAllItemsInBounds) {
  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_6d(
    pool.get(),
    CheckBounds6D,
    nullptr,
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);
}
6349*b095b053SXin Li
SetTrue6D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6350*b095b053SXin Li static void SetTrue6D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6351*b095b053SXin Li const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n;
6352*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
6353*b095b053SXin Li }
6354*b095b053SXin Li
/*
 * Runs SetTrue6D over the 6D range on a pool created with a single thread
 * and checks that the indicator of every element ended up true.
 */
TEST(Parallelize6D, SingleThreadPoolAllItemsProcessed) {
  const size_t element_count =
    kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
    kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
  std::vector<std::atomic_bool> indicators(element_count);

  auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  pthreadpool_parallelize_6d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_6d_t>(SetTrue6D),
    static_cast<void*>(indicators.data()),
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);

  /* Build the row-major offset one dimension at a time while walking the index space. */
  for (size_t i = 0; i < kParallelize6DRangeI; ++i) {
    const size_t offset_i = i * kParallelize6DRangeJ;
    for (size_t j = 0; j < kParallelize6DRangeJ; ++j) {
      const size_t offset_ij = (offset_i + j) * kParallelize6DRangeK;
      for (size_t k = 0; k < kParallelize6DRangeK; ++k) {
        const size_t offset_ijk = (offset_ij + k) * kParallelize6DRangeL;
        for (size_t l = 0; l < kParallelize6DRangeL; ++l) {
          const size_t offset_ijkl = (offset_ijk + l) * kParallelize6DRangeM;
          for (size_t m = 0; m < kParallelize6DRangeM; ++m) {
            const size_t offset_ijklm = (offset_ijkl + m) * kParallelize6DRangeN;
            for (size_t n = 0; n < kParallelize6DRangeN; ++n) {
              const size_t linear_idx = offset_ijklm + n;
              const bool processed = indicators[linear_idx].load(std::memory_order_relaxed);
              EXPECT_TRUE(processed)
                << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
            }
          }
        }
      }
    }
  }
}
6384*b095b053SXin Li
/*
 * Runs SetTrue6D over the 6D range on a pool created with a thread-count
 * argument of 0 and checks that the indicator of every element ended up
 * true. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize6D, MultiThreadPoolAllItemsProcessed) {
  const size_t element_count =
    kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
    kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
  std::vector<std::atomic_bool> indicators(element_count);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_6d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_6d_t>(SetTrue6D),
    static_cast<void*>(indicators.data()),
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);

  /* Build the row-major offset one dimension at a time while walking the index space. */
  for (size_t i = 0; i < kParallelize6DRangeI; ++i) {
    const size_t offset_i = i * kParallelize6DRangeJ;
    for (size_t j = 0; j < kParallelize6DRangeJ; ++j) {
      const size_t offset_ij = (offset_i + j) * kParallelize6DRangeK;
      for (size_t k = 0; k < kParallelize6DRangeK; ++k) {
        const size_t offset_ijk = (offset_ij + k) * kParallelize6DRangeL;
        for (size_t l = 0; l < kParallelize6DRangeL; ++l) {
          const size_t offset_ijkl = (offset_ijk + l) * kParallelize6DRangeM;
          for (size_t m = 0; m < kParallelize6DRangeM; ++m) {
            const size_t offset_ijklm = (offset_ijkl + m) * kParallelize6DRangeN;
            for (size_t n = 0; n < kParallelize6DRangeN; ++n) {
              const size_t linear_idx = offset_ijklm + n;
              const bool processed = indicators[linear_idx].load(std::memory_order_relaxed);
              EXPECT_TRUE(processed)
                << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
            }
          }
        }
      }
    }
  }
}
6418*b095b053SXin Li
Increment6D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6419*b095b053SXin Li static void Increment6D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6420*b095b053SXin Li const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n;
6421*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
6422*b095b053SXin Li }
6423*b095b053SXin Li
/*
 * Runs Increment6D once over the 6D range on a pool created with a single
 * thread and checks that every per-element counter equals 1.
 */
TEST(Parallelize6D, SingleThreadPoolEachItemProcessedOnce) {
  const size_t element_count =
    kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
    kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
  std::vector<std::atomic_int> counters(element_count);

  auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  pthreadpool_parallelize_6d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_6d_t>(Increment6D),
    static_cast<void*>(counters.data()),
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);

  /* Build the row-major offset one dimension at a time while walking the index space. */
  for (size_t i = 0; i < kParallelize6DRangeI; ++i) {
    const size_t offset_i = i * kParallelize6DRangeJ;
    for (size_t j = 0; j < kParallelize6DRangeJ; ++j) {
      const size_t offset_ij = (offset_i + j) * kParallelize6DRangeK;
      for (size_t k = 0; k < kParallelize6DRangeK; ++k) {
        const size_t offset_ijk = (offset_ij + k) * kParallelize6DRangeL;
        for (size_t l = 0; l < kParallelize6DRangeL; ++l) {
          const size_t offset_ijkl = (offset_ijk + l) * kParallelize6DRangeM;
          for (size_t m = 0; m < kParallelize6DRangeM; ++m) {
            const size_t offset_ijklm = (offset_ijkl + m) * kParallelize6DRangeN;
            for (size_t n = 0; n < kParallelize6DRangeN; ++n) {
              const size_t linear_idx = offset_ijklm + n;
              const int observed = counters[linear_idx].load(std::memory_order_relaxed);
              EXPECT_EQ(observed, 1)
                << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
                << observed << " times (expected: 1)";
            }
          }
        }
      }
    }
  }
}
6454*b095b053SXin Li
/*
 * Runs Increment6D once over the 6D range on a pool created with a
 * thread-count argument of 0 and checks that every per-element counter
 * equals 1. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize6D, MultiThreadPoolEachItemProcessedOnce) {
  const size_t element_count =
    kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
    kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
  std::vector<std::atomic_int> counters(element_count);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_6d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_6d_t>(Increment6D),
    static_cast<void*>(counters.data()),
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);

  /* Build the row-major offset one dimension at a time while walking the index space. */
  for (size_t i = 0; i < kParallelize6DRangeI; ++i) {
    const size_t offset_i = i * kParallelize6DRangeJ;
    for (size_t j = 0; j < kParallelize6DRangeJ; ++j) {
      const size_t offset_ij = (offset_i + j) * kParallelize6DRangeK;
      for (size_t k = 0; k < kParallelize6DRangeK; ++k) {
        const size_t offset_ijk = (offset_ij + k) * kParallelize6DRangeL;
        for (size_t l = 0; l < kParallelize6DRangeL; ++l) {
          const size_t offset_ijkl = (offset_ijk + l) * kParallelize6DRangeM;
          for (size_t m = 0; m < kParallelize6DRangeM; ++m) {
            const size_t offset_ijklm = (offset_ijkl + m) * kParallelize6DRangeN;
            for (size_t n = 0; n < kParallelize6DRangeN; ++n) {
              const size_t linear_idx = offset_ijklm + n;
              const int observed = counters[linear_idx].load(std::memory_order_relaxed);
              EXPECT_EQ(observed, 1)
                << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
                << observed << " times (expected: 1)";
            }
          }
        }
      }
    }
  }
}
6489*b095b053SXin Li
/*
 * Submits Increment6D kIncrementIterations6D times to a pool created with
 * a single thread, then checks that every per-element counter equals the
 * number of submissions.
 */
TEST(Parallelize6D, SingleThreadPoolEachItemProcessedMultipleTimes) {
  std::vector<std::atomic_int> counters(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN);

  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(threadpool.get());

  for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) {
    pthreadpool_parallelize_6d(
      threadpool.get(),
      reinterpret_cast<pthreadpool_task_6d_t>(Increment6D),
      static_cast<void*>(counters.data()),
      kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
      0 /* flags */);
  }

  for (size_t i = 0; i < kParallelize6DRangeI; i++) {
    for (size_t j = 0; j < kParallelize6DRangeJ; j++) {
      for (size_t k = 0; k < kParallelize6DRangeK; k++) {
        for (size_t l = 0; l < kParallelize6DRangeL; l++) {
          for (size_t m = 0; m < kParallelize6DRangeM; m++) {
            for (size_t n = 0; n < kParallelize6DRangeN; n++) {
              /*
               * Must mirror the indexing in Increment6D, including the
               * trailing "+ n": without it only the n == 0 counter of each
               * row is inspected and the other counters are never verified.
               */
              const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n;
              EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
                << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
                << counters[linear_idx].load(std::memory_order_relaxed) << " times "
                << "(expected: " << kIncrementIterations6D << ")";
            }
          }
        }
      }
    }
  }
}
6523*b095b053SXin Li
/*
 * Submits Increment6D kIncrementIterations6D times to a pool created with
 * a thread-count argument of 0, then checks that every per-element counter
 * equals the number of submissions. Skipped when the pool reports one
 * thread or fewer.
 */
TEST(Parallelize6D, MultiThreadPoolEachItemProcessedMultipleTimes) {
  const size_t element_count =
    kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
    kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;
  std::vector<std::atomic_int> counters(element_count);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  const pthreadpool_task_6d_t task = reinterpret_cast<pthreadpool_task_6d_t>(Increment6D);
  void* const context = static_cast<void*>(counters.data());
  for (size_t pass = 0; pass < kIncrementIterations6D; ++pass) {
    pthreadpool_parallelize_6d(
      pool.get(),
      task,
      context,
      kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
      kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
      0 /* flags */);
  }

  /* Build the row-major offset one dimension at a time while walking the index space. */
  for (size_t i = 0; i < kParallelize6DRangeI; ++i) {
    const size_t offset_i = i * kParallelize6DRangeJ;
    for (size_t j = 0; j < kParallelize6DRangeJ; ++j) {
      const size_t offset_ij = (offset_i + j) * kParallelize6DRangeK;
      for (size_t k = 0; k < kParallelize6DRangeK; ++k) {
        const size_t offset_ijk = (offset_ij + k) * kParallelize6DRangeL;
        for (size_t l = 0; l < kParallelize6DRangeL; ++l) {
          const size_t offset_ijkl = (offset_ijk + l) * kParallelize6DRangeM;
          for (size_t m = 0; m < kParallelize6DRangeM; ++m) {
            const size_t offset_ijklm = (offset_ijkl + m) * kParallelize6DRangeN;
            for (size_t n = 0; n < kParallelize6DRangeN; ++n) {
              const size_t linear_idx = offset_ijklm + n;
              const int observed = counters[linear_idx].load(std::memory_order_relaxed);
              EXPECT_EQ(observed, kIncrementIterations6D)
                << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
                << observed << " times "
                << "(expected: " << kIncrementIterations6D << ")";
            }
          }
        }
      }
    }
  }
}
6561*b095b053SXin Li
/*
 * 6D task body: adds one to the single shared counter on every call.
 * All six indices are accepted but not used.
 */
static void IncrementSame6D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
  std::atomic_int& shared_counter = *num_processed_items;
  shared_counter.fetch_add(1, std::memory_order_relaxed);
}
6565*b095b053SXin Li
/*
 * Runs IncrementSame6D, which increments one shared atomic counter per
 * element, and checks that the final count equals the size of the 6D index
 * space. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize6D, MultiThreadPoolHighContention) {
  std::atomic_int processed_total = ATOMIC_VAR_INIT(0);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  const size_t expected_items =
    kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
    kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;

  pthreadpool_parallelize_6d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_6d_t>(IncrementSame6D),
    static_cast<void*>(&processed_total),
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);

  const int observed = processed_total.load(std::memory_order_relaxed);
  EXPECT_EQ(observed, expected_items);
}
6584*b095b053SXin Li
WorkImbalance6D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t m,size_t n)6585*b095b053SXin Li static void WorkImbalance6D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
6586*b095b053SXin Li num_processed_items->fetch_add(1, std::memory_order_relaxed);
6587*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0 && n == 0) {
6588*b095b053SXin Li /* Spin-wait until all items are computed */
6589*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN) {
6590*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
6591*b095b053SXin Li }
6592*b095b053SXin Li }
6593*b095b053SXin Li }
6594*b095b053SXin Li
/*
 * Runs WorkImbalance6D, whose origin element spins until every element has
 * been counted. The call can therefore only return once the remaining
 * elements have been processed while one invocation is blocked. Afterwards
 * the counter must equal the size of the 6D index space. Skipped when the
 * pool reports one thread or fewer.
 */
TEST(Parallelize6D, MultiThreadPoolWorkStealing) {
  std::atomic_int processed_total = ATOMIC_VAR_INIT(0);

  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  const size_t expected_items =
    kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK *
    kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN;

  pthreadpool_parallelize_6d(
    pool.get(),
    reinterpret_cast<pthreadpool_task_6d_t>(WorkImbalance6D),
    static_cast<void*>(&processed_total),
    kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK,
    kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN,
    0 /* flags */);

  const int observed = processed_total.load(std::memory_order_relaxed);
  EXPECT_EQ(observed, expected_items);
}
6613*b095b053SXin Li
/*
 * No-op task for the 6D, 1D-tiled API: ignores its context pointer and all
 * seven size_t arguments.
 */
static void ComputeNothing6DTile1D(
  void* /* context */,
  size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */, size_t /* m */,
  size_t /* start_n */, size_t /* tile_n */)
{
}
6616*b095b053SXin Li
/*
 * Smoke test: a 6D parallelization tiled along the last dimension, running
 * the no-op task on a pool created with a single thread, must return.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolCompletes) {
  auto_pthreadpool_t pool(pthreadpool_create(1), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  pthreadpool_parallelize_6d_tile_1d(
    pool.get(),
    ComputeNothing6DTile1D,
    nullptr,
    kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
    kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
    kParallelize6DTile1DTileN,
    0 /* flags */);
}
6628*b095b053SXin Li
/*
 * Smoke test: a 6D parallelization tiled along the last dimension, running
 * the no-op task on a pool created with a thread-count argument of 0, must
 * return. Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolCompletes) {
  auto_pthreadpool_t pool(pthreadpool_create(0), pthreadpool_destroy);
  ASSERT_TRUE(pool.get());

  const size_t threads_count = pthreadpool_get_threads_count(pool.get());
  if (threads_count <= 1) {
    GTEST_SKIP();
  }

  pthreadpool_parallelize_6d_tile_1d(
    pool.get(),
    ComputeNothing6DTile1D,
    nullptr,
    kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
    kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
    kParallelize6DTile1DTileN,
    0 /* flags */);
}
6645*b095b053SXin Li
CheckBounds6DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6646*b095b053SXin Li static void CheckBounds6DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6647*b095b053SXin Li EXPECT_LT(i, kParallelize6DTile1DRangeI);
6648*b095b053SXin Li EXPECT_LT(j, kParallelize6DTile1DRangeJ);
6649*b095b053SXin Li EXPECT_LT(k, kParallelize6DTile1DRangeK);
6650*b095b053SXin Li EXPECT_LT(l, kParallelize6DTile1DRangeL);
6651*b095b053SXin Li EXPECT_LT(m, kParallelize6DTile1DRangeM);
6652*b095b053SXin Li EXPECT_LT(start_n, kParallelize6DTile1DRangeN);
6653*b095b053SXin Li EXPECT_LE(start_n + tile_n, kParallelize6DTile1DRangeN);
6654*b095b053SXin Li }
6655*b095b053SXin Li
/* On a pool created with one thread, every task invocation must receive in-range indices and an in-range tile. */
TEST(Parallelize6DTile1D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* The assertions live in the task itself; it needs no context. */
	pthreadpool_parallelize_6d_tile_1d(
		pool,
		CheckBounds6DTile1D,
		nullptr,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6668*b095b053SXin Li
/* On a multi-threaded pool, every task invocation must receive in-range indices and an in-range tile. */
TEST(Parallelize6DTile1D, MultiThreadPoolAllItemsInBounds) {
	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		CheckBounds6DTile1D,
		nullptr,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6685*b095b053SXin Li
CheckTiling6DTile1D(void *,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6686*b095b053SXin Li static void CheckTiling6DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6687*b095b053SXin Li EXPECT_GT(tile_n, 0);
6688*b095b053SXin Li EXPECT_LE(tile_n, kParallelize6DTile1DTileN);
6689*b095b053SXin Li EXPECT_EQ(start_n % kParallelize6DTile1DTileN, 0);
6690*b095b053SXin Li EXPECT_EQ(tile_n, std::min<size_t>(kParallelize6DTile1DTileN, kParallelize6DTile1DRangeN - start_n));
6691*b095b053SXin Li }
6692*b095b053SXin Li
/* On a pool created with one thread, every tile handed to the task must satisfy the tiling checks. */
TEST(Parallelize6DTile1D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* The assertions live in the task itself; it needs no context. */
	pthreadpool_parallelize_6d_tile_1d(
		pool,
		CheckTiling6DTile1D,
		nullptr,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6705*b095b053SXin Li
/* On a multi-threaded pool, every tile handed to the task must satisfy the tiling checks. */
TEST(Parallelize6DTile1D, MultiThreadPoolUniformTiling) {
	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		CheckTiling6DTile1D,
		nullptr,
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);
}
6722*b095b053SXin Li
SetTrue6DTile1D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6723*b095b053SXin Li static void SetTrue6DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6724*b095b053SXin Li for (size_t n = start_n; n < start_n + tile_n; n++) {
6725*b095b053SXin Li const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n;
6726*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
6727*b095b053SXin Li }
6728*b095b053SXin Li }
6729*b095b053SXin Li
/* After one pass on a pool created with one thread, every element of the 6D range must be flagged as processed. */
TEST(Parallelize6DTile1D, SingleThreadPoolAllItemsProcessed) {
	/* One flag per element of the flattened 6D range. */
	std::vector<std::atomic_bool> indicators(
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(SetTrue6DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Visit every element, extending the flat row-major index by one dimension per loop level. */
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) {
			const size_t idx_ij = i * kParallelize6DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) {
				const size_t idx_ijk = idx_ij * kParallelize6DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) {
					const size_t idx_ijkl = idx_ijk * kParallelize6DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) {
						const size_t idx_ijklm = idx_ijkl * kParallelize6DTile1DRangeM + m;
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) {
							const size_t linear_idx = idx_ijklm * kParallelize6DTile1DRangeN + n;
							EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
6760*b095b053SXin Li
/* After one pass on a multi-threaded pool, every element of the 6D range must be flagged as processed. */
TEST(Parallelize6DTile1D, MultiThreadPoolAllItemsProcessed) {
	/* One flag per element of the flattened 6D range. */
	std::vector<std::atomic_bool> indicators(
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);

	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(SetTrue6DTile1D),
		static_cast<void*>(indicators.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Visit every element, extending the flat row-major index by one dimension per loop level. */
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) {
			const size_t idx_ij = i * kParallelize6DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) {
				const size_t idx_ijk = idx_ij * kParallelize6DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) {
					const size_t idx_ijkl = idx_ijk * kParallelize6DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) {
						const size_t idx_ijklm = idx_ijkl * kParallelize6DTile1DRangeM + m;
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) {
							const size_t linear_idx = idx_ijklm * kParallelize6DTile1DRangeN + n;
							EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
						}
					}
				}
			}
		}
	}
}
6795*b095b053SXin Li
Increment6DTile1D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6796*b095b053SXin Li static void Increment6DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6797*b095b053SXin Li for (size_t n = start_n; n < start_n + tile_n; n++) {
6798*b095b053SXin Li const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n;
6799*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
6800*b095b053SXin Li }
6801*b095b053SXin Li }
6802*b095b053SXin Li
/* After one pass on a pool created with one thread, every element's counter must read exactly 1. */
TEST(Parallelize6DTile1D, SingleThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the flattened 6D range. */
	std::vector<std::atomic_int> counters(
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
		static_cast<void*>(counters.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Visit every element, extending the flat row-major index by one dimension per loop level. */
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) {
			const size_t idx_ij = i * kParallelize6DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) {
				const size_t idx_ijk = idx_ij * kParallelize6DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) {
					const size_t idx_ijkl = idx_ijk * kParallelize6DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) {
						const size_t idx_ijklm = idx_ijkl * kParallelize6DTile1DRangeM + m;
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) {
							const size_t linear_idx = idx_ijklm * kParallelize6DTile1DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
						}
					}
				}
			}
		}
	}
}
6834*b095b053SXin Li
/* After one pass on a multi-threaded pool, every element's counter must read exactly 1. */
TEST(Parallelize6DTile1D, MultiThreadPoolEachItemProcessedOnce) {
	/* One counter per element of the flattened 6D range. */
	std::vector<std::atomic_int> counters(
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);

	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
		static_cast<void*>(counters.data()),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	/* Visit every element, extending the flat row-major index by one dimension per loop level. */
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) {
			const size_t idx_ij = i * kParallelize6DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) {
				const size_t idx_ijk = idx_ij * kParallelize6DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) {
					const size_t idx_ijkl = idx_ijk * kParallelize6DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) {
						const size_t idx_ijklm = idx_ijkl * kParallelize6DTile1DRangeM + m;
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) {
							const size_t linear_idx = idx_ijklm * kParallelize6DTile1DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
						}
					}
				}
			}
		}
	}
}
6870*b095b053SXin Li
/*
 * Runs the same parallel loop kIncrementIterations6D times on a pool created
 * with one thread; every element's counter must end up at that iteration count.
 */
TEST(Parallelize6DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the flattened 6D range. */
	std::vector<std::atomic_int> counters(
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* Reuse the same pool and the same counters for every pass. */
	for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) {
		pthreadpool_parallelize_6d_tile_1d(
			pool,
			reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
			static_cast<void*>(counters.data()),
			kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
			kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
			kParallelize6DTile1DTileN,
			0 /* flags */);
	}

	/* Visit every element, extending the flat row-major index by one dimension per loop level. */
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) {
			const size_t idx_ij = i * kParallelize6DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) {
				const size_t idx_ijk = idx_ij * kParallelize6DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) {
					const size_t idx_ijkl = idx_ijk * kParallelize6DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) {
						const size_t idx_ijklm = idx_ijkl * kParallelize6DTile1DRangeM + m;
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) {
							const size_t linear_idx = idx_ijklm * kParallelize6DTile1DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
								<< "(expected: " << kIncrementIterations6D << ")";
						}
					}
				}
			}
		}
	}
}
6905*b095b053SXin Li
/*
 * Runs the same parallel loop kIncrementIterations6D times on a multi-threaded
 * pool; every element's counter must end up at that iteration count.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	/* One counter per element of the flattened 6D range. */
	std::vector<std::atomic_int> counters(
		kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK *
		kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);

	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* Reuse the same pool and the same counters for every pass. */
	for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) {
		pthreadpool_parallelize_6d_tile_1d(
			pool,
			reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(Increment6DTile1D),
			static_cast<void*>(counters.data()),
			kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
			kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
			kParallelize6DTile1DTileN,
			0 /* flags */);
	}

	/* Visit every element, extending the flat row-major index by one dimension per loop level. */
	for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) {
		for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) {
			const size_t idx_ij = i * kParallelize6DTile1DRangeJ + j;
			for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) {
				const size_t idx_ijk = idx_ij * kParallelize6DTile1DRangeK + k;
				for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) {
					const size_t idx_ijkl = idx_ijk * kParallelize6DTile1DRangeL + l;
					for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) {
						const size_t idx_ijklm = idx_ijkl * kParallelize6DTile1DRangeM + m;
						for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) {
							const size_t linear_idx = idx_ijklm * kParallelize6DTile1DRangeN + n;
							EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
								<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
								<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
								<< "(expected: " << kIncrementIterations6D << ")";
						}
					}
				}
			}
		}
	}
}
6944*b095b053SXin Li
/*
 * Task that bumps one shared counter once per element of the received tile.
 * The increments are issued one at a time rather than as a single add of
 * tile_n. The leading indices (i, j, k, l, m) are not used.
 */
static void IncrementSame6DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
	const size_t end_n = start_n + tile_n;
	size_t n = start_n;
	while (n < end_n) {
		num_processed_items->fetch_add(1, std::memory_order_relaxed);
		n++;
	}
}
6950*b095b053SXin Li
/*
 * All tasks increment one shared atomic counter, element by element; after the
 * parallel loop the counter must equal the number of elements in the 6D range.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolHighContention) {
	std::atomic_int num_processed_items(0);

	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(IncrementSame6DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);
}
6970*b095b053SXin Li
WorkImbalance6DTile1D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t m,size_t start_n,size_t tile_n)6971*b095b053SXin Li static void WorkImbalance6DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) {
6972*b095b053SXin Li num_processed_items->fetch_add(tile_n, std::memory_order_relaxed);
6973*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0 && start_n == 0) {
6974*b095b053SXin Li /* Spin-wait until all items are computed */
6975*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN) {
6976*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
6977*b095b053SXin Li }
6978*b095b053SXin Li }
6979*b095b053SXin Li }
6980*b095b053SXin Li
/*
 * Runs the imbalanced task on a multi-threaded pool; the loop can only return
 * once every tile has been counted, and the final count must equal the number
 * of elements in the 6D range.
 */
TEST(Parallelize6DTile1D, MultiThreadPoolWorkStealing) {
	std::atomic_int num_processed_items(0);

	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_1d(
		pool,
		reinterpret_cast<pthreadpool_task_6d_tile_1d_t>(WorkImbalance6DTile1D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK,
		kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN,
		kParallelize6DTile1DTileN,
		0 /* flags */);

	EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN);
}
7000*b095b053SXin Li
/* No-op task for the 6D, 2D-tiled API: accepts the task arguments and ignores all of them. */
static void ComputeNothing6DTile2D(
	void* /* context */,
	size_t /* i */, size_t /* j */, size_t /* k */, size_t /* l */,
	size_t /* start_m */, size_t /* start_n */,
	size_t /* tile_m */, size_t /* tile_n */)
{
}
7003*b095b053SXin Li
/* Smoke test: a 6D loop tiled along its last two dimensions returns when run on a pool created with one thread. */
TEST(Parallelize6DTile2D, SingleThreadPoolCompletes) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* The task is a no-op, so the only thing checked is that the call comes back. */
	pthreadpool_parallelize_6d_tile_2d(
		pool,
		ComputeNothing6DTile2D,
		nullptr,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7015*b095b053SXin Li
/* Smoke test: a 6D loop tiled along its last two dimensions returns when run on a pool with more than one thread. */
TEST(Parallelize6DTile2D, MultiThreadPoolCompletes) {
	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		pool,
		ComputeNothing6DTile2D,
		nullptr,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7032*b095b053SXin Li
CheckBounds6DTile2D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7033*b095b053SXin Li static void CheckBounds6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7034*b095b053SXin Li EXPECT_LT(i, kParallelize6DTile2DRangeI);
7035*b095b053SXin Li EXPECT_LT(j, kParallelize6DTile2DRangeJ);
7036*b095b053SXin Li EXPECT_LT(k, kParallelize6DTile2DRangeK);
7037*b095b053SXin Li EXPECT_LT(l, kParallelize6DTile2DRangeL);
7038*b095b053SXin Li EXPECT_LT(start_m, kParallelize6DTile2DRangeM);
7039*b095b053SXin Li EXPECT_LT(start_n, kParallelize6DTile2DRangeN);
7040*b095b053SXin Li EXPECT_LE(start_m + tile_m, kParallelize6DTile2DRangeM);
7041*b095b053SXin Li EXPECT_LE(start_n + tile_n, kParallelize6DTile2DRangeN);
7042*b095b053SXin Li }
7043*b095b053SXin Li
/* On a pool created with one thread, every task invocation must receive in-range indices and in-range tiles. */
TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsInBounds) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* The assertions live in the task itself; it needs no context. */
	pthreadpool_parallelize_6d_tile_2d(
		pool,
		CheckBounds6DTile2D,
		nullptr,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7056*b095b053SXin Li
/* On a multi-threaded pool, every task invocation must receive in-range indices and in-range tiles. */
TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsInBounds) {
	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		pool,
		CheckBounds6DTile2D,
		nullptr,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7073*b095b053SXin Li
CheckTiling6DTile2D(void *,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7074*b095b053SXin Li static void CheckTiling6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7075*b095b053SXin Li EXPECT_GT(tile_m, 0);
7076*b095b053SXin Li EXPECT_LE(tile_m, kParallelize6DTile2DTileM);
7077*b095b053SXin Li EXPECT_EQ(start_m % kParallelize6DTile2DTileM, 0);
7078*b095b053SXin Li EXPECT_EQ(tile_m, std::min<size_t>(kParallelize6DTile2DTileM, kParallelize6DTile2DRangeM - start_m));
7079*b095b053SXin Li
7080*b095b053SXin Li EXPECT_GT(tile_n, 0);
7081*b095b053SXin Li EXPECT_LE(tile_n, kParallelize6DTile2DTileN);
7082*b095b053SXin Li EXPECT_EQ(start_n % kParallelize6DTile2DTileN, 0);
7083*b095b053SXin Li EXPECT_EQ(tile_n, std::min<size_t>(kParallelize6DTile2DTileN, kParallelize6DTile2DRangeN - start_n));
7084*b095b053SXin Li }
7085*b095b053SXin Li
/* On a pool created with one thread, every pair of tiles handed to the task must satisfy the tiling checks. */
TEST(Parallelize6DTile2D, SingleThreadPoolUniformTiling) {
	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* The assertions live in the task itself; it needs no context. */
	pthreadpool_parallelize_6d_tile_2d(
		pool,
		CheckTiling6DTile2D,
		nullptr,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7098*b095b053SXin Li
/* On a multi-threaded pool, every pair of tiles handed to the task must satisfy the tiling checks. */
TEST(Parallelize6DTile2D, MultiThreadPoolUniformTiling) {
	/* 0 = let the library choose the thread count (see pthreadpool.h). */
	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());
	pthreadpool* const pool = threadpool.get();

	/* With fewer than two threads there is nothing multi-threaded to exercise. */
	const size_t threads_count = pthreadpool_get_threads_count(pool);
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		pool,
		CheckTiling6DTile2D,
		nullptr,
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);
}
7115*b095b053SXin Li
SetTrue6DTile2D(std::atomic_bool * processed_indicators,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7116*b095b053SXin Li static void SetTrue6DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7117*b095b053SXin Li for (size_t m = start_m; m < start_m + tile_m; m++) {
7118*b095b053SXin Li for (size_t n = start_n; n < start_n + tile_n; n++) {
7119*b095b053SXin Li const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n;
7120*b095b053SXin Li processed_indicators[linear_idx].store(true, std::memory_order_relaxed);
7121*b095b053SXin Li }
7122*b095b053SXin Li }
7123*b095b053SXin Li }
7124*b095b053SXin Li
/*
 * Runs SetTrue6DTile2D over the whole 6D range on a pool created with
 * pthreadpool_create(1), then expects the indicator of every element to be
 * set.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
	const pthreadpool_task_6d_tile_2d_t task = reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(SetTrue6DTile2D);
	void* const context = indicators.data();
	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		task,
		context,
		kParallelize6DTile2DRangeI,
		kParallelize6DTile2DRangeJ,
		kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL,
		kParallelize6DTile2DRangeM,
		kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM,
		kParallelize6DTile2DTileN,
		0 /* flags */);

	/*
	 * Walk the indicators in storage order (n fastest, i slowest) and recover
	 * the coordinates from the flat index for the failure message.
	 */
	for (size_t linear_idx = 0; linear_idx < element_count; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
	}
}
7155*b095b053SXin Li
/*
 * Runs SetTrue6DTile2D over the whole 6D range on a pool created with
 * pthreadpool_create(0), then expects the indicator of every element to be
 * set. Skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsProcessed) {
	const size_t element_count = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_bool> indicators(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
	const pthreadpool_task_6d_tile_2d_t task = reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(SetTrue6DTile2D);
	void* const context = indicators.data();
	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		task,
		context,
		kParallelize6DTile2DRangeI,
		kParallelize6DTile2DRangeJ,
		kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL,
		kParallelize6DTile2DRangeM,
		kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM,
		kParallelize6DTile2DTileN,
		0 /* flags */);

	/*
	 * Walk the indicators in storage order (n fastest, i slowest) and recover
	 * the coordinates from the flat index for the failure message.
	 */
	for (size_t linear_idx = 0; linear_idx < element_count; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed";
	}
}
7190*b095b053SXin Li
Increment6DTile2D(std::atomic_int * processed_counters,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7191*b095b053SXin Li static void Increment6DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7192*b095b053SXin Li for (size_t m = start_m; m < start_m + tile_m; m++) {
7193*b095b053SXin Li for (size_t n = start_n; n < start_n + tile_n; n++) {
7194*b095b053SXin Li const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n;
7195*b095b053SXin Li processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
7196*b095b053SXin Li }
7197*b095b053SXin Li }
7198*b095b053SXin Li }
7199*b095b053SXin Li
/*
 * Runs Increment6DTile2D once over the whole 6D range on a pool created with
 * pthreadpool_create(1), then expects the counter of every element to be
 * exactly 1.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
	const pthreadpool_task_6d_tile_2d_t task = reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D);
	void* const context = counters.data();
	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		task,
		context,
		kParallelize6DTile2DRangeI,
		kParallelize6DTile2DRangeJ,
		kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL,
		kParallelize6DTile2DRangeM,
		kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM,
		kParallelize6DTile2DTileN,
		0 /* flags */);

	/*
	 * Walk the counters in storage order (n fastest, i slowest) and recover
	 * the coordinates from the flat index for the failure message.
	 */
	for (size_t linear_idx = 0; linear_idx < element_count; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
7231*b095b053SXin Li
/*
 * Runs Increment6DTile2D once over the whole 6D range on a pool created with
 * pthreadpool_create(0), then expects the counter of every element to be
 * exactly 1. Skipped when the pool reports fewer than two threads.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedOnce) {
	const size_t element_count = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
	const pthreadpool_task_6d_tile_2d_t task = reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D);
	void* const context = counters.data();
	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		task,
		context,
		kParallelize6DTile2DRangeI,
		kParallelize6DTile2DRangeJ,
		kParallelize6DTile2DRangeK,
		kParallelize6DTile2DRangeL,
		kParallelize6DTile2DRangeM,
		kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM,
		kParallelize6DTile2DTileN,
		0 /* flags */);

	/*
	 * Walk the counters in storage order (n fastest, i slowest) and recover
	 * the coordinates from the flat index for the failure message.
	 */
	for (size_t linear_idx = 0; linear_idx < element_count; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)";
	}
}
7267*b095b053SXin Li
/*
 * Runs Increment6DTile2D over the whole 6D range kIncrementIterations6D times
 * on the same pool (created with pthreadpool_create(1)), then expects the
 * counter of every element to equal kIncrementIterations6D.
 */
TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
	const pthreadpool_task_6d_tile_2d_t task = reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D);
	void* const context = counters.data();
	for (size_t pass = 0; pass < kIncrementIterations6D; ++pass) {
		pthreadpool_parallelize_6d_tile_2d(
			threadpool.get(),
			task,
			context,
			kParallelize6DTile2DRangeI,
			kParallelize6DTile2DRangeJ,
			kParallelize6DTile2DRangeK,
			kParallelize6DTile2DRangeL,
			kParallelize6DTile2DRangeM,
			kParallelize6DTile2DRangeN,
			kParallelize6DTile2DTileM,
			kParallelize6DTile2DTileN,
			0 /* flags */);
	}

	/*
	 * Walk the counters in storage order (n fastest, i slowest) and recover
	 * the coordinates from the flat index for the failure message.
	 */
	for (size_t linear_idx = 0; linear_idx < element_count; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations6D << ")";
	}
}
7302*b095b053SXin Li
/*
 * Runs Increment6DTile2D over the whole 6D range kIncrementIterations6D times
 * on the same pool (created with pthreadpool_create(0)), then expects the
 * counter of every element to equal kIncrementIterations6D. Skipped when the
 * pool reports fewer than two threads.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) {
	const size_t element_count = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	std::vector<std::atomic_int> counters(element_count);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	const auto threads_count = pthreadpool_get_threads_count(threadpool.get());
	if (threads_count < 2) {
		GTEST_SKIP();
	}

	/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
	const pthreadpool_task_6d_tile_2d_t task = reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D);
	void* const context = counters.data();
	for (size_t pass = 0; pass < kIncrementIterations6D; ++pass) {
		pthreadpool_parallelize_6d_tile_2d(
			threadpool.get(),
			task,
			context,
			kParallelize6DTile2DRangeI,
			kParallelize6DTile2DRangeJ,
			kParallelize6DTile2DRangeK,
			kParallelize6DTile2DRangeL,
			kParallelize6DTile2DRangeM,
			kParallelize6DTile2DRangeN,
			kParallelize6DTile2DTileM,
			kParallelize6DTile2DTileN,
			0 /* flags */);
	}

	/*
	 * Walk the counters in storage order (n fastest, i slowest) and recover
	 * the coordinates from the flat index for the failure message.
	 */
	for (size_t linear_idx = 0; linear_idx < element_count; linear_idx++) {
		size_t remainder = linear_idx;
		const size_t n = remainder % kParallelize6DTile2DRangeN;
		remainder /= kParallelize6DTile2DRangeN;
		const size_t m = remainder % kParallelize6DTile2DRangeM;
		remainder /= kParallelize6DTile2DRangeM;
		const size_t l = remainder % kParallelize6DTile2DRangeL;
		remainder /= kParallelize6DTile2DRangeL;
		const size_t k = remainder % kParallelize6DTile2DRangeK;
		remainder /= kParallelize6DTile2DRangeK;
		const size_t j = remainder % kParallelize6DTile2DRangeJ;
		const size_t i = remainder / kParallelize6DTile2DRangeJ;

		EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D)
			<< "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed "
			<< counters[linear_idx].load(std::memory_order_relaxed) << " times "
			<< "(expected: " << kIncrementIterations6D << ")";
	}
}
7341*b095b053SXin Li
/*
 * Adds one to a single shared counter for every element of the tile
 * [start_m, start_m + tile_m) x [start_n, start_n + tile_n), so each element
 * costs a separate atomic read-modify-write on the same object.
 * The outer coordinates (i, j, k, l) are not used.
 */
static void IncrementSame6DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
	const size_t end_m = start_m + tile_m;
	const size_t end_n = start_n + tile_n;

	size_t m = start_m;
	while (m < end_m) {
		size_t n = start_n;
		while (n < end_n) {
			num_processed_items->fetch_add(1, std::memory_order_relaxed);
			++n;
		}
		++m;
	}
}
7349*b095b053SXin Li
/*
 * Runs IncrementSame6DTile2D over the whole 6D range on a pool created with
 * pthreadpool_create(0), with every invocation updating the same counter, and
 * expects the final count to equal the number of elements in the range.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolHighContention) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(IncrementSame6DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);

	/*
	 * The counter is an int while the expected count is a size_t. Convert
	 * explicitly so that both operands of the comparison have the same type;
	 * this is the conversion the mixed comparison performed implicitly.
	 */
	const size_t expected_items = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
7369*b095b053SXin Li
WorkImbalance6DTile2D(std::atomic_int * num_processed_items,size_t i,size_t j,size_t k,size_t l,size_t start_m,size_t start_n,size_t tile_m,size_t tile_n)7370*b095b053SXin Li static void WorkImbalance6DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) {
7371*b095b053SXin Li num_processed_items->fetch_add(tile_m * tile_n, std::memory_order_relaxed);
7372*b095b053SXin Li if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0 && start_n == 0) {
7373*b095b053SXin Li /* Spin-wait until all items are computed */
7374*b095b053SXin Li while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN) {
7375*b095b053SXin Li std::atomic_thread_fence(std::memory_order_acquire);
7376*b095b053SXin Li }
7377*b095b053SXin Li }
7378*b095b053SXin Li }
7379*b095b053SXin Li
/*
 * Runs WorkImbalance6DTile2D over the whole 6D range on a pool created with
 * pthreadpool_create(0) and expects the final count to equal the number of
 * elements in the range. The invocation for the origin tile does not return
 * until all elements have been counted, so the call can only complete if the
 * remaining tiles are executed by other threads in the meantime.
 * Skipped when the pool reports one thread or fewer.
 */
TEST(Parallelize6DTile2D, MultiThreadPoolWorkStealing) {
	/* Direct initialization; the ATOMIC_VAR_INIT macro is deprecated since C++20. */
	std::atomic_int num_processed_items(0);

	auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy);
	ASSERT_TRUE(threadpool.get());

	if (pthreadpool_get_threads_count(threadpool.get()) <= 1) {
		GTEST_SKIP();
	}

	pthreadpool_parallelize_6d_tile_2d(
		threadpool.get(),
		/* The task takes a typed pointer as its first argument, hence the cast to the task type. */
		reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(WorkImbalance6DTile2D),
		static_cast<void*>(&num_processed_items),
		kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN,
		kParallelize6DTile2DTileM, kParallelize6DTile2DTileN,
		0 /* flags */);

	/*
	 * The counter is an int while the expected count is a size_t. Convert
	 * explicitly so that both operands of the comparison have the same type;
	 * this is the conversion the mixed comparison performed implicitly.
	 */
	const size_t expected_items = kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN;
	EXPECT_EQ(static_cast<size_t>(num_processed_items.load(std::memory_order_relaxed)), expected_items);
}
7399