/* xref: /aosp_15_r20/external/pthreadpool/src/threadpool-object.h (revision b095b0533730c2930f947df924a4486d266faa1a) */
1*b095b053SXin Li #pragma once
2*b095b053SXin Li 
3*b095b053SXin Li /* Standard C headers */
4*b095b053SXin Li #include <stddef.h>
5*b095b053SXin Li #include <stdint.h>
6*b095b053SXin Li 
7*b095b053SXin Li /* Internal headers */
8*b095b053SXin Li #include "threadpool-common.h"
9*b095b053SXin Li #include "threadpool-atomics.h"
10*b095b053SXin Li 
11*b095b053SXin Li /* POSIX headers */
12*b095b053SXin Li #if PTHREADPOOL_USE_CONDVAR || PTHREADPOOL_USE_FUTEX
13*b095b053SXin Li #include <pthread.h>
14*b095b053SXin Li #endif
15*b095b053SXin Li 
16*b095b053SXin Li /* Mach headers */
17*b095b053SXin Li #if PTHREADPOOL_USE_GCD
18*b095b053SXin Li #include <dispatch/dispatch.h>
19*b095b053SXin Li #endif
20*b095b053SXin Li 
21*b095b053SXin Li /* Windows headers */
22*b095b053SXin Li #if PTHREADPOOL_USE_EVENT
23*b095b053SXin Li #include <windows.h>
24*b095b053SXin Li #endif
25*b095b053SXin Li 
26*b095b053SXin Li /* Dependencies */
27*b095b053SXin Li #include <fxdiv.h>
28*b095b053SXin Li 
29*b095b053SXin Li /* Library header */
30*b095b053SXin Li #include <pthreadpool.h>
31*b095b053SXin Li 
32*b095b053SXin Li 
/*
 * Bit mask that selects the low 31 bits of a 32-bit word (clears the top bit).
 * NOTE(review): presumably applied to the command word of struct pthreadpool to
 * extract the threadpool_command value; the use site is not in this header - confirm.
 */
#define THREADPOOL_COMMAND_MASK UINT32_C(0x7FFFFFFF)
34*b095b053SXin Li 
/**
 * Commands that can be submitted to the thread pool.
 * The numeric values are spelled out explicitly; they are the same values the
 * enumerators had when they were implicitly numbered.
 */
enum threadpool_command {
	/** Initial command value. */
	threadpool_command_init = 0,
	/** Run a parallelization task. */
	threadpool_command_parallelize = 1,
	/** Shut the thread pool down. */
	threadpool_command_shutdown = 2,
};
40*b095b053SXin Li 
41*b095b053SXin Li struct PTHREADPOOL_CACHELINE_ALIGNED thread_info {
42*b095b053SXin Li 	/**
43*b095b053SXin Li 	 * Index of the first element in the work range.
44*b095b053SXin Li 	 * Before processing a new element the owning worker thread increments this value.
45*b095b053SXin Li 	 */
46*b095b053SXin Li 	pthreadpool_atomic_size_t range_start;
47*b095b053SXin Li 	/**
48*b095b053SXin Li 	 * Index of the element after the last element of the work range.
49*b095b053SXin Li 	 * Before processing a new element the stealing worker thread decrements this value.
50*b095b053SXin Li 	 */
51*b095b053SXin Li 	pthreadpool_atomic_size_t range_end;
52*b095b053SXin Li 	/**
53*b095b053SXin Li 	 * The number of elements in the work range.
54*b095b053SXin Li 	 * Due to race conditions range_length <= range_end - range_start.
55*b095b053SXin Li 	 * The owning worker thread must decrement this value before incrementing @a range_start.
56*b095b053SXin Li 	 * The stealing worker thread must decrement this value before decrementing @a range_end.
57*b095b053SXin Li 	 */
58*b095b053SXin Li 	pthreadpool_atomic_size_t range_length;
59*b095b053SXin Li 	/**
60*b095b053SXin Li 	 * Thread number in the 0..threads_count-1 range.
61*b095b053SXin Li 	 */
62*b095b053SXin Li 	size_t thread_number;
63*b095b053SXin Li 	/**
64*b095b053SXin Li 	 * Thread pool which owns the thread.
65*b095b053SXin Li 	 */
66*b095b053SXin Li 	struct pthreadpool* threadpool;
67*b095b053SXin Li #if PTHREADPOOL_USE_CONDVAR || PTHREADPOOL_USE_FUTEX
68*b095b053SXin Li 	/**
69*b095b053SXin Li 	 * The pthread object corresponding to the thread.
70*b095b053SXin Li 	 */
71*b095b053SXin Li 	pthread_t thread_object;
72*b095b053SXin Li #endif
73*b095b053SXin Li #if PTHREADPOOL_USE_EVENT
74*b095b053SXin Li 	/**
75*b095b053SXin Li 	 * The Windows thread handle corresponding to the thread.
76*b095b053SXin Li 	 */
77*b095b053SXin Li 	HANDLE thread_handle;
78*b095b053SXin Li #endif
79*b095b053SXin Li };
80*b095b053SXin Li 
81*b095b053SXin Li PTHREADPOOL_STATIC_ASSERT(sizeof(struct thread_info) % PTHREADPOOL_CACHELINE_SIZE == 0,
82*b095b053SXin Li 	"thread_info structure must occupy an integer number of cache lines (64 bytes)");
83*b095b053SXin Li 
/**
 * Extra parameters for pthreadpool_parallelize_1d_with_uarch
 * (the parallelize_1d_with_uarch member of the params union in struct pthreadpool).
 */
struct pthreadpool_1d_with_uarch_params {
	/**
	 * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_1d_with_uarch function.
	 */
	uint32_t default_uarch_index;
	/**
	 * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_1d_with_uarch function.
	 */
	uint32_t max_uarch_index;
};
94*b095b053SXin Li 
/**
 * Extra parameters for pthreadpool_parallelize_1d_tile_1d
 * (the parallelize_1d_tile_1d member of the params union in struct pthreadpool).
 */
struct pthreadpool_1d_tile_1d_params {
	/**
	 * Copy of the range argument passed to the pthreadpool_parallelize_1d_tile_1d function.
	 */
	size_t range;
	/**
	 * Copy of the tile argument passed to the pthreadpool_parallelize_1d_tile_1d function.
	 */
	size_t tile;
};
105*b095b053SXin Li 
106*b095b053SXin Li struct pthreadpool_2d_params {
107*b095b053SXin Li 	/**
108*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_2d function.
109*b095b053SXin Li 	 */
110*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
111*b095b053SXin Li };
112*b095b053SXin Li 
113*b095b053SXin Li struct pthreadpool_2d_tile_1d_params {
114*b095b053SXin Li 	/**
115*b095b053SXin Li 	 * Copy of the range_j argument passed to the pthreadpool_parallelize_2d_tile_1d function.
116*b095b053SXin Li 	 */
117*b095b053SXin Li 	size_t range_j;
118*b095b053SXin Li 	/**
119*b095b053SXin Li 	 * Copy of the tile_j argument passed to the pthreadpool_parallelize_2d_tile_1d function.
120*b095b053SXin Li 	 */
121*b095b053SXin Li 	size_t tile_j;
122*b095b053SXin Li 	/**
123*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_j, tile_j) value.
124*b095b053SXin Li 	 */
125*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_j;
126*b095b053SXin Li };
127*b095b053SXin Li 
128*b095b053SXin Li struct pthreadpool_2d_tile_2d_params {
129*b095b053SXin Li 	/**
130*b095b053SXin Li 	 * Copy of the range_i argument passed to the pthreadpool_parallelize_2d_tile_2d function.
131*b095b053SXin Li 	 */
132*b095b053SXin Li 	size_t range_i;
133*b095b053SXin Li 	/**
134*b095b053SXin Li 	 * Copy of the tile_i argument passed to the pthreadpool_parallelize_2d_tile_2d function.
135*b095b053SXin Li 	 */
136*b095b053SXin Li 	size_t tile_i;
137*b095b053SXin Li 	/**
138*b095b053SXin Li 	 * Copy of the range_j argument passed to the pthreadpool_parallelize_2d_tile_2d function.
139*b095b053SXin Li 	 */
140*b095b053SXin Li 	size_t range_j;
141*b095b053SXin Li 	/**
142*b095b053SXin Li 	 * Copy of the tile_j argument passed to the pthreadpool_parallelize_2d_tile_2d function.
143*b095b053SXin Li 	 */
144*b095b053SXin Li 	size_t tile_j;
145*b095b053SXin Li 	/**
146*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_j, tile_j) value.
147*b095b053SXin Li 	 */
148*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_j;
149*b095b053SXin Li };
150*b095b053SXin Li 
151*b095b053SXin Li struct pthreadpool_2d_tile_2d_with_uarch_params {
152*b095b053SXin Li 	/**
153*b095b053SXin Li 	 * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function.
154*b095b053SXin Li 	 */
155*b095b053SXin Li 	uint32_t default_uarch_index;
156*b095b053SXin Li 	/**
157*b095b053SXin Li 	 * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function.
158*b095b053SXin Li 	 */
159*b095b053SXin Li 	uint32_t max_uarch_index;
160*b095b053SXin Li 	/**
161*b095b053SXin Li 	 * Copy of the range_i argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function.
162*b095b053SXin Li 	 */
163*b095b053SXin Li 	size_t range_i;
164*b095b053SXin Li 	/**
165*b095b053SXin Li 	 * Copy of the tile_i argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function.
166*b095b053SXin Li 	 */
167*b095b053SXin Li 	size_t tile_i;
168*b095b053SXin Li 	/**
169*b095b053SXin Li 	 * Copy of the range_j argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function.
170*b095b053SXin Li 	 */
171*b095b053SXin Li 	size_t range_j;
172*b095b053SXin Li 	/**
173*b095b053SXin Li 	 * Copy of the tile_j argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function.
174*b095b053SXin Li 	 */
175*b095b053SXin Li 	size_t tile_j;
176*b095b053SXin Li 	/**
177*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_j, tile_j) value.
178*b095b053SXin Li 	 */
179*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_j;
180*b095b053SXin Li };
181*b095b053SXin Li 
182*b095b053SXin Li struct pthreadpool_3d_params {
183*b095b053SXin Li 	/**
184*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_3d function.
185*b095b053SXin Li 	 */
186*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
187*b095b053SXin Li 	/**
188*b095b053SXin Li 	 * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_3d function.
189*b095b053SXin Li 	 */
190*b095b053SXin Li 	struct fxdiv_divisor_size_t range_k;
191*b095b053SXin Li };
192*b095b053SXin Li 
193*b095b053SXin Li struct pthreadpool_3d_tile_1d_params {
194*b095b053SXin Li 	/**
195*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_1d function.
196*b095b053SXin Li 	 */
197*b095b053SXin Li 	size_t range_k;
198*b095b053SXin Li 	/**
199*b095b053SXin Li 	 * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_1d function.
200*b095b053SXin Li 	 */
201*b095b053SXin Li 	size_t tile_k;
202*b095b053SXin Li 	/**
203*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_3d_tile_1d function.
204*b095b053SXin Li 	 */
205*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
206*b095b053SXin Li 	/**
207*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_k, tile_k) value.
208*b095b053SXin Li 	 */
209*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_k;
210*b095b053SXin Li };
211*b095b053SXin Li 
212*b095b053SXin Li struct pthreadpool_3d_tile_2d_params {
213*b095b053SXin Li 	/**
214*b095b053SXin Li 	 * Copy of the range_j argument passed to the pthreadpool_parallelize_3d_tile_2d function.
215*b095b053SXin Li 	 */
216*b095b053SXin Li 	size_t range_j;
217*b095b053SXin Li 	/**
218*b095b053SXin Li 	 * Copy of the tile_j argument passed to the pthreadpool_parallelize_3d_tile_2d function.
219*b095b053SXin Li 	 */
220*b095b053SXin Li 	size_t tile_j;
221*b095b053SXin Li 	/**
222*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_2d function.
223*b095b053SXin Li 	 */
224*b095b053SXin Li 	size_t range_k;
225*b095b053SXin Li 	/**
226*b095b053SXin Li 	 * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_2d function.
227*b095b053SXin Li 	 */
228*b095b053SXin Li 	size_t tile_k;
229*b095b053SXin Li 	/**
230*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_j, tile_j) value.
231*b095b053SXin Li 	 */
232*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_j;
233*b095b053SXin Li 	/**
234*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_k, tile_k) value.
235*b095b053SXin Li 	 */
236*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_k;
237*b095b053SXin Li };
238*b095b053SXin Li 
239*b095b053SXin Li struct pthreadpool_3d_tile_2d_with_uarch_params {
240*b095b053SXin Li 	/**
241*b095b053SXin Li 	 * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function.
242*b095b053SXin Li 	 */
243*b095b053SXin Li 	uint32_t default_uarch_index;
244*b095b053SXin Li 	/**
245*b095b053SXin Li 	 * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function.
246*b095b053SXin Li 	 */
247*b095b053SXin Li 	uint32_t max_uarch_index;
248*b095b053SXin Li 	/**
249*b095b053SXin Li 	 * Copy of the range_j argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function.
250*b095b053SXin Li 	 */
251*b095b053SXin Li 	size_t range_j;
252*b095b053SXin Li 	/**
253*b095b053SXin Li 	 * Copy of the tile_j argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function.
254*b095b053SXin Li 	 */
255*b095b053SXin Li 	size_t tile_j;
256*b095b053SXin Li 	/**
257*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function.
258*b095b053SXin Li 	 */
259*b095b053SXin Li 	size_t range_k;
260*b095b053SXin Li 	/**
261*b095b053SXin Li 	 * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function.
262*b095b053SXin Li 	 */
263*b095b053SXin Li 	size_t tile_k;
264*b095b053SXin Li 	/**
265*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_j, tile_j) value.
266*b095b053SXin Li 	 */
267*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_j;
268*b095b053SXin Li 	/**
269*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_k, tile_k) value.
270*b095b053SXin Li 	 */
271*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_k;
272*b095b053SXin Li };
273*b095b053SXin Li 
274*b095b053SXin Li struct pthreadpool_4d_params {
275*b095b053SXin Li 	/**
276*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_4d function.
277*b095b053SXin Li 	 */
278*b095b053SXin Li 	size_t range_k;
279*b095b053SXin Li 	/**
280*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d function.
281*b095b053SXin Li 	 */
282*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
283*b095b053SXin Li 	/**
284*b095b053SXin Li 	 * FXdiv divisor for the range_k * range_l value.
285*b095b053SXin Li 	 */
286*b095b053SXin Li 	struct fxdiv_divisor_size_t range_kl;
287*b095b053SXin Li 	/**
288*b095b053SXin Li 	 * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_4d function.
289*b095b053SXin Li 	 */
290*b095b053SXin Li 	struct fxdiv_divisor_size_t range_l;
291*b095b053SXin Li };
292*b095b053SXin Li 
293*b095b053SXin Li struct pthreadpool_4d_tile_1d_params {
294*b095b053SXin Li 	/**
295*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_1d function.
296*b095b053SXin Li 	 */
297*b095b053SXin Li 	size_t range_k;
298*b095b053SXin Li 	/**
299*b095b053SXin Li 	 * Copy of the range_l argument passed to the pthreadpool_parallelize_4d_tile_1d function.
300*b095b053SXin Li 	 */
301*b095b053SXin Li 	size_t range_l;
302*b095b053SXin Li 	/**
303*b095b053SXin Li 	 * Copy of the tile_l argument passed to the pthreadpool_parallelize_4d_tile_1d function.
304*b095b053SXin Li 	 */
305*b095b053SXin Li 	size_t tile_l;
306*b095b053SXin Li 	/**
307*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d_tile_1d function.
308*b095b053SXin Li 	 */
309*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
310*b095b053SXin Li 	/**
311*b095b053SXin Li 	 * FXdiv divisor for the range_k * divide_round_up(range_l, tile_l) value.
312*b095b053SXin Li 	 */
313*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_kl;
314*b095b053SXin Li 	/**
315*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_l, tile_l) value.
316*b095b053SXin Li 	 */
317*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_l;
318*b095b053SXin Li };
319*b095b053SXin Li 
320*b095b053SXin Li struct pthreadpool_4d_tile_2d_params {
321*b095b053SXin Li 	/**
322*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_2d function.
323*b095b053SXin Li 	 */
324*b095b053SXin Li 	size_t range_k;
325*b095b053SXin Li 	/**
326*b095b053SXin Li 	 * Copy of the tile_k argument passed to the pthreadpool_parallelize_4d_tile_2d function.
327*b095b053SXin Li 	 */
328*b095b053SXin Li 	size_t tile_k;
329*b095b053SXin Li 	/**
330*b095b053SXin Li 	 * Copy of the range_l argument passed to the pthreadpool_parallelize_4d_tile_2d function.
331*b095b053SXin Li 	 */
332*b095b053SXin Li 	size_t range_l;
333*b095b053SXin Li 	/**
334*b095b053SXin Li 	 * Copy of the tile_l argument passed to the pthreadpool_parallelize_4d_tile_2d function.
335*b095b053SXin Li 	 */
336*b095b053SXin Li 	size_t tile_l;
337*b095b053SXin Li 	/**
338*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d_tile_2d function.
339*b095b053SXin Li 	 */
340*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
341*b095b053SXin Li 	/**
342*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_k, tile_k) * divide_round_up(range_l, tile_l) value.
343*b095b053SXin Li 	 */
344*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_kl;
345*b095b053SXin Li 	/**
346*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_l, tile_l) value.
347*b095b053SXin Li 	 */
348*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_l;
349*b095b053SXin Li };
350*b095b053SXin Li 
351*b095b053SXin Li struct pthreadpool_4d_tile_2d_with_uarch_params {
352*b095b053SXin Li 	/**
353*b095b053SXin Li 	 * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function.
354*b095b053SXin Li 	 */
355*b095b053SXin Li 	uint32_t default_uarch_index;
356*b095b053SXin Li 	/**
357*b095b053SXin Li 	 * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function.
358*b095b053SXin Li 	 */
359*b095b053SXin Li 	uint32_t max_uarch_index;
360*b095b053SXin Li 	/**
361*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function.
362*b095b053SXin Li 	 */
363*b095b053SXin Li 	size_t range_k;
364*b095b053SXin Li 	/**
365*b095b053SXin Li 	 * Copy of the tile_k argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function.
366*b095b053SXin Li 	 */
367*b095b053SXin Li 	size_t tile_k;
368*b095b053SXin Li 	/**
369*b095b053SXin Li 	 * Copy of the range_l argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function.
370*b095b053SXin Li 	 */
371*b095b053SXin Li 	size_t range_l;
372*b095b053SXin Li 	/**
373*b095b053SXin Li 	 * Copy of the tile_l argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function.
374*b095b053SXin Li 	 */
375*b095b053SXin Li 	size_t tile_l;
376*b095b053SXin Li 	/**
377*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function.
378*b095b053SXin Li 	 */
379*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
380*b095b053SXin Li 	/**
381*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_k, tile_k) * divide_round_up(range_l, tile_l) value.
382*b095b053SXin Li 	 */
383*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_kl;
384*b095b053SXin Li 	/**
385*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_l, tile_l) value.
386*b095b053SXin Li 	 */
387*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_l;
388*b095b053SXin Li };
389*b095b053SXin Li 
390*b095b053SXin Li struct pthreadpool_5d_params {
391*b095b053SXin Li 	/**
392*b095b053SXin Li 	 * Copy of the range_l argument passed to the pthreadpool_parallelize_5d function.
393*b095b053SXin Li 	 */
394*b095b053SXin Li 	size_t range_l;
395*b095b053SXin Li 	/**
396*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d function.
397*b095b053SXin Li 	 */
398*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
399*b095b053SXin Li 	/**
400*b095b053SXin Li 	 * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_5d function.
401*b095b053SXin Li 	 */
402*b095b053SXin Li 	struct fxdiv_divisor_size_t range_k;
403*b095b053SXin Li 	/**
404*b095b053SXin Li 	 * FXdiv divisor for the range_l * range_m value.
405*b095b053SXin Li 	 */
406*b095b053SXin Li 	struct fxdiv_divisor_size_t range_lm;
407*b095b053SXin Li 	/**
408*b095b053SXin Li 	 * FXdiv divisor for the range_m argument passed to the pthreadpool_parallelize_5d function.
409*b095b053SXin Li 	 */
410*b095b053SXin Li 	struct fxdiv_divisor_size_t range_m;
411*b095b053SXin Li };
412*b095b053SXin Li 
413*b095b053SXin Li struct pthreadpool_5d_tile_1d_params {
414*b095b053SXin Li 	/**
415*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_5d_tile_1d function.
416*b095b053SXin Li 	 */
417*b095b053SXin Li 	size_t range_k;
418*b095b053SXin Li 	/**
419*b095b053SXin Li 	 * Copy of the range_m argument passed to the pthreadpool_parallelize_5d_tile_1d function.
420*b095b053SXin Li 	 */
421*b095b053SXin Li 	size_t range_m;
422*b095b053SXin Li 	/**
423*b095b053SXin Li 	 * Copy of the tile_m argument passed to the pthreadpool_parallelize_5d_tile_1d function.
424*b095b053SXin Li 	 */
425*b095b053SXin Li 	size_t tile_m;
426*b095b053SXin Li 	/**
427*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d_tile_1d function.
428*b095b053SXin Li 	 */
429*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
430*b095b053SXin Li 	/**
431*b095b053SXin Li 	 * FXdiv divisor for the range_k * range_l value.
432*b095b053SXin Li 	 */
433*b095b053SXin Li 	struct fxdiv_divisor_size_t range_kl;
434*b095b053SXin Li 	/**
435*b095b053SXin Li 	 * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_5d_tile_1d function.
436*b095b053SXin Li 	 */
437*b095b053SXin Li 	struct fxdiv_divisor_size_t range_l;
438*b095b053SXin Li 	/**
439*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_m, tile_m) value.
440*b095b053SXin Li 	 */
441*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_m;
442*b095b053SXin Li };
443*b095b053SXin Li 
444*b095b053SXin Li struct pthreadpool_5d_tile_2d_params {
445*b095b053SXin Li 	/**
446*b095b053SXin Li 	 * Copy of the range_l argument passed to the pthreadpool_parallelize_5d_tile_2d function.
447*b095b053SXin Li 	 */
448*b095b053SXin Li 	size_t range_l;
449*b095b053SXin Li 	/**
450*b095b053SXin Li 	 * Copy of the tile_l argument passed to the pthreadpool_parallelize_5d_tile_2d function.
451*b095b053SXin Li 	 */
452*b095b053SXin Li 	size_t tile_l;
453*b095b053SXin Li 	/**
454*b095b053SXin Li 	 * Copy of the range_m argument passed to the pthreadpool_parallelize_5d_tile_2d function.
455*b095b053SXin Li 	 */
456*b095b053SXin Li 	size_t range_m;
457*b095b053SXin Li 	/**
458*b095b053SXin Li 	 * Copy of the tile_m argument passed to the pthreadpool_parallelize_5d_tile_2d function.
459*b095b053SXin Li 	 */
460*b095b053SXin Li 	size_t tile_m;
461*b095b053SXin Li 	/**
462*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d_tile_2d function.
463*b095b053SXin Li 	 */
464*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
465*b095b053SXin Li 	/**
466*b095b053SXin Li 	 * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_5d_tile_2d function.
467*b095b053SXin Li 	 */
468*b095b053SXin Li 	struct fxdiv_divisor_size_t range_k;
469*b095b053SXin Li 	/**
470*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_l, tile_l) * divide_round_up(range_m, tile_m) value.
471*b095b053SXin Li 	 */
472*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_lm;
473*b095b053SXin Li 	/**
474*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_m, tile_m) value.
475*b095b053SXin Li 	 */
476*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_m;
477*b095b053SXin Li };
478*b095b053SXin Li 
479*b095b053SXin Li struct pthreadpool_6d_params {
480*b095b053SXin Li 	/**
481*b095b053SXin Li 	 * Copy of the range_l argument passed to the pthreadpool_parallelize_6d function.
482*b095b053SXin Li 	 */
483*b095b053SXin Li 	size_t range_l;
484*b095b053SXin Li 	/**
485*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_6d function.
486*b095b053SXin Li 	 */
487*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
488*b095b053SXin Li 	/**
489*b095b053SXin Li 	 * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_6d function.
490*b095b053SXin Li 	 */
491*b095b053SXin Li 	struct fxdiv_divisor_size_t range_k;
492*b095b053SXin Li 	/**
493*b095b053SXin Li 	 * FXdiv divisor for the range_l * range_m * range_n value.
494*b095b053SXin Li 	 */
495*b095b053SXin Li 	struct fxdiv_divisor_size_t range_lmn;
496*b095b053SXin Li 	/**
497*b095b053SXin Li 	 * FXdiv divisor for the range_m argument passed to the pthreadpool_parallelize_6d function.
498*b095b053SXin Li 	 */
499*b095b053SXin Li 	struct fxdiv_divisor_size_t range_m;
500*b095b053SXin Li 	/**
501*b095b053SXin Li 	 * FXdiv divisor for the range_n argument passed to the pthreadpool_parallelize_6d function.
502*b095b053SXin Li 	 */
503*b095b053SXin Li 	struct fxdiv_divisor_size_t range_n;
504*b095b053SXin Li };
505*b095b053SXin Li 
506*b095b053SXin Li struct pthreadpool_6d_tile_1d_params {
507*b095b053SXin Li 	/**
508*b095b053SXin Li 	 * Copy of the range_l argument passed to the pthreadpool_parallelize_6d_tile_1d function.
509*b095b053SXin Li 	 */
510*b095b053SXin Li 	size_t range_l;
511*b095b053SXin Li 	/**
512*b095b053SXin Li 	 * Copy of the range_n argument passed to the pthreadpool_parallelize_6d_tile_1d function.
513*b095b053SXin Li 	 */
514*b095b053SXin Li 	size_t range_n;
515*b095b053SXin Li 	/**
516*b095b053SXin Li 	 * Copy of the tile_n argument passed to the pthreadpool_parallelize_6d_tile_1d function.
517*b095b053SXin Li 	 */
518*b095b053SXin Li 	size_t tile_n;
519*b095b053SXin Li 	/**
520*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_6d_tile_1d function.
521*b095b053SXin Li 	 */
522*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
523*b095b053SXin Li 	/**
524*b095b053SXin Li 	 * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_6d_tile_1d function.
525*b095b053SXin Li 	 */
526*b095b053SXin Li 	struct fxdiv_divisor_size_t range_k;
527*b095b053SXin Li 	/**
528*b095b053SXin Li 	 * FXdiv divisor for the range_l * range_m * divide_round_up(range_n, tile_n) value.
529*b095b053SXin Li 	 */
530*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_lmn;
531*b095b053SXin Li 	/**
532*b095b053SXin Li 	 * FXdiv divisor for the range_m argument passed to the pthreadpool_parallelize_6d_tile_1d function.
533*b095b053SXin Li 	 */
534*b095b053SXin Li 	struct fxdiv_divisor_size_t range_m;
535*b095b053SXin Li 	/**
536*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_n, tile_n) value.
537*b095b053SXin Li 	 */
538*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_n;
539*b095b053SXin Li };
540*b095b053SXin Li 
541*b095b053SXin Li struct pthreadpool_6d_tile_2d_params {
542*b095b053SXin Li 	/**
543*b095b053SXin Li 	 * Copy of the range_k argument passed to the pthreadpool_parallelize_6d_tile_2d function.
544*b095b053SXin Li 	 */
545*b095b053SXin Li 	size_t range_k;
546*b095b053SXin Li 	/**
547*b095b053SXin Li 	 * Copy of the range_m argument passed to the pthreadpool_parallelize_6d_tile_2d function.
548*b095b053SXin Li 	 */
549*b095b053SXin Li 	size_t range_m;
550*b095b053SXin Li 	/**
551*b095b053SXin Li 	 * Copy of the tile_m argument passed to the pthreadpool_parallelize_6d_tile_2d function.
552*b095b053SXin Li 	 */
553*b095b053SXin Li 	size_t tile_m;
554*b095b053SXin Li 	/**
555*b095b053SXin Li 	 * Copy of the range_n argument passed to the pthreadpool_parallelize_6d_tile_2d function.
556*b095b053SXin Li 	 */
557*b095b053SXin Li 	size_t range_n;
558*b095b053SXin Li 	/**
559*b095b053SXin Li 	 * Copy of the tile_n argument passed to the pthreadpool_parallelize_6d_tile_2d function.
560*b095b053SXin Li 	 */
561*b095b053SXin Li 	size_t tile_n;
562*b095b053SXin Li 	/**
563*b095b053SXin Li 	 * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_6d_tile_2d function.
564*b095b053SXin Li 	 */
565*b095b053SXin Li 	struct fxdiv_divisor_size_t range_j;
566*b095b053SXin Li 	/**
567*b095b053SXin Li 	 * FXdiv divisor for the range_k * range_l value.
568*b095b053SXin Li 	 */
569*b095b053SXin Li 	struct fxdiv_divisor_size_t range_kl;
570*b095b053SXin Li 	/**
571*b095b053SXin Li 	 * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_6d_tile_2d function.
572*b095b053SXin Li 	 */
573*b095b053SXin Li 	struct fxdiv_divisor_size_t range_l;
574*b095b053SXin Li 	/**
575*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_m, tile_m) * divide_round_up(range_n, tile_n) value.
576*b095b053SXin Li 	 */
577*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_mn;
578*b095b053SXin Li 	/**
579*b095b053SXin Li 	 * FXdiv divisor for the divide_round_up(range_n, tile_n) value.
580*b095b053SXin Li 	 */
581*b095b053SXin Li 	struct fxdiv_divisor_size_t tile_range_n;
582*b095b053SXin Li };
583*b095b053SXin Li 
584*b095b053SXin Li struct PTHREADPOOL_CACHELINE_ALIGNED pthreadpool {
585*b095b053SXin Li #if !PTHREADPOOL_USE_GCD
586*b095b053SXin Li 	/**
587*b095b053SXin Li 	 * The number of threads that are processing an operation.
588*b095b053SXin Li 	 */
589*b095b053SXin Li 	pthreadpool_atomic_size_t active_threads;
590*b095b053SXin Li #endif
591*b095b053SXin Li #if PTHREADPOOL_USE_FUTEX
592*b095b053SXin Li 	/**
593*b095b053SXin Li 	 * Indicates if there are active threads.
594*b095b053SXin Li 	 * Only two values are possible:
595*b095b053SXin Li 	 * - has_active_threads == 0 if active_threads == 0
596*b095b053SXin Li 	 * - has_active_threads == 1 if active_threads != 0
597*b095b053SXin Li 	 */
598*b095b053SXin Li 	pthreadpool_atomic_uint32_t has_active_threads;
599*b095b053SXin Li #endif
600*b095b053SXin Li #if !PTHREADPOOL_USE_GCD
601*b095b053SXin Li 	/**
602*b095b053SXin Li 	 * The last command submitted to the thread pool.
603*b095b053SXin Li 	 */
604*b095b053SXin Li 	pthreadpool_atomic_uint32_t command;
605*b095b053SXin Li #endif
606*b095b053SXin Li 	/**
607*b095b053SXin Li 	 * The entry point function to call for each thread in the thread pool for parallelization tasks.
608*b095b053SXin Li 	 */
609*b095b053SXin Li 	pthreadpool_atomic_void_p thread_function;
610*b095b053SXin Li 	/**
611*b095b053SXin Li 	 * The function to call for each item.
612*b095b053SXin Li 	 */
613*b095b053SXin Li 	pthreadpool_atomic_void_p task;
614*b095b053SXin Li 	/**
615*b095b053SXin Li 	 * The first argument to the item processing function.
616*b095b053SXin Li 	 */
617*b095b053SXin Li 	pthreadpool_atomic_void_p argument;
618*b095b053SXin Li 	/**
619*b095b053SXin Li 	 * Additional parallelization parameters.
620*b095b053SXin Li 	 * These parameters are specific for each thread_function.
621*b095b053SXin Li 	 */
622*b095b053SXin Li 	union {
623*b095b053SXin Li 		struct pthreadpool_1d_with_uarch_params parallelize_1d_with_uarch;
624*b095b053SXin Li 		struct pthreadpool_1d_tile_1d_params parallelize_1d_tile_1d;
625*b095b053SXin Li 		struct pthreadpool_2d_params parallelize_2d;
626*b095b053SXin Li 		struct pthreadpool_2d_tile_1d_params parallelize_2d_tile_1d;
627*b095b053SXin Li 		struct pthreadpool_2d_tile_2d_params parallelize_2d_tile_2d;
628*b095b053SXin Li 		struct pthreadpool_2d_tile_2d_with_uarch_params parallelize_2d_tile_2d_with_uarch;
629*b095b053SXin Li 		struct pthreadpool_3d_params parallelize_3d;
630*b095b053SXin Li 		struct pthreadpool_3d_tile_1d_params parallelize_3d_tile_1d;
631*b095b053SXin Li 		struct pthreadpool_3d_tile_2d_params parallelize_3d_tile_2d;
632*b095b053SXin Li 		struct pthreadpool_3d_tile_2d_with_uarch_params parallelize_3d_tile_2d_with_uarch;
633*b095b053SXin Li 		struct pthreadpool_4d_params parallelize_4d;
634*b095b053SXin Li 		struct pthreadpool_4d_tile_1d_params parallelize_4d_tile_1d;
635*b095b053SXin Li 		struct pthreadpool_4d_tile_2d_params parallelize_4d_tile_2d;
636*b095b053SXin Li 		struct pthreadpool_4d_tile_2d_with_uarch_params parallelize_4d_tile_2d_with_uarch;
637*b095b053SXin Li 		struct pthreadpool_5d_params parallelize_5d;
638*b095b053SXin Li 		struct pthreadpool_5d_tile_1d_params parallelize_5d_tile_1d;
639*b095b053SXin Li 		struct pthreadpool_5d_tile_2d_params parallelize_5d_tile_2d;
640*b095b053SXin Li 		struct pthreadpool_6d_params parallelize_6d;
641*b095b053SXin Li 		struct pthreadpool_6d_tile_1d_params parallelize_6d_tile_1d;
642*b095b053SXin Li 		struct pthreadpool_6d_tile_2d_params parallelize_6d_tile_2d;
643*b095b053SXin Li 	} params;
644*b095b053SXin Li 	/**
645*b095b053SXin Li 	 * Copy of the flags passed to a parallelization function.
646*b095b053SXin Li 	 */
647*b095b053SXin Li 	pthreadpool_atomic_uint32_t flags;
648*b095b053SXin Li #if PTHREADPOOL_USE_CONDVAR || PTHREADPOOL_USE_FUTEX
649*b095b053SXin Li 	/**
650*b095b053SXin Li 	 * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads.
651*b095b053SXin Li 	 */
652*b095b053SXin Li 	pthread_mutex_t execution_mutex;
653*b095b053SXin Li #endif
654*b095b053SXin Li #if PTHREADPOOL_USE_GCD
655*b095b053SXin Li 	/**
656*b095b053SXin Li 	 * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads.
657*b095b053SXin Li 	 */
658*b095b053SXin Li 	dispatch_semaphore_t execution_semaphore;
659*b095b053SXin Li #endif
660*b095b053SXin Li #if PTHREADPOOL_USE_EVENT
661*b095b053SXin Li 	/**
662*b095b053SXin Li 	 * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads.
663*b095b053SXin Li 	 */
664*b095b053SXin Li 	HANDLE execution_mutex;
665*b095b053SXin Li #endif
666*b095b053SXin Li #if PTHREADPOOL_USE_CONDVAR
667*b095b053SXin Li 	/**
668*b095b053SXin Li 	 * Guards access to the @a active_threads variable.
669*b095b053SXin Li 	 */
670*b095b053SXin Li 	pthread_mutex_t completion_mutex;
671*b095b053SXin Li 	/**
672*b095b053SXin Li 	 * Condition variable to wait until all threads complete an operation (until @a active_threads is zero).
673*b095b053SXin Li 	 */
674*b095b053SXin Li 	pthread_cond_t completion_condvar;
675*b095b053SXin Li 	/**
676*b095b053SXin Li 	 * Guards access to the @a command variable.
677*b095b053SXin Li 	 */
678*b095b053SXin Li 	pthread_mutex_t command_mutex;
679*b095b053SXin Li 	/**
680*b095b053SXin Li 	 * Condition variable to wait for change of the @a command variable.
681*b095b053SXin Li 	 */
682*b095b053SXin Li 	pthread_cond_t command_condvar;
683*b095b053SXin Li #endif
684*b095b053SXin Li #if PTHREADPOOL_USE_EVENT
685*b095b053SXin Li 	/**
686*b095b053SXin Li 	 * Events to wait on until all threads complete an operation (until @a active_threads is zero).
687*b095b053SXin Li 	 * To avoid race conditions due to spin-lock synchronization, we use two events and switch event in use after every
688*b095b053SXin Li 	 * submitted command according to the high bit of the command word.
689*b095b053SXin Li 	 */
690*b095b053SXin Li 	HANDLE completion_event[2];
691*b095b053SXin Li 	/**
692*b095b053SXin Li 	 * Events to wait on for change of the @a command variable.
693*b095b053SXin Li 	 * To avoid race conditions due to spin-lock synchronization, we use two events and switch event in use after every
694*b095b053SXin Li 	 * submitted command according to the high bit of the command word.
695*b095b053SXin Li 	 */
696*b095b053SXin Li 	HANDLE command_event[2];
697*b095b053SXin Li #endif
698*b095b053SXin Li 	/**
699*b095b053SXin Li 	 * FXdiv divisor for the number of threads in the thread pool.
700*b095b053SXin Li 	 * This struct never changes after pthreadpool_create.
701*b095b053SXin Li 	 */
702*b095b053SXin Li 	struct fxdiv_divisor_size_t threads_count;
703*b095b053SXin Li 	/**
704*b095b053SXin Li 	 * Thread information structures that immediately follow this structure.
705*b095b053SXin Li 	 */
706*b095b053SXin Li 	struct thread_info threads[];
707*b095b053SXin Li };
708*b095b053SXin Li 
/*
 * Compile-time check that sizeof(struct pthreadpool) is a whole multiple of PTHREADPOOL_CACHELINE_SIZE.
 * NOTE(review): presumably this keeps the trailing threads[] array (cache-line-aligned thread_info elements)
 * starting on a cache-line boundary - confirm. The message text mentions 64 bytes, while the check itself
 * uses the PTHREADPOOL_CACHELINE_SIZE macro, whose value is not visible in this part of the file.
 */
709*b095b053SXin Li PTHREADPOOL_STATIC_ASSERT(sizeof(struct pthreadpool) % PTHREADPOOL_CACHELINE_SIZE == 0,
710*b095b053SXin Li 	"pthreadpool structure must occupy an integer number of cache lines (64 bytes)");
711*b095b053SXin Li 
/**
 * Library-internal allocator for a thread pool object.
 *
 * @param threads_count  Number of threads the pool is allocated for.
 * @return Pointer to a struct pthreadpool.
 *
 * NOTE(review): presumably reserves room for threads_count trailing thread_info entries (the threads[]
 * flexible array member); the failure return value and initial contents are not visible here - confirm
 * against the implementation.
 */
712*b095b053SXin Li PTHREADPOOL_INTERNAL struct pthreadpool* pthreadpool_allocate(
713*b095b053SXin Li 	size_t threads_count);
714*b095b053SXin Li 
/**
 * Library-internal counterpart of pthreadpool_allocate.
 *
 * @param threadpool  Thread pool object to release.
 *
 * NOTE(review): presumably expects a pointer obtained from pthreadpool_allocate; whether NULL is accepted
 * is not visible here - confirm against the implementation.
 */
715*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_deallocate(
716*b095b053SXin Li 	struct pthreadpool* threadpool);
717*b095b053SXin Li 
/**
 * Pointer to a per-thread routine: takes the thread pool and a thread_info structure, returns nothing.
 * This is the type of the thread_function parameter of pthreadpool_parallelize, and the
 * pthreadpool_thread_parallelize_*_fastpath declarations in this header share this signature.
 */
718*b095b053SXin Li typedef void (*thread_function_t)(struct pthreadpool* threadpool, struct thread_info* thread);
719*b095b053SXin Li 
/**
 * Library-internal entry point that runs a thread function on a thread pool.
 *
 * @param threadpool       Thread pool to run on.
 * @param thread_function  Per-thread routine (see thread_function_t).
 * @param params           Additional parallelization parameters, specific to thread_function.
 * @param params_size      Size of the object pointed to by params - presumably in bytes and no larger than
 *                         the params union of struct pthreadpool; TODO confirm.
 * @param task             The function to call for each item (passed as an untyped pointer).
 * @param context          NOTE(review): presumably the first argument handed to task, i.e. what struct
 *                         pthreadpool stores as "argument" - confirm.
 * @param linear_range     NOTE(review): presumably the total number of items to distribute across the
 *                         threads - confirm against the implementation.
 * @param flags            Flags passed to the parallelization function; struct pthreadpool keeps a copy.
 */
720*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_parallelize(
721*b095b053SXin Li 	struct pthreadpool* threadpool,
722*b095b053SXin Li 	thread_function_t thread_function,
723*b095b053SXin Li 	const void* params,
724*b095b053SXin Li 	size_t params_size,
725*b095b053SXin Li 	void* task,
726*b095b053SXin Li 	void* context,
727*b095b053SXin Li 	size_t linear_range,
728*b095b053SXin Li 	uint32_t flags);
729*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 1D parallelization; signature-compatible with thread_function_t.
 * NOTE(review): defined outside this header; when the fast path is chosen is not visible here - confirm.
 */
730*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath(
731*b095b053SXin Li 	struct pthreadpool* threadpool,
732*b095b053SXin Li 	struct thread_info* thread);
733*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 1D parallelization with uarch; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_1d_with_uarch of the thread pool; defined outside this
 * header - confirm.
 */
734*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath(
735*b095b053SXin Li 	struct pthreadpool* threadpool,
736*b095b053SXin Li 	struct thread_info* thread);
737*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 1D parallelization with 1D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_1d_tile_1d of the thread pool; defined outside this
 * header - confirm.
 */
738*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath(
739*b095b053SXin Li 	struct pthreadpool* threadpool,
740*b095b053SXin Li 	struct thread_info* thread);
741*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 2D parallelization; signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_2d of the thread pool; defined outside this header - confirm.
 */
742*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath(
743*b095b053SXin Li 	struct pthreadpool* threadpool,
744*b095b053SXin Li 	struct thread_info* thread);
745*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 2D parallelization with 1D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_2d_tile_1d of the thread pool; defined outside this
 * header - confirm.
 */
746*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath(
747*b095b053SXin Li 	struct pthreadpool* threadpool,
748*b095b053SXin Li 	struct thread_info* thread);
749*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 2D parallelization with 2D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_2d_tile_2d of the thread pool; defined outside this
 * header - confirm.
 */
750*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath(
751*b095b053SXin Li 	struct pthreadpool* threadpool,
752*b095b053SXin Li 	struct thread_info* thread);
753*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 2D parallelization with 2D tiling and uarch;
 * signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_2d_tile_2d_with_uarch of the thread pool; defined outside
 * this header - confirm.
 */
754*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath(
755*b095b053SXin Li 	struct pthreadpool* threadpool,
756*b095b053SXin Li 	struct thread_info* thread);
757*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 3D parallelization; signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_3d of the thread pool; defined outside this header - confirm.
 */
758*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath(
759*b095b053SXin Li 	struct pthreadpool* threadpool,
760*b095b053SXin Li 	struct thread_info* thread);
761*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 3D parallelization with 1D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_3d_tile_1d of the thread pool; defined outside this
 * header - confirm.
 */
762*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath(
763*b095b053SXin Li 	struct pthreadpool* threadpool,
764*b095b053SXin Li 	struct thread_info* thread);
765*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 3D parallelization with 2D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_3d_tile_2d of the thread pool; defined outside this
 * header - confirm.
 */
766*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath(
767*b095b053SXin Li 	struct pthreadpool* threadpool,
768*b095b053SXin Li 	struct thread_info* thread);
769*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 3D parallelization with 2D tiling and uarch;
 * signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_3d_tile_2d_with_uarch of the thread pool; defined outside
 * this header - confirm.
 */
770*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath(
771*b095b053SXin Li 	struct pthreadpool* threadpool,
772*b095b053SXin Li 	struct thread_info* thread);
773*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 4D parallelization; signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_4d of the thread pool; defined outside this header - confirm.
 */
774*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath(
775*b095b053SXin Li 	struct pthreadpool* threadpool,
776*b095b053SXin Li 	struct thread_info* thread);
777*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 4D parallelization with 1D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_4d_tile_1d of the thread pool; defined outside this
 * header - confirm.
 */
778*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath(
779*b095b053SXin Li 	struct pthreadpool* threadpool,
780*b095b053SXin Li 	struct thread_info* thread);
781*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 4D parallelization with 2D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_4d_tile_2d of the thread pool; defined outside this
 * header - confirm.
 */
782*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath(
783*b095b053SXin Li 	struct pthreadpool* threadpool,
784*b095b053SXin Li 	struct thread_info* thread);
785*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 4D parallelization with 2D tiling and uarch;
 * signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_4d_tile_2d_with_uarch of the thread pool; defined outside
 * this header - confirm.
 */
786*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath(
787*b095b053SXin Li 	struct pthreadpool* threadpool,
788*b095b053SXin Li 	struct thread_info* thread);
789*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 5D parallelization; signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_5d of the thread pool; defined outside this header - confirm.
 */
790*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath(
791*b095b053SXin Li 	struct pthreadpool* threadpool,
792*b095b053SXin Li 	struct thread_info* thread);
793*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 5D parallelization with 1D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_5d_tile_1d of the thread pool; defined outside this
 * header - confirm.
 */
794*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath(
795*b095b053SXin Li 	struct pthreadpool* threadpool,
796*b095b053SXin Li 	struct thread_info* thread);
797*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 5D parallelization with 2D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_5d_tile_2d of the thread pool; defined outside this
 * header - confirm.
 */
798*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath(
799*b095b053SXin Li 	struct pthreadpool* threadpool,
800*b095b053SXin Li 	struct thread_info* thread);
801*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 6D parallelization; signature-compatible with thread_function_t.
 * NOTE(review): presumably reads params.parallelize_6d of the thread pool; defined outside this header - confirm.
 */
802*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_fastpath(
803*b095b053SXin Li 	struct pthreadpool* threadpool,
804*b095b053SXin Li 	struct thread_info* thread);
805*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 6D parallelization with 1D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_6d_tile_1d of the thread pool; defined outside this
 * header - confirm.
 */
806*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_1d_fastpath(
807*b095b053SXin Li 	struct pthreadpool* threadpool,
808*b095b053SXin Li 	struct thread_info* thread);
809*b095b053SXin Li 
/**
 * Per-thread routine named as the fast path for 6D parallelization with 2D tiling; signature-compatible with
 * thread_function_t.
 * NOTE(review): presumably reads params.parallelize_6d_tile_2d of the thread pool; defined outside this
 * header - confirm.
 */
810*b095b053SXin Li PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_2d_fastpath(
811*b095b053SXin Li 	struct pthreadpool* threadpool,
812*b095b053SXin Li 	struct thread_info* thread);
813