xref: /aosp_15_r20/external/pthreadpool/include/pthreadpool.h (revision b095b0533730c2930f947df924a4486d266faa1a)
1*b095b053SXin Li #ifndef PTHREADPOOL_H_
2*b095b053SXin Li #define PTHREADPOOL_H_
3*b095b053SXin Li 
4*b095b053SXin Li #include <stddef.h>
5*b095b053SXin Li #include <stdint.h>
6*b095b053SXin Li 
7*b095b053SXin Li typedef struct pthreadpool* pthreadpool_t;
8*b095b053SXin Li 
9*b095b053SXin Li typedef void (*pthreadpool_task_1d_t)(void*, size_t);
10*b095b053SXin Li typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t);
11*b095b053SXin Li typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t);
12*b095b053SXin Li typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t);
13*b095b053SXin Li typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t);
14*b095b053SXin Li typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t);
15*b095b053SXin Li typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t);
16*b095b053SXin Li typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t);
17*b095b053SXin Li typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
18*b095b053SXin Li typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t);
19*b095b053SXin Li typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
20*b095b053SXin Li typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
21*b095b053SXin Li typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
22*b095b053SXin Li typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
23*b095b053SXin Li typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
24*b095b053SXin Li typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
25*b095b053SXin Li typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
26*b095b053SXin Li 
27*b095b053SXin Li typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t);
28*b095b053SXin Li typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t);
29*b095b053SXin Li typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
30*b095b053SXin Li typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t);
31*b095b053SXin Li 
32*b095b053SXin Li 
33*b095b053SXin Li /**
34*b095b053SXin Li  * Disable support for denormalized numbers to the maximum extent possible for
35*b095b053SXin Li  * the duration of the computation.
36*b095b053SXin Li  *
37*b095b053SXin Li  * Handling denormalized floating-point numbers is often implemented in
38*b095b053SXin Li  * microcode, and incurs significant performance degradation. This hint
39*b095b053SXin Li  * instructs the thread pool to disable support for denormalized numbers before
40*b095b053SXin Li  * running the computation by manipulating architecture-specific control
41*b095b053SXin Li  * registers, and restore the initial value of control registers after the
42*b095b053SXin Li  * computation is complete. The thread pool temporarily disables denormalized
43*b095b053SXin Li  * numbers on all threads involved in the computation (i.e. the caller threads,
44*b095b053SXin Li  * and potentially worker threads).
45*b095b053SXin Li  *
46*b095b053SXin Li  * Disabling denormalized numbers may have a small negative effect on results'
47*b095b053SXin Li  * accuracy. As various architectures differ in capabilities to control
48*b095b053SXin Li  * processing of denormalized numbers, using this flag may also hurt results'
49*b095b053SXin Li  * reproducibility across different instruction set architectures.
50*b095b053SXin Li  */
51*b095b053SXin Li #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
52*b095b053SXin Li 
53*b095b053SXin Li /**
54*b095b053SXin Li  * Yield worker threads to the system scheduler after the operation is finished.
55*b095b053SXin Li  *
56*b095b053SXin Li  * Force workers to use kernel wait (instead of active spin-wait by default) for
57*b095b053SXin Li  * new commands after this command is processed. This flag affects only the
58*b095b053SXin Li  * immediate next operation on this thread pool. To make the thread pool always
59*b095b053SXin Li  * use kernel wait, pass this flag to all parallelization functions.
60*b095b053SXin Li  */
61*b095b053SXin Li #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002
62*b095b053SXin Li 
63*b095b053SXin Li #ifdef __cplusplus
64*b095b053SXin Li extern "C" {
65*b095b053SXin Li #endif
66*b095b053SXin Li 
67*b095b053SXin Li /**
68*b095b053SXin Li  * Create a thread pool with the specified number of threads.
69*b095b053SXin Li  *
70*b095b053SXin Li  * @param  threads_count  the number of threads in the thread pool.
71*b095b053SXin Li  *    A value of 0 has a special interpretation: it creates a thread pool with
72*b095b053SXin Li  *    as many threads as there are logical processors in the system.
73*b095b053SXin Li  *
74*b095b053SXin Li  * @returns  A pointer to an opaque thread pool object if the call is
75*b095b053SXin Li  *    successful, or a NULL pointer if the call failed.
76*b095b053SXin Li  */
77*b095b053SXin Li pthreadpool_t pthreadpool_create(size_t threads_count);
78*b095b053SXin Li 
79*b095b053SXin Li /**
80*b095b053SXin Li  * Query the number of threads in a thread pool.
81*b095b053SXin Li  *
82*b095b053SXin Li  * @param  threadpool  the thread pool to query.
83*b095b053SXin Li  *
84*b095b053SXin Li  * @returns  The number of threads in the thread pool.
85*b095b053SXin Li  */
86*b095b053SXin Li size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
87*b095b053SXin Li 
88*b095b053SXin Li /**
89*b095b053SXin Li  * Process items on a 1D grid.
90*b095b053SXin Li  *
91*b095b053SXin Li  * The function implements a parallel version of the following snippet:
92*b095b053SXin Li  *
93*b095b053SXin Li  *   for (size_t i = 0; i < range; i++)
94*b095b053SXin Li  *     function(context, i);
95*b095b053SXin Li  *
96*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
97*b095b053SXin Li  * is ready for a new task.
98*b095b053SXin Li  *
99*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
100*b095b053SXin Li  *    calls are serialized.
101*b095b053SXin Li  *
102*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
103*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
104*b095b053SXin Li  * @param function    the function to call for each item.
105*b095b053SXin Li  * @param context     the first argument passed to the specified function.
106*b095b053SXin Li  * @param range       the number of items on the 1D grid to process. The
107*b095b053SXin Li  *    specified function will be called once for each item.
108*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
109*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
110*b095b053SXin Li  */
111*b095b053SXin Li void pthreadpool_parallelize_1d(
112*b095b053SXin Li 	pthreadpool_t threadpool,
113*b095b053SXin Li 	pthreadpool_task_1d_t function,
114*b095b053SXin Li 	void* context,
115*b095b053SXin Li 	size_t range,
116*b095b053SXin Li 	uint32_t flags);
117*b095b053SXin Li 
118*b095b053SXin Li /**
119*b095b053SXin Li  * Process items on a 1D grid using a microarchitecture-aware task function.
120*b095b053SXin Li  *
121*b095b053SXin Li  * The function implements a parallel version of the following snippet:
122*b095b053SXin Li  *
123*b095b053SXin Li  *   uint32_t uarch_index = cpuinfo_initialize() ?
124*b095b053SXin Li  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
125*b095b053SXin Li  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
126*b095b053SXin Li  *   for (size_t i = 0; i < range; i++)
127*b095b053SXin Li  *     function(context, uarch_index, i);
128*b095b053SXin Li  *
129*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
130*b095b053SXin Li  * is ready for a new task.
131*b095b053SXin Li  *
132*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
133*b095b053SXin Li  *    calls are serialized.
134*b095b053SXin Li  *
135*b095b053SXin Li  * @param threadpool           the thread pool to use for parallelisation. If
136*b095b053SXin Li  *    threadpool is NULL, all items are processed serially on the calling
137*b095b053SXin Li  *    thread.
138*b095b053SXin Li  * @param function             the function to call for each item.
139*b095b053SXin Li  * @param context              the first argument passed to the specified
140*b095b053SXin Li  *    function.
141*b095b053SXin Li  * @param default_uarch_index  the microarchitecture index to use when
142*b095b053SXin Li  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
143*b095b053SXin Li  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
144*b095b053SXin Li  *    max_uarch_index value.
145*b095b053SXin Li  * @param max_uarch_index      the maximum microarchitecture index expected by
146*b095b053SXin Li  *    the specified function. If the index returned by
147*b095b053SXin Li  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
148*b095b053SXin Li  *    will be used instead. default_uarch_index can exceed max_uarch_index.
149*b095b053SXin Li  * @param range                the number of items on the 1D grid to process.
150*b095b053SXin Li  *    The specified function will be called once for each item.
151*b095b053SXin Li  * @param flags                a bitwise combination of zero or more optional
152*b095b053SXin Li  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
153*b095b053SXin Li  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
154*b095b053SXin Li  */
155*b095b053SXin Li void pthreadpool_parallelize_1d_with_uarch(
156*b095b053SXin Li 	pthreadpool_t threadpool,
157*b095b053SXin Li 	pthreadpool_task_1d_with_id_t function,
158*b095b053SXin Li 	void* context,
159*b095b053SXin Li 	uint32_t default_uarch_index,
160*b095b053SXin Li 	uint32_t max_uarch_index,
161*b095b053SXin Li 	size_t range,
162*b095b053SXin Li 	uint32_t flags);
163*b095b053SXin Li 
164*b095b053SXin Li /**
165*b095b053SXin Li  * Process items on a 1D grid with specified maximum tile size.
166*b095b053SXin Li  *
167*b095b053SXin Li  * The function implements a parallel version of the following snippet:
168*b095b053SXin Li  *
169*b095b053SXin Li  *   for (size_t i = 0; i < range; i += tile)
170*b095b053SXin Li  *     function(context, i, min(range - i, tile));
171*b095b053SXin Li  *
172*b095b053SXin Li  * When the call returns, all items have been processed and the thread pool is
173*b095b053SXin Li  * ready for a new task.
174*b095b053SXin Li  *
175*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool,
176*b095b053SXin Li  *    the calls are serialized.
177*b095b053SXin Li  *
178*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
179*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
180*b095b053SXin Li  * @param function    the function to call for each tile.
181*b095b053SXin Li  * @param context     the first argument passed to the specified function.
182*b095b053SXin Li  * @param range       the number of items on the 1D grid to process.
183*b095b053SXin Li  * @param tile        the maximum number of items on the 1D grid to process in
184*b095b053SXin Li  *    one function call.
185*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
186*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
187*b095b053SXin Li  */
188*b095b053SXin Li void pthreadpool_parallelize_1d_tile_1d(
189*b095b053SXin Li 	pthreadpool_t threadpool,
190*b095b053SXin Li 	pthreadpool_task_1d_tile_1d_t function,
191*b095b053SXin Li 	void* context,
192*b095b053SXin Li 	size_t range,
193*b095b053SXin Li 	size_t tile,
194*b095b053SXin Li 	uint32_t flags);
195*b095b053SXin Li 
196*b095b053SXin Li /**
197*b095b053SXin Li  * Process items on a 2D grid.
198*b095b053SXin Li  *
199*b095b053SXin Li  * The function implements a parallel version of the following snippet:
200*b095b053SXin Li  *
201*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
202*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
203*b095b053SXin Li  *       function(context, i, j);
204*b095b053SXin Li  *
205*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
206*b095b053SXin Li  * is ready for a new task.
207*b095b053SXin Li  *
208*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
209*b095b053SXin Li  *    calls are serialized.
210*b095b053SXin Li  *
211*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
212*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
213*b095b053SXin Li  * @param function    the function to call for each item.
214*b095b053SXin Li  * @param context     the first argument passed to the specified function.
215*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
216*b095b053SXin Li  *    of the 2D grid.
217*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
218*b095b053SXin Li  *    of the 2D grid.
219*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
220*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
221*b095b053SXin Li  */
222*b095b053SXin Li void pthreadpool_parallelize_2d(
223*b095b053SXin Li 	pthreadpool_t threadpool,
224*b095b053SXin Li 	pthreadpool_task_2d_t function,
225*b095b053SXin Li 	void* context,
226*b095b053SXin Li 	size_t range_i,
227*b095b053SXin Li 	size_t range_j,
228*b095b053SXin Li 	uint32_t flags);
229*b095b053SXin Li 
230*b095b053SXin Li /**
231*b095b053SXin Li  * Process items on a 2D grid with the specified maximum tile size along the
232*b095b053SXin Li  * last grid dimension.
233*b095b053SXin Li  *
234*b095b053SXin Li  * The function implements a parallel version of the following snippet:
235*b095b053SXin Li  *
236*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
237*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j += tile_j)
238*b095b053SXin Li  *       function(context, i, j, min(range_j - j, tile_j));
239*b095b053SXin Li  *
240*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
241*b095b053SXin Li  * is ready for a new task.
242*b095b053SXin Li  *
243*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
244*b095b053SXin Li  *    calls are serialized.
245*b095b053SXin Li  *
246*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
247*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
248*b095b053SXin Li  * @param function    the function to call for each tile.
249*b095b053SXin Li  * @param context     the first argument passed to the specified function.
250*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
251*b095b053SXin Li  *    of the 2D grid.
252*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
253*b095b053SXin Li  *    of the 2D grid.
254*b095b053SXin Li  * @param tile_j      the maximum number of items along the second dimension of
255*b095b053SXin Li  *    the 2D grid to process in one function call.
256*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
257*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
258*b095b053SXin Li  */
259*b095b053SXin Li void pthreadpool_parallelize_2d_tile_1d(
260*b095b053SXin Li 	pthreadpool_t threadpool,
261*b095b053SXin Li 	pthreadpool_task_2d_tile_1d_t function,
262*b095b053SXin Li 	void* context,
263*b095b053SXin Li 	size_t range_i,
264*b095b053SXin Li 	size_t range_j,
265*b095b053SXin Li 	size_t tile_j,
266*b095b053SXin Li 	uint32_t flags);
267*b095b053SXin Li 
268*b095b053SXin Li /**
269*b095b053SXin Li  * Process items on a 2D grid with the specified maximum tile size along each
270*b095b053SXin Li  * grid dimension.
271*b095b053SXin Li  *
272*b095b053SXin Li  * The function implements a parallel version of the following snippet:
273*b095b053SXin Li  *
274*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i += tile_i)
275*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j += tile_j)
276*b095b053SXin Li  *       function(context, i, j,
277*b095b053SXin Li  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
278*b095b053SXin Li  *
279*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
280*b095b053SXin Li  * is ready for a new task.
281*b095b053SXin Li  *
282*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
283*b095b053SXin Li  *    calls are serialized.
284*b095b053SXin Li  *
285*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
286*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
287*b095b053SXin Li  * @param function    the function to call for each tile.
288*b095b053SXin Li  * @param context     the first argument passed to the specified function.
289*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
290*b095b053SXin Li  *    of the 2D grid.
291*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
292*b095b053SXin Li  *    of the 2D grid.
293*b095b053SXin Li  * @param tile_i      the maximum number of items along the first dimension of
294*b095b053SXin Li  *    the 2D grid to process in one function call.
295*b095b053SXin Li  * @param tile_j      the maximum number of items along the second dimension of
296*b095b053SXin Li  *    the 2D grid to process in one function call.
297*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
298*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
299*b095b053SXin Li  */
300*b095b053SXin Li void pthreadpool_parallelize_2d_tile_2d(
301*b095b053SXin Li 	pthreadpool_t threadpool,
302*b095b053SXin Li 	pthreadpool_task_2d_tile_2d_t function,
303*b095b053SXin Li 	void* context,
304*b095b053SXin Li 	size_t range_i,
305*b095b053SXin Li 	size_t range_j,
306*b095b053SXin Li 	size_t tile_i,
307*b095b053SXin Li 	size_t tile_j,
308*b095b053SXin Li 	uint32_t flags);
309*b095b053SXin Li 
310*b095b053SXin Li /**
311*b095b053SXin Li  * Process items on a 2D grid with the specified maximum tile size along each
312*b095b053SXin Li  * grid dimension using a microarchitecture-aware task function.
313*b095b053SXin Li  *
314*b095b053SXin Li  * The function implements a parallel version of the following snippet:
315*b095b053SXin Li  *
316*b095b053SXin Li  *   uint32_t uarch_index = cpuinfo_initialize() ?
317*b095b053SXin Li  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
318*b095b053SXin Li  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
319*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i += tile_i)
320*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j += tile_j)
321*b095b053SXin Li  *       function(context, uarch_index, i, j,
322*b095b053SXin Li  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
323*b095b053SXin Li  *
324*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
325*b095b053SXin Li  * is ready for a new task.
326*b095b053SXin Li  *
327*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
328*b095b053SXin Li  *    calls are serialized.
329*b095b053SXin Li  *
330*b095b053SXin Li  * @param threadpool           the thread pool to use for parallelisation. If
331*b095b053SXin Li  *    threadpool is NULL, all items are processed serially on the calling
332*b095b053SXin Li  *    thread.
333*b095b053SXin Li  * @param function             the function to call for each tile.
334*b095b053SXin Li  * @param context              the first argument passed to the specified
335*b095b053SXin Li  *    function.
336*b095b053SXin Li  * @param default_uarch_index  the microarchitecture index to use when
337*b095b053SXin Li  *                             pthreadpool is configured without cpuinfo,
338*b095b053SXin Li  *                             cpuinfo initialization failed, or index returned
339*b095b053SXin Li  *                             by cpuinfo_get_current_uarch_index() exceeds
340*b095b053SXin Li  *                             the max_uarch_index value.
341*b095b053SXin Li  * @param max_uarch_index      the maximum microarchitecture index expected
342*b095b053SXin Li  *                             by the specified function. If the index returned
343*b095b053SXin Li  *                             by cpuinfo_get_current_uarch_index() exceeds this
344*b095b053SXin Li  *                             value, default_uarch_index will be used instead.
345*b095b053SXin Li  *                             default_uarch_index can exceed max_uarch_index.
346*b095b053SXin Li  * @param range_i              the number of items to process along the first
347*b095b053SXin Li  *    dimension of the 2D grid.
348*b095b053SXin Li  * @param range_j              the number of items to process along the second
349*b095b053SXin Li  *    dimension of the 2D grid.
350*b095b053SXin Li  * @param tile_i               the maximum number of items along the first
351*b095b053SXin Li  *    dimension of the 2D grid to process in one function call.
352*b095b053SXin Li  * @param tile_j               the maximum number of items along the second
353*b095b053SXin Li  *    dimension of the 2D grid to process in one function call.
354*b095b053SXin Li  * @param flags                a bitwise combination of zero or more optional
355*b095b053SXin Li  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
356*b095b053SXin Li  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
357*b095b053SXin Li  */
358*b095b053SXin Li void pthreadpool_parallelize_2d_tile_2d_with_uarch(
359*b095b053SXin Li 	pthreadpool_t threadpool,
360*b095b053SXin Li 	pthreadpool_task_2d_tile_2d_with_id_t function,
361*b095b053SXin Li 	void* context,
362*b095b053SXin Li 	uint32_t default_uarch_index,
363*b095b053SXin Li 	uint32_t max_uarch_index,
364*b095b053SXin Li 	size_t range_i,
365*b095b053SXin Li 	size_t range_j,
366*b095b053SXin Li 	size_t tile_i,
367*b095b053SXin Li 	size_t tile_j,
368*b095b053SXin Li 	uint32_t flags);
369*b095b053SXin Li 
370*b095b053SXin Li /**
371*b095b053SXin Li  * Process items on a 3D grid.
372*b095b053SXin Li  *
373*b095b053SXin Li  * The function implements a parallel version of the following snippet:
374*b095b053SXin Li  *
375*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
376*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
377*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
378*b095b053SXin Li  *         function(context, i, j, k);
379*b095b053SXin Li  *
380*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
381*b095b053SXin Li  * is ready for a new task.
382*b095b053SXin Li  *
383*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
384*b095b053SXin Li  *    calls are serialized.
385*b095b053SXin Li  *
386*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
387*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
388*b095b053SXin Li  * @param function    the function to call for each item.
389*b095b053SXin Li  * @param context     the first argument passed to the specified function.
390*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
391*b095b053SXin Li  *    of the 3D grid.
392*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
393*b095b053SXin Li  *    of the 3D grid.
394*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
395*b095b053SXin Li  *    of the 3D grid.
396*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
397*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
398*b095b053SXin Li  */
399*b095b053SXin Li void pthreadpool_parallelize_3d(
400*b095b053SXin Li 	pthreadpool_t threadpool,
401*b095b053SXin Li 	pthreadpool_task_3d_t function,
402*b095b053SXin Li 	void* context,
403*b095b053SXin Li 	size_t range_i,
404*b095b053SXin Li 	size_t range_j,
405*b095b053SXin Li 	size_t range_k,
406*b095b053SXin Li 	uint32_t flags);
407*b095b053SXin Li 
408*b095b053SXin Li /**
409*b095b053SXin Li  * Process items on a 3D grid with the specified maximum tile size along the
410*b095b053SXin Li  * last grid dimension.
411*b095b053SXin Li  *
412*b095b053SXin Li  * The function implements a parallel version of the following snippet:
413*b095b053SXin Li  *
414*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
415*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
416*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k += tile_k)
417*b095b053SXin Li  *         function(context, i, j, k, min(range_k - k, tile_k));
418*b095b053SXin Li  *
419*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
420*b095b053SXin Li  * is ready for a new task.
421*b095b053SXin Li  *
422*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
423*b095b053SXin Li  *    calls are serialized.
424*b095b053SXin Li  *
425*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
426*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
427*b095b053SXin Li  * @param function    the function to call for each tile.
428*b095b053SXin Li  * @param context     the first argument passed to the specified function.
429*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
430*b095b053SXin Li  *    of the 3D grid.
431*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
432*b095b053SXin Li  *    of the 3D grid.
433*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
434*b095b053SXin Li  *    of the 3D grid.
435*b095b053SXin Li  * @param tile_k      the maximum number of items along the third dimension of
436*b095b053SXin Li  *    the 3D grid to process in one function call.
437*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
438*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
439*b095b053SXin Li  */
440*b095b053SXin Li void pthreadpool_parallelize_3d_tile_1d(
441*b095b053SXin Li 	pthreadpool_t threadpool,
442*b095b053SXin Li 	pthreadpool_task_3d_tile_1d_t function,
443*b095b053SXin Li 	void* context,
444*b095b053SXin Li 	size_t range_i,
445*b095b053SXin Li 	size_t range_j,
446*b095b053SXin Li 	size_t range_k,
447*b095b053SXin Li 	size_t tile_k,
448*b095b053SXin Li 	uint32_t flags);
449*b095b053SXin Li 
450*b095b053SXin Li /**
451*b095b053SXin Li  * Process items on a 3D grid with the specified maximum tile size along the
452*b095b053SXin Li  * last two grid dimensions.
453*b095b053SXin Li  *
454*b095b053SXin Li  * The function implements a parallel version of the following snippet:
455*b095b053SXin Li  *
456*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
457*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j += tile_j)
458*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k += tile_k)
459*b095b053SXin Li  *         function(context, i, j, k,
460*b095b053SXin Li  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
461*b095b053SXin Li  *
462*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
463*b095b053SXin Li  * is ready for a new task.
464*b095b053SXin Li  *
465*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
466*b095b053SXin Li  *    calls are serialized.
467*b095b053SXin Li  *
468*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
469*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
470*b095b053SXin Li  * @param function    the function to call for each tile.
471*b095b053SXin Li  * @param context     the first argument passed to the specified function.
472*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
473*b095b053SXin Li  *    of the 3D grid.
474*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
475*b095b053SXin Li  *    of the 3D grid.
476*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
477*b095b053SXin Li  *    of the 3D grid.
478*b095b053SXin Li  * @param tile_j      the maximum number of items along the second dimension of
479*b095b053SXin Li  *    the 3D grid to process in one function call.
480*b095b053SXin Li  * @param tile_k      the maximum number of items along the third dimension of
481*b095b053SXin Li  *    the 3D grid to process in one function call.
482*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
483*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
484*b095b053SXin Li  */
485*b095b053SXin Li void pthreadpool_parallelize_3d_tile_2d(
486*b095b053SXin Li 	pthreadpool_t threadpool,
487*b095b053SXin Li 	pthreadpool_task_3d_tile_2d_t function,
488*b095b053SXin Li 	void* context,
489*b095b053SXin Li 	size_t range_i,
490*b095b053SXin Li 	size_t range_j,
491*b095b053SXin Li 	size_t range_k,
492*b095b053SXin Li 	size_t tile_j,
493*b095b053SXin Li 	size_t tile_k,
494*b095b053SXin Li 	uint32_t flags);
495*b095b053SXin Li 
496*b095b053SXin Li /**
497*b095b053SXin Li  * Process items on a 3D grid with the specified maximum tile size along the
498*b095b053SXin Li  * last two grid dimensions using a microarchitecture-aware task function.
499*b095b053SXin Li  *
500*b095b053SXin Li  * The function implements a parallel version of the following snippet:
501*b095b053SXin Li  *
502*b095b053SXin Li  *   uint32_t uarch_index = cpuinfo_initialize() ?
503*b095b053SXin Li  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
504*b095b053SXin Li  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
505*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
506*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j += tile_j)
507*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k += tile_k)
508*b095b053SXin Li  *         function(context, uarch_index, i, j, k,
509*b095b053SXin Li  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
510*b095b053SXin Li  *
511*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
512*b095b053SXin Li  * is ready for a new task.
513*b095b053SXin Li  *
514*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
515*b095b053SXin Li  *    calls are serialized.
516*b095b053SXin Li  *
517*b095b053SXin Li  * @param threadpool           the thread pool to use for parallelisation. If
518*b095b053SXin Li  *    threadpool is NULL, all items are processed serially on the calling
519*b095b053SXin Li  *    thread.
520*b095b053SXin Li  * @param function             the function to call for each tile.
521*b095b053SXin Li  * @param context              the first argument passed to the specified
522*b095b053SXin Li  *    function.
523*b095b053SXin Li  * @param default_uarch_index  the microarchitecture index to use when
524*b095b053SXin Li  *    pthreadpool is configured without cpuinfo, cpuinfo initialization fails,
525*b095b053SXin Li  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
526*b095b053SXin Li  *    max_uarch_index value.
527*b095b053SXin Li  * @param max_uarch_index      the maximum microarchitecture index expected by
528*b095b053SXin Li  *    the specified function. If the index returned by
529*b095b053SXin Li  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
530*b095b053SXin Li  *    will be used instead. default_uarch_index can exceed max_uarch_index.
531*b095b053SXin Li  * @param range_i              the number of items to process along the first
532*b095b053SXin Li  *    dimension of the 3D grid.
533*b095b053SXin Li  * @param range_j              the number of items to process along the second
534*b095b053SXin Li  *    dimension of the 3D grid.
535*b095b053SXin Li  * @param range_k              the number of items to process along the third
536*b095b053SXin Li  *    dimension of the 3D grid.
537*b095b053SXin Li  * @param tile_j               the maximum number of items along the second
538*b095b053SXin Li  *    dimension of the 3D grid to process in one function call.
539*b095b053SXin Li  * @param tile_k               the maximum number of items along the third
540*b095b053SXin Li  *    dimension of the 3D grid to process in one function call.
541*b095b053SXin Li  * @param flags                a bitwise combination of zero or more optional
542*b095b053SXin Li  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
543*b095b053SXin Li  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
544*b095b053SXin Li  */
545*b095b053SXin Li void pthreadpool_parallelize_3d_tile_2d_with_uarch(
546*b095b053SXin Li 	pthreadpool_t threadpool,
547*b095b053SXin Li 	pthreadpool_task_3d_tile_2d_with_id_t function,
548*b095b053SXin Li 	void* context,
549*b095b053SXin Li 	uint32_t default_uarch_index,
550*b095b053SXin Li 	uint32_t max_uarch_index,
551*b095b053SXin Li 	size_t range_i,
552*b095b053SXin Li 	size_t range_j,
553*b095b053SXin Li 	size_t range_k,
554*b095b053SXin Li 	size_t tile_j,
555*b095b053SXin Li 	size_t tile_k,
556*b095b053SXin Li 	uint32_t flags);
557*b095b053SXin Li 
558*b095b053SXin Li /**
559*b095b053SXin Li  * Process items on a 4D grid.
560*b095b053SXin Li  *
561*b095b053SXin Li  * The function implements a parallel version of the following snippet:
562*b095b053SXin Li  *
563*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
564*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
565*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
566*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l++)
567*b095b053SXin Li  *           function(context, i, j, k, l);
568*b095b053SXin Li  *
569*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
570*b095b053SXin Li  * is ready for a new task.
571*b095b053SXin Li  *
572*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
573*b095b053SXin Li  *    calls are serialized.
574*b095b053SXin Li  *
575*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
576*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
577*b095b053SXin Li  * @param function    the function to call for each item.
578*b095b053SXin Li  * @param context     the first argument passed to the specified function.
579*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
580*b095b053SXin Li  *    of the 4D grid.
581*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
582*b095b053SXin Li  *    of the 4D grid.
583*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
584*b095b053SXin Li  *    of the 4D grid.
585*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
586*b095b053SXin Li  *    of the 4D grid.
587*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
588*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
589*b095b053SXin Li  */
590*b095b053SXin Li void pthreadpool_parallelize_4d(
591*b095b053SXin Li 	pthreadpool_t threadpool,
592*b095b053SXin Li 	pthreadpool_task_4d_t function,
593*b095b053SXin Li 	void* context,
594*b095b053SXin Li 	size_t range_i,
595*b095b053SXin Li 	size_t range_j,
596*b095b053SXin Li 	size_t range_k,
597*b095b053SXin Li 	size_t range_l,
598*b095b053SXin Li 	uint32_t flags);
599*b095b053SXin Li 
600*b095b053SXin Li /**
601*b095b053SXin Li  * Process items on a 4D grid with the specified maximum tile size along the
602*b095b053SXin Li  * last grid dimension.
603*b095b053SXin Li  *
604*b095b053SXin Li  * The function implements a parallel version of the following snippet:
605*b095b053SXin Li  *
606*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
607*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
608*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
609*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l += tile_l)
610*b095b053SXin Li  *           function(context, i, j, k, l, min(range_l - l, tile_l));
611*b095b053SXin Li  *
612*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
613*b095b053SXin Li  * is ready for a new task.
614*b095b053SXin Li  *
615*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
616*b095b053SXin Li  *    calls are serialized.
617*b095b053SXin Li  *
618*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
619*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
620*b095b053SXin Li  * @param function    the function to call for each tile.
621*b095b053SXin Li  * @param context     the first argument passed to the specified function.
622*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
623*b095b053SXin Li  *    of the 4D grid.
624*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
625*b095b053SXin Li  *    of the 4D grid.
626*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
627*b095b053SXin Li  *    of the 4D grid.
628*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
629*b095b053SXin Li  *    of the 4D grid.
630*b095b053SXin Li  * @param tile_l      the maximum number of items along the fourth dimension of
631*b095b053SXin Li  *    the 4D grid to process in one function call.
632*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
633*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
634*b095b053SXin Li  */
635*b095b053SXin Li void pthreadpool_parallelize_4d_tile_1d(
636*b095b053SXin Li 	pthreadpool_t threadpool,
637*b095b053SXin Li 	pthreadpool_task_4d_tile_1d_t function,
638*b095b053SXin Li 	void* context,
639*b095b053SXin Li 	size_t range_i,
640*b095b053SXin Li 	size_t range_j,
641*b095b053SXin Li 	size_t range_k,
642*b095b053SXin Li 	size_t range_l,
643*b095b053SXin Li 	size_t tile_l,
644*b095b053SXin Li 	uint32_t flags);
645*b095b053SXin Li 
646*b095b053SXin Li /**
647*b095b053SXin Li  * Process items on a 4D grid with the specified maximum tile size along the
648*b095b053SXin Li  * last two grid dimensions.
649*b095b053SXin Li  *
650*b095b053SXin Li  * The function implements a parallel version of the following snippet:
651*b095b053SXin Li  *
652*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
653*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
654*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k += tile_k)
655*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l += tile_l)
656*b095b053SXin Li  *           function(context, i, j, k, l,
657*b095b053SXin Li  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
658*b095b053SXin Li  *
659*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
660*b095b053SXin Li  * is ready for a new task.
661*b095b053SXin Li  *
662*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
663*b095b053SXin Li  *    calls are serialized.
664*b095b053SXin Li  *
665*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
666*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
667*b095b053SXin Li  * @param function    the function to call for each tile.
668*b095b053SXin Li  * @param context     the first argument passed to the specified function.
669*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
670*b095b053SXin Li  *    of the 4D grid.
671*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
672*b095b053SXin Li  *    of the 4D grid.
673*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
674*b095b053SXin Li  *    of the 4D grid.
675*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
676*b095b053SXin Li  *    of the 4D grid.
677*b095b053SXin Li  * @param tile_k      the maximum number of items along the third dimension of
678*b095b053SXin Li  *    the 4D grid to process in one function call.
679*b095b053SXin Li  * @param tile_l      the maximum number of items along the fourth dimension of
680*b095b053SXin Li  *    the 4D grid to process in one function call.
681*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
682*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
683*b095b053SXin Li  */
684*b095b053SXin Li void pthreadpool_parallelize_4d_tile_2d(
685*b095b053SXin Li 	pthreadpool_t threadpool,
686*b095b053SXin Li 	pthreadpool_task_4d_tile_2d_t function,
687*b095b053SXin Li 	void* context,
688*b095b053SXin Li 	size_t range_i,
689*b095b053SXin Li 	size_t range_j,
690*b095b053SXin Li 	size_t range_k,
691*b095b053SXin Li 	size_t range_l,
692*b095b053SXin Li 	size_t tile_k,
693*b095b053SXin Li 	size_t tile_l,
694*b095b053SXin Li 	uint32_t flags);
695*b095b053SXin Li 
696*b095b053SXin Li /**
697*b095b053SXin Li  * Process items on a 4D grid with the specified maximum tile size along the
698*b095b053SXin Li  * last two grid dimensions using a microarchitecture-aware task function.
699*b095b053SXin Li  *
700*b095b053SXin Li  * The function implements a parallel version of the following snippet:
701*b095b053SXin Li  *
702*b095b053SXin Li  *   uint32_t uarch_index = cpuinfo_initialize() ?
703*b095b053SXin Li  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
704*b095b053SXin Li  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
705*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
706*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
707*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k += tile_k)
708*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l += tile_l)
709*b095b053SXin Li  *           function(context, uarch_index, i, j, k, l,
710*b095b053SXin Li  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
711*b095b053SXin Li  *
712*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
713*b095b053SXin Li  * is ready for a new task.
714*b095b053SXin Li  *
715*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
716*b095b053SXin Li  *    calls are serialized.
717*b095b053SXin Li  *
718*b095b053SXin Li  * @param threadpool           the thread pool to use for parallelisation. If
719*b095b053SXin Li  *    threadpool is NULL, all items are processed serially on the calling
720*b095b053SXin Li  *    thread.
721*b095b053SXin Li  * @param function             the function to call for each tile.
722*b095b053SXin Li  * @param context              the first argument passed to the specified
723*b095b053SXin Li  *    function.
724*b095b053SXin Li  * @param default_uarch_index  the microarchitecture index to use when
725*b095b053SXin Li  *    pthreadpool is configured without cpuinfo, cpuinfo initialization fails,
726*b095b053SXin Li  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
727*b095b053SXin Li  *    max_uarch_index value.
728*b095b053SXin Li  * @param max_uarch_index      the maximum microarchitecture index expected by
729*b095b053SXin Li  *    the specified function. If the index returned by
730*b095b053SXin Li  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
731*b095b053SXin Li  *    will be used instead. default_uarch_index can exceed max_uarch_index.
732*b095b053SXin Li  * @param range_i              the number of items to process along the first
733*b095b053SXin Li  *    dimension of the 4D grid.
734*b095b053SXin Li  * @param range_j              the number of items to process along the second
735*b095b053SXin Li  *    dimension of the 4D grid.
736*b095b053SXin Li  * @param range_k              the number of items to process along the third
737*b095b053SXin Li  *    dimension of the 4D grid.
738*b095b053SXin Li  * @param range_l              the number of items to process along the fourth
739*b095b053SXin Li  *    dimension of the 4D grid.
740*b095b053SXin Li  * @param tile_k               the maximum number of items along the third
741*b095b053SXin Li  *    dimension of the 4D grid to process in one function call.
742*b095b053SXin Li  * @param tile_l               the maximum number of items along the fourth
743*b095b053SXin Li  *    dimension of the 4D grid to process in one function call.
744*b095b053SXin Li  * @param flags                a bitwise combination of zero or more optional
745*b095b053SXin Li  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
746*b095b053SXin Li  *    PTHREADPOOL_FLAG_YIELD_WORKERS)
747*b095b053SXin Li  */
748*b095b053SXin Li void pthreadpool_parallelize_4d_tile_2d_with_uarch(
749*b095b053SXin Li 	pthreadpool_t threadpool,
750*b095b053SXin Li 	pthreadpool_task_4d_tile_2d_with_id_t function,
751*b095b053SXin Li 	void* context,
752*b095b053SXin Li 	uint32_t default_uarch_index,
753*b095b053SXin Li 	uint32_t max_uarch_index,
754*b095b053SXin Li 	size_t range_i,
755*b095b053SXin Li 	size_t range_j,
756*b095b053SXin Li 	size_t range_k,
757*b095b053SXin Li 	size_t range_l,
758*b095b053SXin Li 	size_t tile_k,
759*b095b053SXin Li 	size_t tile_l,
760*b095b053SXin Li 	uint32_t flags);
761*b095b053SXin Li 
762*b095b053SXin Li /**
763*b095b053SXin Li  * Process items on a 5D grid.
764*b095b053SXin Li  *
765*b095b053SXin Li  * The function implements a parallel version of the following snippet:
766*b095b053SXin Li  *
767*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
768*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
769*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
770*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l++)
771*b095b053SXin Li  *           for (size_t m = 0; m < range_m; m++)
772*b095b053SXin Li  *             function(context, i, j, k, l, m);
773*b095b053SXin Li  *
774*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
775*b095b053SXin Li  * is ready for a new task.
776*b095b053SXin Li  *
777*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
778*b095b053SXin Li  *    calls are serialized.
779*b095b053SXin Li  *
780*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
781*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
782*b095b053SXin Li  * @param function    the function to call for each item.
783*b095b053SXin Li  * @param context     the first argument passed to the specified function.
784*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
785*b095b053SXin Li  *    of the 5D grid.
786*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
787*b095b053SXin Li  *    of the 5D grid.
788*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
789*b095b053SXin Li  *    of the 5D grid.
790*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
791*b095b053SXin Li  *    of the 5D grid.
792*b095b053SXin Li  * @param range_m     the number of items to process along the fifth dimension
793*b095b053SXin Li  *    of the 5D grid.
794*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
795*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
796*b095b053SXin Li  */
797*b095b053SXin Li void pthreadpool_parallelize_5d(
798*b095b053SXin Li 	pthreadpool_t threadpool,
799*b095b053SXin Li 	pthreadpool_task_5d_t function,
800*b095b053SXin Li 	void* context,
801*b095b053SXin Li 	size_t range_i,
802*b095b053SXin Li 	size_t range_j,
803*b095b053SXin Li 	size_t range_k,
804*b095b053SXin Li 	size_t range_l,
805*b095b053SXin Li 	size_t range_m,
806*b095b053SXin Li 	uint32_t flags);
807*b095b053SXin Li 
808*b095b053SXin Li /**
809*b095b053SXin Li  * Process items on a 5D grid with the specified maximum tile size along the
810*b095b053SXin Li  * last grid dimension.
811*b095b053SXin Li  *
812*b095b053SXin Li  * The function implements a parallel version of the following snippet:
813*b095b053SXin Li  *
814*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
815*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
816*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
817*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l++)
818*b095b053SXin Li  *           for (size_t m = 0; m < range_m; m += tile_m)
819*b095b053SXin Li  *             function(context, i, j, k, l, m, min(range_m - m, tile_m));
820*b095b053SXin Li  *
821*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
822*b095b053SXin Li  * is ready for a new task.
823*b095b053SXin Li  *
824*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
825*b095b053SXin Li  *    calls are serialized.
826*b095b053SXin Li  *
827*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
828*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
829*b095b053SXin Li  * @param function    the function to call for each tile.
830*b095b053SXin Li  * @param context     the first argument passed to the specified function.
831*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
832*b095b053SXin Li  *    of the 5D grid.
833*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
834*b095b053SXin Li  *    of the 5D grid.
835*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
836*b095b053SXin Li  *    of the 5D grid.
837*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
838*b095b053SXin Li  *    of the 5D grid.
839*b095b053SXin Li  * @param range_m     the number of items to process along the fifth dimension
840*b095b053SXin Li  *    of the 5D grid.
841*b095b053SXin Li  * @param tile_m      the maximum number of items along the fifth dimension of
842*b095b053SXin Li  *    the 5D grid to process in one function call.
843*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
844*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
845*b095b053SXin Li  */
846*b095b053SXin Li void pthreadpool_parallelize_5d_tile_1d(
847*b095b053SXin Li 	pthreadpool_t threadpool,
848*b095b053SXin Li 	pthreadpool_task_5d_tile_1d_t function,
849*b095b053SXin Li 	void* context,
850*b095b053SXin Li 	size_t range_i,
851*b095b053SXin Li 	size_t range_j,
852*b095b053SXin Li 	size_t range_k,
853*b095b053SXin Li 	size_t range_l,
854*b095b053SXin Li 	size_t range_m,
855*b095b053SXin Li 	size_t tile_m,
856*b095b053SXin Li 	uint32_t flags);
857*b095b053SXin Li 
858*b095b053SXin Li /**
859*b095b053SXin Li  * Process items on a 5D grid with the specified maximum tile size along the
860*b095b053SXin Li  * last two grid dimensions.
861*b095b053SXin Li  *
862*b095b053SXin Li  * The function implements a parallel version of the following snippet:
863*b095b053SXin Li  *
864*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
865*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
866*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
867*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l += tile_l)
868*b095b053SXin Li  *           for (size_t m = 0; m < range_m; m += tile_m)
869*b095b053SXin Li  *             function(context, i, j, k, l, m,
870*b095b053SXin Li  *               min(range_l - l, tile_l), min(range_m - m, tile_m));
871*b095b053SXin Li  *
872*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
873*b095b053SXin Li  * is ready for a new task.
874*b095b053SXin Li  *
875*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
876*b095b053SXin Li  *    calls are serialized.
877*b095b053SXin Li  *
878*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
879*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
880*b095b053SXin Li  * @param function    the function to call for each tile.
881*b095b053SXin Li  * @param context     the first argument passed to the specified function.
882*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
883*b095b053SXin Li  *    of the 5D grid.
884*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
885*b095b053SXin Li  *    of the 5D grid.
886*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
887*b095b053SXin Li  *    of the 5D grid.
888*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
889*b095b053SXin Li  *    of the 5D grid.
890*b095b053SXin Li  * @param range_m     the number of items to process along the fifth dimension
891*b095b053SXin Li  *    of the 5D grid.
892*b095b053SXin Li  * @param tile_l      the maximum number of items along the fourth dimension of
893*b095b053SXin Li  *    the 5D grid to process in one function call.
894*b095b053SXin Li  * @param tile_m      the maximum number of items along the fifth dimension of
895*b095b053SXin Li  *    the 5D grid to process in one function call.
896*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
897*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
898*b095b053SXin Li  */
899*b095b053SXin Li void pthreadpool_parallelize_5d_tile_2d(
900*b095b053SXin Li 	pthreadpool_t threadpool,
901*b095b053SXin Li 	pthreadpool_task_5d_tile_2d_t function,
902*b095b053SXin Li 	void* context,
903*b095b053SXin Li 	size_t range_i,
904*b095b053SXin Li 	size_t range_j,
905*b095b053SXin Li 	size_t range_k,
906*b095b053SXin Li 	size_t range_l,
907*b095b053SXin Li 	size_t range_m,
908*b095b053SXin Li 	size_t tile_l,
909*b095b053SXin Li 	size_t tile_m,
910*b095b053SXin Li 	uint32_t flags);
911*b095b053SXin Li 
912*b095b053SXin Li /**
913*b095b053SXin Li  * Process items on a 6D grid.
914*b095b053SXin Li  *
915*b095b053SXin Li  * The function implements a parallel version of the following snippet:
916*b095b053SXin Li  *
917*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
918*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
919*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
920*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l++)
921*b095b053SXin Li  *           for (size_t m = 0; m < range_m; m++)
922*b095b053SXin Li  *             for (size_t n = 0; n < range_n; n++)
923*b095b053SXin Li  *               function(context, i, j, k, l, m, n);
924*b095b053SXin Li  *
925*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
926*b095b053SXin Li  * is ready for a new task.
927*b095b053SXin Li  *
928*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
929*b095b053SXin Li  *    calls are serialized.
930*b095b053SXin Li  *
931*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
932*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
933*b095b053SXin Li  * @param function    the function to call for each item.
934*b095b053SXin Li  * @param context     the first argument passed to the specified function.
935*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
936*b095b053SXin Li  *    of the 6D grid.
937*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
938*b095b053SXin Li  *    of the 6D grid.
939*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
940*b095b053SXin Li  *    of the 6D grid.
941*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
942*b095b053SXin Li  *    of the 6D grid.
943*b095b053SXin Li  * @param range_m     the number of items to process along the fifth dimension
944*b095b053SXin Li  *    of the 6D grid.
945*b095b053SXin Li  * @param range_n     the number of items to process along the sixth dimension
946*b095b053SXin Li  *    of the 6D grid.
949*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
950*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
951*b095b053SXin Li  */
952*b095b053SXin Li void pthreadpool_parallelize_6d(
953*b095b053SXin Li   pthreadpool_t threadpool,
954*b095b053SXin Li   pthreadpool_task_6d_t function,
955*b095b053SXin Li   void* context,
956*b095b053SXin Li   size_t range_i,
957*b095b053SXin Li   size_t range_j,
958*b095b053SXin Li   size_t range_k,
959*b095b053SXin Li   size_t range_l,
960*b095b053SXin Li   size_t range_m,
961*b095b053SXin Li   size_t range_n,
962*b095b053SXin Li   uint32_t flags);
963*b095b053SXin Li 
964*b095b053SXin Li /**
965*b095b053SXin Li  * Process items on a 6D grid with the specified maximum tile size along the
966*b095b053SXin Li  * last grid dimension.
967*b095b053SXin Li  *
968*b095b053SXin Li  * The function implements a parallel version of the following snippet:
969*b095b053SXin Li  *
970*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
971*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
972*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
973*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l++)
974*b095b053SXin Li  *           for (size_t m = 0; m < range_m; m++)
975*b095b053SXin Li  *             for (size_t n = 0; n < range_n; n += tile_n)
976*b095b053SXin Li  *               function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
977*b095b053SXin Li  *
978*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
979*b095b053SXin Li  * is ready for a new task.
980*b095b053SXin Li  *
981*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
982*b095b053SXin Li  *    calls are serialized.
983*b095b053SXin Li  *
984*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
985*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
986*b095b053SXin Li  * @param function    the function to call for each tile.
987*b095b053SXin Li  * @param context     the first argument passed to the specified function.
988*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
989*b095b053SXin Li  *    of the 6D grid.
990*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
991*b095b053SXin Li  *    of the 6D grid.
992*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
993*b095b053SXin Li  *    of the 6D grid.
994*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
995*b095b053SXin Li  *    of the 6D grid.
996*b095b053SXin Li  * @param range_m     the number of items to process along the fifth dimension
997*b095b053SXin Li  *    of the 6D grid.
998*b095b053SXin Li  * @param range_n     the number of items to process along the sixth dimension
999*b095b053SXin Li  *    of the 6D grid.
1000*b095b053SXin Li  * @param tile_n      the maximum number of items along the sixth dimension of
1001*b095b053SXin Li  *    the 6D grid to process in one function call.
1002*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
1003*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1004*b095b053SXin Li  */
1005*b095b053SXin Li void pthreadpool_parallelize_6d_tile_1d(
1006*b095b053SXin Li   pthreadpool_t threadpool,
1007*b095b053SXin Li   pthreadpool_task_6d_tile_1d_t function,
1008*b095b053SXin Li   void* context,
1009*b095b053SXin Li   size_t range_i,
1010*b095b053SXin Li   size_t range_j,
1011*b095b053SXin Li   size_t range_k,
1012*b095b053SXin Li   size_t range_l,
1013*b095b053SXin Li   size_t range_m,
1014*b095b053SXin Li   size_t range_n,
1015*b095b053SXin Li   size_t tile_n,
1016*b095b053SXin Li   uint32_t flags);
1017*b095b053SXin Li 
1018*b095b053SXin Li /**
1019*b095b053SXin Li  * Process items on a 6D grid with the specified maximum tile size along the
1020*b095b053SXin Li  * last two grid dimensions.
1021*b095b053SXin Li  *
1022*b095b053SXin Li  * The function implements a parallel version of the following snippet:
1023*b095b053SXin Li  *
1024*b095b053SXin Li  *   for (size_t i = 0; i < range_i; i++)
1025*b095b053SXin Li  *     for (size_t j = 0; j < range_j; j++)
1026*b095b053SXin Li  *       for (size_t k = 0; k < range_k; k++)
1027*b095b053SXin Li  *         for (size_t l = 0; l < range_l; l++)
1028*b095b053SXin Li  *           for (size_t m = 0; m < range_m; m += tile_m)
1029*b095b053SXin Li  *             for (size_t n = 0; n < range_n; n += tile_n)
1030*b095b053SXin Li  *               function(context, i, j, k, l, m, n,
1031*b095b053SXin Li  *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
1032*b095b053SXin Li  *
1033*b095b053SXin Li  * When the function returns, all items have been processed and the thread pool
1034*b095b053SXin Li  * is ready for a new task.
1035*b095b053SXin Li  *
1036*b095b053SXin Li  * @note If multiple threads call this function with the same thread pool, the
1037*b095b053SXin Li  *    calls are serialized.
1038*b095b053SXin Li  *
1039*b095b053SXin Li  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1040*b095b053SXin Li  *    is NULL, all items are processed serially on the calling thread.
1041*b095b053SXin Li  * @param function    the function to call for each tile.
1042*b095b053SXin Li  * @param context     the first argument passed to the specified function.
1043*b095b053SXin Li  * @param range_i     the number of items to process along the first dimension
1044*b095b053SXin Li  *    of the 6D grid.
1045*b095b053SXin Li  * @param range_j     the number of items to process along the second dimension
1046*b095b053SXin Li  *    of the 6D grid.
1047*b095b053SXin Li  * @param range_k     the number of items to process along the third dimension
1048*b095b053SXin Li  *    of the 6D grid.
1049*b095b053SXin Li  * @param range_l     the number of items to process along the fourth dimension
1050*b095b053SXin Li  *    of the 6D grid.
1051*b095b053SXin Li  * @param range_m     the number of items to process along the fifth dimension
1052*b095b053SXin Li  *    of the 6D grid.
1053*b095b053SXin Li  * @param range_n     the number of items to process along the sixth dimension
1054*b095b053SXin Li  *    of the 6D grid.
1055*b095b053SXin Li  * @param tile_m      the maximum number of items along the fifth dimension of
1056*b095b053SXin Li  *    the 6D grid to process in one function call.
1057*b095b053SXin Li  * @param tile_n      the maximum number of items along the sixth dimension of
1058*b095b053SXin Li  *    the 6D grid to process in one function call.
1059*b095b053SXin Li  * @param flags       a bitwise combination of zero or more optional flags
1060*b095b053SXin Li  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1061*b095b053SXin Li  */
1062*b095b053SXin Li void pthreadpool_parallelize_6d_tile_2d(
1063*b095b053SXin Li 	pthreadpool_t threadpool,
1064*b095b053SXin Li 	pthreadpool_task_6d_tile_2d_t function,
1065*b095b053SXin Li 	void* context,
1066*b095b053SXin Li 	size_t range_i,
1067*b095b053SXin Li 	size_t range_j,
1068*b095b053SXin Li 	size_t range_k,
1069*b095b053SXin Li 	size_t range_l,
1070*b095b053SXin Li 	size_t range_m,
1071*b095b053SXin Li 	size_t range_n,
1072*b095b053SXin Li 	size_t tile_m,
1073*b095b053SXin Li 	size_t tile_n,
1074*b095b053SXin Li 	uint32_t flags);
1075*b095b053SXin Li 
1076*b095b053SXin Li /**
1077*b095b053SXin Li  * Terminates threads in the thread pool and releases associated resources.
1078*b095b053SXin Li  *
1079*b095b053SXin Li  * @warning  Accessing the thread pool after a call to this function constitutes
1080*b095b053SXin Li  *    undefined behaviour and may cause data corruption.
1081*b095b053SXin Li  *
1082*b095b053SXin Li  * @param[in,out]  threadpool  The thread pool to destroy.
1083*b095b053SXin Li  */
1084*b095b053SXin Li void pthreadpool_destroy(pthreadpool_t threadpool);
1085*b095b053SXin Li 
1086*b095b053SXin Li 
1087*b095b053SXin Li #ifndef PTHREADPOOL_NO_DEPRECATED_API
1088*b095b053SXin Li 
1089*b095b053SXin Li /* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */
1090*b095b053SXin Li #if defined(__GNUC__)
1091*b095b053SXin Li 	#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
1092*b095b053SXin Li #else
1093*b095b053SXin Li 	#define PTHREADPOOL_DEPRECATED
1094*b095b053SXin Li #endif
1095*b095b053SXin Li 
1096*b095b053SXin Li typedef void (*pthreadpool_function_1d_t)(void*, size_t) PTHREADPOOL_DEPRECATED;
1097*b095b053SXin Li typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED;
1098*b095b053SXin Li typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED;
1099*b095b053SXin Li typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
1100*b095b053SXin Li typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
1101*b095b053SXin Li typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
1102*b095b053SXin Li 
1103*b095b053SXin Li void pthreadpool_compute_1d(
1104*b095b053SXin Li 	pthreadpool_t threadpool,
1105*b095b053SXin Li 	pthreadpool_function_1d_t function,
1106*b095b053SXin Li 	void* argument,
1107*b095b053SXin Li 	size_t range) PTHREADPOOL_DEPRECATED;
1108*b095b053SXin Li 
1109*b095b053SXin Li void pthreadpool_compute_1d_tiled(
1110*b095b053SXin Li 	pthreadpool_t threadpool,
1111*b095b053SXin Li 	pthreadpool_function_1d_tiled_t function,
1112*b095b053SXin Li 	void* argument,
1113*b095b053SXin Li 	size_t range,
1114*b095b053SXin Li 	size_t tile) PTHREADPOOL_DEPRECATED;
1115*b095b053SXin Li 
1116*b095b053SXin Li void pthreadpool_compute_2d(
1117*b095b053SXin Li 	pthreadpool_t threadpool,
1118*b095b053SXin Li 	pthreadpool_function_2d_t function,
1119*b095b053SXin Li 	void* argument,
1120*b095b053SXin Li 	size_t range_i,
1121*b095b053SXin Li 	size_t range_j) PTHREADPOOL_DEPRECATED;
1122*b095b053SXin Li 
1123*b095b053SXin Li void pthreadpool_compute_2d_tiled(
1124*b095b053SXin Li 	pthreadpool_t threadpool,
1125*b095b053SXin Li 	pthreadpool_function_2d_tiled_t function,
1126*b095b053SXin Li 	void* argument,
1127*b095b053SXin Li 	size_t range_i,
1128*b095b053SXin Li 	size_t range_j,
1129*b095b053SXin Li 	size_t tile_i,
1130*b095b053SXin Li 	size_t tile_j) PTHREADPOOL_DEPRECATED;
1131*b095b053SXin Li 
1132*b095b053SXin Li void pthreadpool_compute_3d_tiled(
1133*b095b053SXin Li 	pthreadpool_t threadpool,
1134*b095b053SXin Li 	pthreadpool_function_3d_tiled_t function,
1135*b095b053SXin Li 	void* argument,
1136*b095b053SXin Li 	size_t range_i,
1137*b095b053SXin Li 	size_t range_j,
1138*b095b053SXin Li 	size_t range_k,
1139*b095b053SXin Li 	size_t tile_i,
1140*b095b053SXin Li 	size_t tile_j,
1141*b095b053SXin Li 	size_t tile_k) PTHREADPOOL_DEPRECATED;
1142*b095b053SXin Li 
1143*b095b053SXin Li void pthreadpool_compute_4d_tiled(
1144*b095b053SXin Li 	pthreadpool_t threadpool,
1145*b095b053SXin Li 	pthreadpool_function_4d_tiled_t function,
1146*b095b053SXin Li 	void* argument,
1147*b095b053SXin Li 	size_t range_i,
1148*b095b053SXin Li 	size_t range_j,
1149*b095b053SXin Li 	size_t range_k,
1150*b095b053SXin Li 	size_t range_l,
1151*b095b053SXin Li 	size_t tile_i,
1152*b095b053SXin Li 	size_t tile_j,
1153*b095b053SXin Li 	size_t tile_k,
1154*b095b053SXin Li 	size_t tile_l) PTHREADPOOL_DEPRECATED;
1155*b095b053SXin Li 
1156*b095b053SXin Li #endif /* PTHREADPOOL_NO_DEPRECATED_API */
1157*b095b053SXin Li 
1158*b095b053SXin Li #ifdef __cplusplus
1159*b095b053SXin Li } /* extern "C" */
1160*b095b053SXin Li #endif
1161*b095b053SXin Li 
1162*b095b053SXin Li #endif /* PTHREADPOOL_H_ */
1163