// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _PSTL_PARALLEL_BACKEND_TBB_H
#define _PSTL_PARALLEL_BACKEND_TBB_H

#include <__assert>
#include <__config>
#include <algorithm>
#include <type_traits>

#include "parallel_backend_utils.h"

// Bring in minimal required subset of Intel TBB
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/parallel_scan.h>
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/task.h>

#if TBB_INTERFACE_VERSION < 10000
#    error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
#endif

namespace __pstl
{
namespace __tbb_backend
{

//! Raw memory buffer with automatic freeing and no exceptions.
/** Some of our algorithms need to start with a raw memory buffer,
not an initialized array, because initialization/destruction
would make the span be at least O(N). */
// tbb::allocator can improve performance in some cases.
template <typename _Tp>
class __buffer
{
    tbb::tbb_allocator<_Tp> _M_allocator;
    _Tp* _M_ptr;
    const std::size_t _M_buf_size;
    __buffer(const __buffer&) = delete;
    void
    operator=(const __buffer&) = delete;

  public:
    //! Try to obtain a buffer of given size to store objects of _Tp type
    __buffer(std::size_t n) : _M_allocator(), _M_ptr(_M_allocator.allocate(n)), _M_buf_size(n) {}
    //! True if buffer was successfully obtained, false otherwise.
    operator bool() const { return _M_ptr != NULL; }
    //! Return pointer to buffer, or NULL if buffer could not be obtained.
    _Tp*
    get() const
    {
        return _M_ptr;
    }
    //! Destroy buffer
    ~__buffer() { _M_allocator.deallocate(_M_ptr, _M_buf_size); }
};
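
// Illustrative usage (a sketch, not part of the backend interface): the buffer is
// raw storage, so callers construct elements in place and destroy them themselves.
//     __buffer<_Tp> __buf(__n);
//     if (__buf)                  // allocation failure yields a null pointer, not an exception
//         _Tp* __p = __buf.get(); // placement-new objects into __p[0 .. __n)
//     // the storage is freed automatically when __buf goes out of scope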

// Wrapper for tbb::task
inline void
__cancel_execution()
{
#if TBB_INTERFACE_VERSION <= 12000
    tbb::task::self().group()->cancel_group_execution();
#else
    tbb::task::current_context()->cancel_group_execution();
#endif
}

//------------------------------------------------------------------------
// parallel_for
//------------------------------------------------------------------------

template <class _Index, class _RealBody>
class __parallel_for_body
{
  public:
    __parallel_for_body(const _RealBody& __body) : _M_body(__body) {}
    __parallel_for_body(const __parallel_for_body& __body) : _M_body(__body._M_body) {}
    void
    operator()(const tbb::blocked_range<_Index>& __range) const
    {
        _M_body(__range.begin(), __range.end());
    }

  private:
    _RealBody _M_body;
};

//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
// wrapper over tbb::parallel_for
template <class _ExecutionPolicy, class _Index, class _Fp>
void
__parallel_for(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
{
    tbb::this_task_arena::isolate([=]() {
        tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last), __parallel_for_body<_Index, _Fp>(__f));
    });
}
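
// Note (derivable from __parallel_for_body above): the brick __f is invoked as
// __f(__i, __j) on half-open subranges [__i, __j) of [__first, __last).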

//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
// wrapper over tbb::parallel_reduce
template <class _ExecutionPolicy, class _Value, class _Index, typename _RealBody, typename _Reduction>
_Value
__parallel_reduce(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,
                  const _Value& __identity, const _RealBody& __real_body, const _Reduction& __reduction)
{
    return tbb::this_task_arena::isolate([__first, __last, &__identity, &__real_body, &__reduction]() -> _Value {
        return tbb::parallel_reduce(
            tbb::blocked_range<_Index>(__first, __last), __identity,
            [__real_body](const tbb::blocked_range<_Index>& __r, const _Value& __value) -> _Value {
                return __real_body(__r.begin(), __r.end(), __value);
            },
            __reduction);
    });
}

//------------------------------------------------------------------------
// parallel_transform_reduce
//
// Notation:
//      r(i,j,init) returns reduction of init with reduction over [i,j)
//      u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
//      c(x,y) combines values x and y that were the result of r or u
//------------------------------------------------------------------------
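
// Concrete example (a sketch): for a transform-reduce computing f(x[0]) + ... + f(x[n-1]),
// u(i) = f(x[i]), r(i,j,init) = init + f(x[i]) + ... + f(x[j-1]), and c(x,y) = x + y.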

template <class _Index, class _Up, class _Tp, class _Cp, class _Rp>
struct __par_trans_red_body
{
    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
    _Up _M_u;
    _Cp _M_combine;
    bool _M_has_sum; // Put last to minimize size of class
    _Tp&
    sum()
    {
        __TBB_ASSERT(_M_has_sum, "sum expected");
        return *(_Tp*)_M_sum_storage;
    }
    __par_trans_red_body(_Up __u, _Tp __init, _Cp __c, _Rp __r)
        : _M_brick_reduce(__r), _M_u(__u), _M_combine(__c), _M_has_sum(true)
    {
        new (_M_sum_storage) _Tp(__init);
    }

    __par_trans_red_body(__par_trans_red_body& __left, tbb::split)
        : _M_brick_reduce(__left._M_brick_reduce), _M_u(__left._M_u), _M_combine(__left._M_combine), _M_has_sum(false)
    {
    }

    ~__par_trans_red_body()
    {
        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
        if (_M_has_sum)
            sum().~_Tp();
    }

    void
    join(__par_trans_red_body& __rhs)
    {
        sum() = _M_combine(sum(), __rhs.sum());
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range)
    {
        _Index __i = __range.begin();
        _Index __j = __range.end();
        if (!_M_has_sum)
        {
            __TBB_ASSERT(__range.size() > 1, "there should be at least 2 elements");
            new (&_M_sum_storage)
                _Tp(_M_combine(_M_u(__i), _M_u(__i + 1))); // The condition i+1 < j is provided by the grain size of 3
            _M_has_sum = true;
            std::advance(__i, 2);
            if (__i == __j)
                return;
        }
        sum() = _M_brick_reduce(__i, __j, sum());
    }
};

template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp>
_Tp
__parallel_transform_reduce(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,
                            _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce)
{
    __tbb_backend::__par_trans_red_body<_Index, _Up, _Tp, _Cp, _Rp> __body(__u, __init, __combine, __brick_reduce);
    // The grain size of 3 is used in order to provide a minimum of 2 elements for each body
    tbb::this_task_arena::isolate(
        [__first, __last, &__body]() { tbb::parallel_reduce(tbb::blocked_range<_Index>(__first, __last, 3), __body); });
    return __body.sum();
}
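
// Why a grain size of 3 suffices: a blocked_range splits only while its size exceeds
// the grain size, so every subrange keeps at least 2 elements. A split-constructed
// body arrives without a sum, and its first operator() call consumes two elements
// (u(i) and u(i+1)) to create one, so each body is always given enough input.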

//------------------------------------------------------------------------
// parallel_scan
//------------------------------------------------------------------------

template <class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
class __trans_scan_body
{
    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
    _Up _M_u;
    _Cp _M_combine;
    _Sp _M_scan;
    bool _M_has_sum; // Put last to minimize size of class
  public:
    __trans_scan_body(_Up __u, _Tp __init, _Cp __combine, _Rp __reduce, _Sp __scan)
        : _M_brick_reduce(__reduce), _M_u(__u), _M_combine(__combine), _M_scan(__scan), _M_has_sum(true)
    {
        new (_M_sum_storage) _Tp(__init);
    }

    __trans_scan_body(__trans_scan_body& __b, tbb::split)
        : _M_brick_reduce(__b._M_brick_reduce), _M_u(__b._M_u), _M_combine(__b._M_combine), _M_scan(__b._M_scan),
          _M_has_sum(false)
    {
    }

    ~__trans_scan_body()
    {
        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
        if (_M_has_sum)
            sum().~_Tp();
    }

    _Tp&
    sum() const
    {
        __TBB_ASSERT(_M_has_sum, "sum expected");
        return *const_cast<_Tp*>(reinterpret_cast<_Tp const*>(_M_sum_storage));
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range, tbb::pre_scan_tag)
    {
        _Index __i = __range.begin();
        _Index __j = __range.end();
        if (!_M_has_sum)
        {
            new (&_M_sum_storage) _Tp(_M_u(__i));
            _M_has_sum = true;
            ++__i;
            if (__i == __j)
                return;
        }
        sum() = _M_brick_reduce(__i, __j, sum());
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range, tbb::final_scan_tag)
    {
        sum() = _M_scan(__range.begin(), __range.end(), sum());
    }

    void
    reverse_join(__trans_scan_body& __a)
    {
        if (_M_has_sum)
        {
            sum() = _M_combine(__a.sum(), sum());
        }
        else
        {
            new (&_M_sum_storage) _Tp(__a.sum());
            _M_has_sum = true;
        }
    }

    void
    assign(__trans_scan_body& __b)
    {
        sum() = __b.sum();
    }
};

template <typename _Index>
_Index
__split(_Index __m)
{
    _Index __k = 1;
    while (2 * __k < __m)
        __k *= 2;
    return __k;
}
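
// For __m >= 2, __split(__m) is the largest power of two strictly less than __m
// (e.g. __split(10) == 8); upsweep/downsweep below use it to cut a range of __m
// tiles into a power-of-two left part and a remainder.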

//------------------------------------------------------------------------
// __parallel_strict_scan
//------------------------------------------------------------------------

template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
void
__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
{
    if (__m == 1)
        __r[0] = __reduce(__i * __tilesize, __lastsize);
    else
    {
        _Index __k = __split(__m);
        tbb::parallel_invoke(
            [=] { __tbb_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
            [=] {
                __tbb_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
            });
        if (__m == 2 * __k)
            __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]);
    }
}

template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
void
__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
            _Sp __scan)
{
    if (__m == 1)
        __scan(__i * __tilesize, __lastsize, __initial);
    else
    {
        const _Index __k = __split(__m);
        tbb::parallel_invoke(
            [=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
            // Assumes that __combine never throws.
            //TODO: Consider adding a requirement for user functors to be constant.
            [=, &__combine] {
                __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
                                           __combine(__initial, __r[__k - 1]), __combine, __scan);
            });
    }
}

342 
343 // Adapted from Intel(R) Cilk(TM) version from cilkpub.
344 // Let i:len denote a counted interval of length n starting at i.  s denotes a generalized-sum value.
345 // Expected actions of the functors are:
346 //     reduce(i,len) -> s  -- return reduction value of i:len.
347 //     combine(s1,s2) -> s -- return merged sum
348 //     apex(s) -- do any processing necessary between reduce and scan.
349 //     scan(i,len,initial) -- perform scan over i:len starting with initial.
350 // The initial range 0:n is partitioned into consecutive subranges.
351 // reduce and scan are each called exactly once per subrange.
352 // Thus callers can rely upon side effects in reduce.
353 // combine must not throw an exception.
354 // apex is called exactly once, after all calls to reduce and before all calls to scan.
355 // For example, it's useful for allocating a __buffer used by scan but whose size is the sum of all reduction values.
356 // T must have a trivial constructor and destructor.
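// Example instantiation (a sketch, not taken from a caller of this function): for an
// exclusive scan with operator+, reduce(i,len) returns the sum of the len input
// elements starting at i, combine is +, apex may ignore its argument, and
// scan(i,len,initial) writes the running sums initial, initial + x[i], ... into the
// output tile.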
template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp, typename _Ap>
void
__parallel_strict_scan(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __n, _Tp __initial,
                       _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex)
{
    tbb::this_task_arena::isolate([=, &__combine]() {
        if (__n > 1)
        {
            _Index __p = tbb::this_task_arena::max_concurrency();
            const _Index __slack = 4;
            _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
            _Index __m = (__n - 1) / __tilesize;
            __buffer<_Tp> __buf(__m + 1);
            _Tp* __r = __buf.get();
            __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce,
                                     __combine);

            // When __apex is a no-op and __combine has no side effects, a good optimizer
            // should be able to eliminate all code between here and __apex.
            // Alternatively, provide a default value for __apex that can be
            // recognized by metaprogramming that conditionally executes the following.
            size_t __k = __m + 1;
            _Tp __t = __r[__k - 1];
            while ((__k &= __k - 1))
                __t = __combine(__r[__k - 1], __t);
            __apex(__combine(__initial, __t));
            __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial,
                                       __combine, __scan);
            return;
        }
        // Fewer than 2 elements in sequence, or out of memory.  Handle as a single block.
        _Tp __sum = __initial;
        if (__n)
            __sum = __combine(__sum, __reduce(_Index(0), __n));
        __apex(__sum);
        if (__n)
            __scan(_Index(0), __n, __initial);
    });
}
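
// Tiling note (as computed above): with __p workers and __slack = 4 the input is
// cut into __m + 1 tiles of __tilesize elements each (the last tile takes the
// remainder), i.e. roughly 4 tiles per worker, leaving slack for load balancing.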

template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
_Tp
__parallel_transform_scan(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init,
                          _Cp __combine, _Rp __brick_reduce, _Sp __scan)
{
    __trans_scan_body<_Index, _Up, _Tp, _Cp, _Rp, _Sp> __body(__u, __init, __combine, __brick_reduce, __scan);
    auto __range = tbb::blocked_range<_Index>(0, __n);
    tbb::this_task_arena::isolate([__range, &__body]() { tbb::parallel_scan(__range, __body); });
    return __body.sum();
}

//------------------------------------------------------------------------
// parallel_stable_sort
//------------------------------------------------------------------------

//------------------------------------------------------------------------
// stable_sort utilities
//
// These are used by parallel implementations but do not depend on them.
//------------------------------------------------------------------------
#define _PSTL_MERGE_CUT_OFF 2000

template <typename _Func>
class __func_task;
template <typename _Func>
class __root_task;

#if TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::task
{
  public:
    template <typename _Fn>
    __task*
    make_continuation(_Fn&& __f)
    {
        return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    template <typename _Fn>
    __task*
    make_child_of(__task* parent, _Fn&& __f)
    {
        return new (parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    template <typename _Fn>
    __task*
    make_additional_child_of(tbb::task* parent, _Fn&& __f)
    {
        return new (tbb::task::allocate_additional_child_of(*parent))
            __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    inline void
    recycle_as_continuation()
    {
        tbb::task::recycle_as_continuation();
    }

    inline void
    recycle_as_child_of(__task* parent)
    {
        tbb::task::recycle_as_child_of(*parent);
    }

    inline void
    spawn(__task* __t)
    {
        tbb::task::spawn(*__t);
    }

    template <typename _Fn>
    static inline void
    spawn_root_and_wait(__root_task<_Fn>& __root)
    {
        tbb::task::spawn_root_and_wait(*__root._M_task);
    }
};

template <typename _Func>
class __func_task : public __task
{
    _Func _M_func;

    tbb::task*
    execute()
    {
        return _M_func(this);
    };

  public:
    template <typename _Fn>
    __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
    {
    }

    _Func&
    body()
    {
        return _M_func;
    }
};

template <typename _Func>
class __root_task
{
    tbb::task* _M_task;

  public:
    template <typename... Args>
    __root_task(Args&&... args)
        : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
    {
    }

    friend class __task;
    friend class __func_task<_Func>;
};

#else  // TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::detail::d1::task
{
  protected:
    tbb::detail::d1::small_object_allocator _M_allocator{};
    tbb::detail::d1::execution_data* _M_execute_data{};
    __task* _M_parent{};
    std::atomic<int> _M_refcount{};
    bool _M_recycle{};

    template <typename _Fn>
    __task*
    allocate_func_task(_Fn&& __f)
    {
        _LIBCPP_ASSERT_UNCATEGORIZED(_M_execute_data != nullptr, "");
        tbb::detail::d1::small_object_allocator __alloc{};
        auto __t =
            __alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
        __t->_M_allocator = __alloc;
        return __t;
    }

  public:
    __task*
    parent()
    {
        return _M_parent;
    }

    void
    set_ref_count(int __n)
    {
        _M_refcount.store(__n, std::memory_order_release);
    }

    template <typename _Fn>
    __task*
    make_continuation(_Fn&& __f)
    {
        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
        __t->_M_parent = _M_parent;
        _M_parent = nullptr;
        return __t;
    }

    template <typename _Fn>
    __task*
    make_child_of(__task* __parent, _Fn&& __f)
    {
        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
        __t->_M_parent = __parent;
        return __t;
    }

    template <typename _Fn>
    __task*
    make_additional_child_of(__task* __parent, _Fn&& __f)
    {
        auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
        _LIBCPP_ASSERT_UNCATEGORIZED(__parent->_M_refcount.load(std::memory_order_relaxed) > 0, "");
        ++__parent->_M_refcount;
        return __t;
    }

    inline void
    recycle_as_continuation()
    {
        _M_recycle = true;
    }

    inline void
    recycle_as_child_of(__task* parent)
    {
        _M_recycle = true;
        _M_parent = parent;
    }

    inline void
    spawn(__task* __t)
    {
        _LIBCPP_ASSERT_UNCATEGORIZED(_M_execute_data != nullptr, "");
        tbb::detail::d1::spawn(*__t, *_M_execute_data->context);
    }

    template <typename _Fn>
    static inline void
    spawn_root_and_wait(__root_task<_Fn>& __root)
    {
        tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object,
                                          __root._M_context);
    }

    template <typename _Func>
    friend class __func_task;
};

template <typename _Func>
class __func_task : public __task
{
    _Func _M_func;

    __task*
    execute(tbb::detail::d1::execution_data& __ed) override
    {
        _M_execute_data = &__ed;
        _M_recycle = false;
        __task* __next = _M_func(this);
        return finalize(__next);
    };

    __task*
    cancel(tbb::detail::d1::execution_data& __ed) override
    {
        return finalize(nullptr);
    }

    __task*
    finalize(__task* __next)
    {
        bool __recycle = _M_recycle;
        _M_recycle = false;

        if (__recycle)
        {
            return __next;
        }

        auto __parent = _M_parent;
        auto __alloc = _M_allocator;
        auto __ed = _M_execute_data;

        this->~__func_task();

        _LIBCPP_ASSERT_UNCATEGORIZED(__parent != nullptr, "");
        _LIBCPP_ASSERT_UNCATEGORIZED(__parent->_M_refcount.load(std::memory_order_relaxed) > 0, "");
        if (--__parent->_M_refcount == 0)
        {
            _LIBCPP_ASSERT_UNCATEGORIZED(__next == nullptr, "");
            __alloc.deallocate(this, *__ed);
            return __parent;
        }

        return __next;
    }

    friend class __root_task<_Func>;

  public:
    template <typename _Fn>
    __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
    {
    }

    _Func&
    body()
    {
        return _M_func;
    }
};

template <typename _Func>
class __root_task : public __task
{
    __task*
    execute(tbb::detail::d1::execution_data& __ed) override
    {
        _M_wait_object.release();
        return nullptr;
    };

    __task*
    cancel(tbb::detail::d1::execution_data& __ed) override
    {
        _M_wait_object.release();
        return nullptr;
    }

    __func_task<_Func>* _M_func_task{};
    tbb::detail::d1::wait_context _M_wait_object{0};
    tbb::task_group_context _M_context{};

  public:
    template <typename... Args>
    __root_task(Args&&... args) : _M_wait_object{1}
    {
        tbb::detail::d1::small_object_allocator __alloc{};
        _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
        _M_func_task->_M_allocator = __alloc;
        _M_func_task->_M_parent = this;
        _M_refcount.store(1, std::memory_order_relaxed);
    }

    friend class __task;
};
#endif // TBB_INTERFACE_VERSION <= 12000
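
// Both branches above expose the same minimal task facade (make_continuation,
// make_child_of, make_additional_child_of, set_ref_count, recycle_as_continuation,
// recycle_as_child_of, spawn, spawn_root_and_wait; set_ref_count is inherited from
// tbb::task in the old branch and is an explicit atomic in the new one), so the
// sort and merge functors below are independent of the TBB interface version.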

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
          typename _LeafMerge>
class __merge_func
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;

    _RandomAccessIterator1 _M_x_beg;
    _RandomAccessIterator2 _M_z_beg;

    _SizeType _M_xs, _M_xe;
    _SizeType _M_ys, _M_ye;
    _SizeType _M_zs;
    _Compare _M_comp;
    _LeafMerge _M_leaf_merge;
    _SizeType _M_nsort; // number of elements to be sorted for the partial_sort algorithm

    static const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;

    bool _root;   // true if the task is the root merging task
    bool _x_orig; // true if the X (left)  subrange is in the original container; false if it is in the buffer
    bool _y_orig; // true if the Y (right) subrange is in the original container; false if it is in the buffer
    bool _split;  // true if the merge task is a split task for parallel merging; the execution logic differs

    bool
    is_partial() const
    {
        return _M_nsort > 0;
    }

    struct __move_value
    {
        template <typename Iterator1, typename Iterator2>
        void
        operator()(Iterator1 __x, Iterator2 __z)
        {
            *__z = std::move(*__x);
        }
    };

    struct __move_value_construct
    {
        template <typename Iterator1, typename Iterator2>
        void
        operator()(Iterator1 __x, Iterator2 __z)
        {
            ::new (std::addressof(*__z)) _ValueType(std::move(*__x));
        }
    };

    struct __move_range
    {
        template <typename Iterator1, typename Iterator2>
        Iterator2
        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
        {
            if (__last1 - __first1 < __merge_cut_off)
                return std::move(__first1, __last1, __first2);

            auto __n = __last1 - __first1;
            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
                                  std::move(__first1 + __range.begin(), __first1 + __range.end(),
                                            __first2 + __range.begin());
                              });
            return __first2 + __n;
        }
    };

    struct __move_range_construct
    {
        template <typename Iterator1, typename Iterator2>
        Iterator2
        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
        {
            if (__last1 - __first1 < __merge_cut_off)
            {
                for (; __first1 != __last1; ++__first1, ++__first2)
                    __move_value_construct()(__first1, __first2);
                return __first2;
            }

            auto __n = __last1 - __first1;
            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
                                  for (auto i = __range.begin(); i != __range.end(); ++i)
                                      __move_value_construct()(__first1 + i, __first2 + i);
                              });
            return __first2 + __n;
        }
    };

    struct __cleanup_range
    {
        template <typename Iterator>
        void
        operator()(Iterator __first, Iterator __last)
        {
            if (__last - __first < __merge_cut_off)
                _Cleanup()(__first, __last);
            else
            {
                auto __n = __last - __first;
                tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                                  [__first](const tbb::blocked_range<_SizeType>& __range) {
                                      _Cleanup()(__first + __range.begin(), __first + __range.end());
                                  });
            }
        }
    };

  public:
    __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
                 _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
                 _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
        : _M_x_beg(__x_beg), _M_z_beg(__z_beg), _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs),
          _M_comp(__comp), _M_leaf_merge(__leaf_merge), _M_nsort(__nsort), _root(__root),
          _x_orig(__x_orig), _y_orig(__y_orig), _split(false)
    {
    }

    bool
    is_left(_SizeType __idx) const
    {
        return _M_xs == __idx;
    }

    template <typename IndexType>
    void
    set_odd(IndexType __idx, bool __on_off)
    {
        if (is_left(__idx))
            _x_orig = __on_off;
        else
            _y_orig = __on_off;
    }

    __task*
    operator()(__task* __self);

  private:
    __merge_func*
    parent_merge(__task* __self) const
    {
        return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
    }
    bool
    x_less_y()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        _LIBCPP_ASSERT_UNCATEGORIZED(__nx > 0 && __ny > 0, "");

        _LIBCPP_ASSERT_UNCATEGORIZED(_x_orig == _y_orig, "");
        _LIBCPP_ASSERT_UNCATEGORIZED(!is_partial(), "");

        if (_x_orig)
        {
            _LIBCPP_ASSERT_UNCATEGORIZED(std::is_sorted(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_comp), "");
            _LIBCPP_ASSERT_UNCATEGORIZED(std::is_sorted(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_comp), "");
            return !_M_comp(*(_M_x_beg + _M_ys), *(_M_x_beg + _M_xe - 1));
        }

        _LIBCPP_ASSERT_UNCATEGORIZED(std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp), "");
        _LIBCPP_ASSERT_UNCATEGORIZED(std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp), "");
        return !_M_comp(*(_M_z_beg + _M_zs + __nx), *(_M_z_beg + _M_zs + __nx - 1));
    }
    void
    move_x_range()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        _LIBCPP_ASSERT_UNCATEGORIZED(__nx > 0 && __ny > 0, "");

        if (_x_orig)
            __move_range_construct()(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_z_beg + _M_zs);
        else
        {
            __move_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx, _M_x_beg + _M_xs);
            __cleanup_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx);
        }

        _x_orig = !_x_orig;
    }
    void
    move_y_range()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);

        if (_y_orig)
            __move_range_construct()(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs + __nx);
        else
        {
            __move_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny, _M_x_beg + _M_ys);
            __cleanup_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny);
        }

        _y_orig = !_y_orig;
    }
    __task*
    merge_ranges(__task* __self)
    {
        _LIBCPP_ASSERT_UNCATEGORIZED(_x_orig == _y_orig, ""); // the two merged subranges must lie in the same buffer

        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        const auto __n = __nx + __ny;

        // need to merge {x} and {y}
        if (__n > __merge_cut_off)
            return split_merging(__self);

        //merge to the buffer
        if (_x_orig)
        {
            _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
                          _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
                          __move_range_construct());
            _LIBCPP_ASSERT_UNCATEGORIZED(parent_merge(__self), ""); //not the root merging task
        }
        //merge to the "origin"
        else
        {
            _LIBCPP_ASSERT_UNCATEGORIZED(_x_orig == _y_orig, "");

            _LIBCPP_ASSERT_UNCATEGORIZED(
                is_partial() || std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp), "");
            _LIBCPP_ASSERT_UNCATEGORIZED(
                is_partial() || std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp), "");

            const auto __nx = (_M_xe - _M_xs);
            const auto __ny = (_M_ye - _M_ys);

            _M_leaf_merge(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_x_beg + _M_zs,
                          _M_comp, __move_value(), __move_value(), __move_range(), __move_range());

            __cleanup_range()(_M_z_beg + _M_xs, _M_z_beg + _M_xe);
            __cleanup_range()(_M_z_beg + _M_ys, _M_z_beg + _M_ye);
        }
        return nullptr;
    }

    __task*
    process_ranges(__task* __self)
    {
        _LIBCPP_ASSERT_UNCATEGORIZED(_x_orig == _y_orig, "");
        _LIBCPP_ASSERT_UNCATEGORIZED(!_split, "");

        auto p = parent_merge(__self);

        if (!p)
        { //root merging task

            //optimization, just for the sort algorithm: {x} <= {y}
            if (!is_partial() && x_less_y()) //we have a solution
            {
                if (!_x_orig)
                {                   //we have to move the solution to the origin
                    move_x_range(); //parallel moving
                    move_y_range(); //parallel moving
                }
                return nullptr;
            }
            //else: if we have data in the origin,
            //we have to move data to the buffer for final merging into the origin.
            if (_x_orig)
            {
                move_x_range(); //parallel moving
                move_y_range(); //parallel moving
            }
            // need to merge {x} and {y}.
            return merge_ranges(__self);
        }
        //else: not the root merging task (parent_merge() != NULL)
        //optimization, just for the sort algorithm: {x} <= {y}
        if (!is_partial() && x_less_y())
        {
            const auto id_range = _M_zs;
            p->set_odd(id_range, _x_orig);
            return nullptr;
        }
        //else: we have to revert the "_x(y)_orig" flag of the parent merging task
        const auto id_range = _M_zs;
        p->set_odd(id_range, !_x_orig);

        return merge_ranges(__self);
    }

    //splitting a merge task into 2 of the same level
    __task*
    split_merging(__task* __self)
    {
        _LIBCPP_ASSERT_UNCATEGORIZED(_x_orig == _y_orig, "");
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);

        _SizeType __xm{};
        _SizeType __ym{};
        if (__nx < __ny)
        {
            __ym = _M_ys + __ny / 2;

            if (_x_orig)
                __xm = std::upper_bound(_M_x_beg + _M_xs, _M_x_beg + _M_xe, *(_M_x_beg + __ym), _M_comp) - _M_x_beg;
            else
                __xm = std::upper_bound(_M_z_beg + _M_xs, _M_z_beg + _M_xe, *(_M_z_beg + __ym), _M_comp) - _M_z_beg;
        }
        else
        {
            __xm = _M_xs + __nx / 2;

            if (_y_orig)
                __ym = std::lower_bound(_M_x_beg + _M_ys, _M_x_beg + _M_ye, *(_M_x_beg + __xm), _M_comp) - _M_x_beg;
            else
                __ym = std::lower_bound(_M_z_beg + _M_ys, _M_z_beg + _M_ye, *(_M_z_beg + __xm), _M_comp) - _M_z_beg;
        }

        auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
        __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
                                  _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
        __right_func._split = true;
        auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
        __self->spawn(__merge_task);
        __self->recycle_as_continuation();

        _M_xe = __xm;
        _M_ye = __ym;
        _split = true;

        return __self;
    }
};

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
          typename _LeafMerge>
__task*
__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
operator()(__task* __self)
{
    //a. split the merge task into 2 of the same level; special logic,
    //without processing (process_ranges) the adjacent sub-ranges x and y
    if (_split)
        return merge_ranges(__self);

    //b. general merging of the adjacent sub-ranges x and y (with an optimization in case of {x} <= {y})

    //1. x and y are in the even buffer
    //2. x and y are in the odd buffer
    if (_x_orig == _y_orig)
        return process_ranges(__self);

    //3. x is in the even buffer, y is in the odd buffer
    //4. x is in the odd buffer, y is in the even buffer
    if (!parent_merge(__self))
    { //root merge task
        if (_x_orig)
            move_x_range();
        else
            move_y_range();
    }
    else
    {
        const _SizeType __nx = (_M_xe - _M_xs);
        const _SizeType __ny = (_M_ye - _M_ys);
        _LIBCPP_ASSERT_UNCATEGORIZED(__nx > 0, "");
        _LIBCPP_ASSERT_UNCATEGORIZED(__ny > 0, "");

        if (__nx < __ny)
            move_x_range();
        else
            move_y_range();
    }

    return process_ranges(__self);
}
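
// Buffer discipline (summarizing the _x_orig/_y_orig flags): each subrange lives
// either in the original sequence (flag true) or in the scratch buffer (flag false).
// A merge writes to the opposite side of where its inputs live, so ranges ping-pong
// between origin and buffer, and the root task must leave the result in the origin.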

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
class __stable_sort_func
{
  public:
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;

  private:
    _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
    _RandomAccessIterator2 _M_zs, _M_z_beg;
    _Compare _M_comp;
    _LeafSort _M_leaf_sort;
    bool _M_root;
    _SizeType _M_nsort; // zero, or the number of elements to be sorted for the partial_sort algorithm

  public:
    __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
                       bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
                       _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
        : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
          _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
    {
    }

    __task*
    operator()(__task* __self);
};

#define _PSTL_STABLE_SORT_CUT_OFF 500

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
__task*
__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
{
    typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
                         __utils::__serial_move_merge>
        _MergeTaskType;

    const _SizeType __n = _M_xe - _M_xs;
    const _SizeType __nmerge = _M_nsort > 0 ? _M_nsort : __n;
    const _SizeType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
    if (__n <= __sort_cut_off)
    {
        _M_leaf_sort(_M_xs, _M_xe, _M_comp);
        _LIBCPP_ASSERT_UNCATEGORIZED(!_M_root, "");
        return nullptr;
    }

    const _RandomAccessIterator1 __xm = _M_xs + __n / 2;
    const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
    const _RandomAccessIterator2 __ze = _M_zs + __n;
    _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
                                      _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(),
                                      __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
                                      /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root));
    auto __parent = __self->make_continuation(std::move(__m));
    __parent->set_ref_count(2);
    auto __right = __self->make_child_of(
        __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
    __self->spawn(__right);
    __self->recycle_as_child_of(__parent);
    _M_root = false;
    _M_xe = __xm;

    return __self;
}
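
// Task shape (as built above): each sort task creates a continuation that merges the
// two halves, spawns a new child task for the right half, and recycles itself as the
// child handling the left half, so the recursion unfolds without blocking a worker
// thread at each level.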

template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
void
__parallel_stable_sort(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator __xs,
                       _RandomAccessIterator __xe, _Compare __comp, _LeafSort __leaf_sort, std::size_t __nsort = 0)
{
    tbb::this_task_arena::isolate([=, &__nsort]() {
        //sorting based on a task tree and parallel merge
        typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _ValueType;
        typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType;
        const _DifferenceType __n = __xe - __xs;
        if (__nsort == __n)
            __nsort = 0; // 'partial_sort' becomes 'sort'

        const _DifferenceType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
        if (__n > __sort_cut_off)
        {
            __buffer<_ValueType> __buf(__n);
            __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
                __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
            __task::spawn_root_and_wait(__root);
            return;
        }
        //serial sort
        __leaf_sort(__xs, __xe, __comp);
    });
}

//------------------------------------------------------------------------
// parallel_merge
//------------------------------------------------------------------------
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
          typename _Compare, typename _LeafMerge>
class __merge_func_static
{
    _RandomAccessIterator1 _M_xs, _M_xe;
    _RandomAccessIterator2 _M_ys, _M_ye;
    _RandomAccessIterator3 _M_zs;
    _Compare _M_comp;
    _LeafMerge _M_leaf_merge;

  public:
    __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
                        _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
                        _LeafMerge __leaf_merge)
        : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
    {
    }

    __task*
    operator()(__task* __self);
};

//TODO: consider usage of parallel_for with a custom blocked_range
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
          typename __M_Compare, typename _LeafMerge>
__task*
__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
operator()(__task* __self)
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    const _SizeType __n = (_M_xe - _M_xs) + (_M_ye - _M_ys);
    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
    if (__n <= __merge_cut_off)
    {
        _M_leaf_merge(_M_xs, _M_xe, _M_ys, _M_ye, _M_zs, _M_comp);
        return nullptr;
    }

    _RandomAccessIterator1 __xm;
    _RandomAccessIterator2 __ym;
    if (_M_xe - _M_xs < _M_ye - _M_ys)
    {
        __ym = _M_ys + (_M_ye - _M_ys) / 2;
        __xm = std::upper_bound(_M_xs, _M_xe, *__ym, _M_comp);
    }
    else
    {
        __xm = _M_xs + (_M_xe - _M_xs) / 2;
        __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
    }
    const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
    auto __right = __self->make_additional_child_of(
        __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
    __self->spawn(__right);
    __self->recycle_as_continuation();
    _M_xe = __xm;
    _M_ye = __ym;

    return __self;
}
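
// Splitting note: the midpoint of the longer input is taken and the matching position
// in the other input is found by binary search; pairing upper_bound on x with
// lower_bound on y keeps equal elements from x ahead of equal elements from y, which
// preserves the stability of the merge.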

template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
          typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
void
__parallel_merge(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __xs,
                 _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye,
                 _RandomAccessIterator3 __zs, _Compare __comp, _LeafMerge __leaf_merge)
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    const _SizeType __n = (__xe - __xs) + (__ye - __ys);
    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
    if (__n <= __merge_cut_off)
    {
        // Fall back on serial merge
        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
    }
    else
    {
        tbb::this_task_arena::isolate([=]() {
            typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
                                        _Compare, _LeafMerge>
                _TaskType;
            __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
            __task::spawn_root_and_wait(__root);
        });
    }
}

//------------------------------------------------------------------------
// parallel_invoke
//------------------------------------------------------------------------
template <class _ExecutionPolicy, typename _F1, typename _F2>
void
__parallel_invoke(__pstl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
{
    //TODO: a version of tbb::this_task_arena::isolate with a variadic argument pack should be added in the future
    tbb::this_task_arena::isolate([&]() { tbb::parallel_invoke(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); });
}

} // namespace __tbb_backend
} // namespace __pstl

#endif /* _PSTL_PARALLEL_BACKEND_TBB_H */