xref: /freebsd/contrib/llvm-project/libcxx/include/__pstl/backends/libdispatch.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
10 #define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
11 
12 #include <__algorithm/inplace_merge.h>
13 #include <__algorithm/lower_bound.h>
14 #include <__algorithm/max.h>
15 #include <__algorithm/merge.h>
16 #include <__algorithm/upper_bound.h>
17 #include <__atomic/atomic.h>
18 #include <__config>
19 #include <__cstddef/ptrdiff_t.h>
20 #include <__exception/terminate.h>
21 #include <__iterator/iterator_traits.h>
22 #include <__iterator/move_iterator.h>
23 #include <__memory/allocator.h>
24 #include <__memory/construct_at.h>
25 #include <__memory/destroy.h>
26 #include <__memory/unique_ptr.h>
27 #include <__new/exceptions.h>
28 #include <__numeric/reduce.h>
29 #include <__pstl/backend_fwd.h>
30 #include <__pstl/cpu_algos/any_of.h>
31 #include <__pstl/cpu_algos/cpu_traits.h>
32 #include <__pstl/cpu_algos/fill.h>
33 #include <__pstl/cpu_algos/find_if.h>
34 #include <__pstl/cpu_algos/for_each.h>
35 #include <__pstl/cpu_algos/merge.h>
36 #include <__pstl/cpu_algos/stable_sort.h>
37 #include <__pstl/cpu_algos/transform.h>
38 #include <__pstl/cpu_algos/transform_reduce.h>
39 #include <__utility/empty.h>
40 #include <__utility/exception_guard.h>
41 #include <__utility/move.h>
42 #include <__utility/pair.h>
43 #include <optional>
44 
45 _LIBCPP_PUSH_MACROS
46 #include <__undef_macros>
47 
48 #if _LIBCPP_STD_VER >= 17
49 
50 _LIBCPP_BEGIN_NAMESPACE_STD
51 namespace __pstl {
52 
53 namespace __libdispatch {
54 // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
55 // we.
56 // TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
57 _LIBCPP_EXPORTED_FROM_ABI void
58 __dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;
59 
60 template <class _Func>
__dispatch_apply(size_t __chunk_count,_Func __func)61 _LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
62   __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
63     (*static_cast<_Func*>(__context))(__chunk);
64   });
65 }
66 
// Describes how a range is split into chunks for parallel execution: a first
// chunk of __first_chunk_size_ elements followed by (__chunk_count_ - 1)
// chunks of __chunk_size_ elements each (see __dispatch_parallel_for's index
// computation, which assumes exactly this layout).
struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;  // size of every chunk except possibly the first
  ptrdiff_t __first_chunk_size_;
};
72 
73 [[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
74 
75 template <class _RandomAccessIterator, class _Functor>
76 _LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions,_RandomAccessIterator __first,_Functor __func)77 __dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
78   // Perform the chunked execution.
79   __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
80     auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
81     auto __index =
82         __chunk == 0
83             ? 0
84             : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
85     __func(__first + __index, __first + __index + __this_chunk_size);
86   });
87 
88   return __empty{};
89 }
90 } // namespace __libdispatch
91 
// CPU-backend traits implemented on top of Grand Central Dispatch: ranges are
// split into chunks with __partition_chunks and the chunks are executed through
// the exported __dispatch_apply wrapper around ::dispatch_apply.
template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  // Apply __func to each chunk of [__first, __last) in parallel.
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  // One boundary of a parallel merge sub-problem: consecutive __merge_range
  // entries delimit the two input sub-ranges ([__mid1_, next.__mid1_) and
  // [__mid2_, next.__mid2_)) that get merged into the output at __result_.
  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };

  // Parallel merge: the larger of the two inputs is cut into chunks, and for
  // each cut point the matching split of the smaller input is found with
  // std::lower_bound, producing independent sub-merges that run in parallel
  // via __leaf_merge. Returns nullopt if scratch allocation fails (callers
  // fall back to the serial algorithm).
  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    // Partition based on the larger input; the smaller one follows its cuts.
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    // A single chunk isn't worth dispatching: merge serially.
    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    // N chunks need N + 1 boundaries (fence-post).
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    // Allocate the boundary table; translate bad_alloc into nullptr so this
    // noexcept function can report failure via nullopt instead of terminating.
    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#  if _LIBCPP_HAS_EXCEPTIONS
          try {
#  endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#  if _LIBCPP_HAS_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#  endif
        }(),
        __destroy);

    if (!__ranges)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
    __merge_range_t* __r = __ranges.get();
    // First boundary: the very beginning of both inputs and of the output.
    std::__construct_at(__r++, __first1, __first2, __result);

    // Walk chunks along the larger input and binary-search the matching cut in
    // the smaller one.
    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    // Consume the next __chunk_size elements of the iterated range, find the
    // corresponding split point in the other range, advance __result past both
    // consumed sub-ranges, and return the new boundary.
    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          // lower_bound on the last element of the chunk keeps equal elements
          // from the second range out of this chunk's sub-merge.
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    // Run the independent sub-merges in parallel; chunk __index merges between
    // boundaries __index and __index + 1.
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters  = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }

  // Parallel transform-reduce: each chunk is reduced independently into a
  // scratch buffer, then the per-chunk results are combined serially with
  // std::reduce, folding in __init at the end.
  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      // Chunk 0 starts at 0; chunk k starts at first_chunk_size + (k-1)*chunk_size,
      // written here as k*chunk_size + (first_chunk_size - chunk_size).
      auto __index           = __chunk == 0 ? 0
                                            : (__chunk * __partitions.__chunk_size_) +
                                        (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        // Seed the reduction with the combination of the chunk's first two
        // transformed elements — there is no identity value for an arbitrary
        // __combiner, so we can't start from an empty accumulator.
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        // Single-element chunk: its reduction is just the transformed element.
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

    // Serially combine the per-chunk partial results with the initial value.
    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  // Parallel stable sort: sort each chunk in parallel with __leaf_sort, then
  // repeatedly merge pairs of adjacent chunks in parallel, ping-ponging the
  // elements between the user's range and a scratch buffer until one chunk
  // remains.
  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    // One chunk: sort serially, no scratch buffer needed.
    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // (each buffer slot is constructed from the previous, already moved-from
    // slot, and the first element's value is restored at the end of the chain,
    // so the user's range is unchanged and the whole buffer is constructed).
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);

    // Sort every chunk independently in parallel.
    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

    // Merge rounds: each round halves the chunk count, alternating the data
    // between the user's range and the scratch buffer.
    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        // Odd chunk count: fold the second chunk into the first with an
        // in-place merge so the remaining chunks pair up evenly.
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      // After pairing, chunks are twice as large and half as many.
      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      // Merge each pair of adjacent old chunks from one storage into the
      // other; __old_chunk_size locates the midpoint inside each new chunk.
      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    // If the final merge round landed in the scratch buffer, move the sorted
    // elements back into the user's range.
    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  // No-op for this backend: work items are not individually cancellable here.
  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  // Granularity hint consumed by the generic CPU algorithms (see
  // __cpu_traits usage in __pstl/cpu_algos) — presumably sized to a cache
  // line; confirm against the other backends before changing.
  static constexpr size_t __lane_size = 64;
};
352 
// Mandatory implementations of the computational basis.
//
// Each operation is provided by inheriting the generic CPU implementation
// from __pstl/cpu_algos, instantiated with this backend's tag so it picks up
// the __cpu_traits specialization defined above.
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};
385 
// Not mandatory, but better optimized than the default derivation from the
// computational basis: these also reuse the generic CPU implementations.
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};
394 
395 } // namespace __pstl
396 _LIBCPP_END_NAMESPACE_STD
397 
398 #endif // _LIBCPP_STD_VER >= 17
399 
400 _LIBCPP_POP_MACROS
401 
402 #endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
403