//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <cstddef>
#include <new>
#include <optional>

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

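// Type-erasing wrapper: the lambda below is captureless, so it converts to the plain C function
// pointer expected by the exported __dispatch_apply, while the actual functor is smuggled through
// the __context pointer and invoked inside the trampoline.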
template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

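// __partition_chunks(__size) splits [0, __size) into __chunk_count_ contiguous chunks: the first
// chunk holds __first_chunk_size_ elements and every subsequent chunk holds __chunk_size_ elements,
// so __first_chunk_size_ + (__chunk_count_ - 1) * __chunk_size_ == __size. For illustration only
// (the exact chunking policy is an implementation detail): 10 elements might be split as {4, 3, 3}.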
[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;

template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
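  // Chunk 0 covers [__first, __first + __first_chunk_size_). Chunk __i > 0 starts at
  // __first_chunk_size_ + (__i - 1) * __chunk_size_, rewritten below as
  // __i * __chunk_size_ + (__first_chunk_size_ - __chunk_size_), and spans __chunk_size_ elements.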
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
} // namespace __libdispatch

template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
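    // Parallel merge strategy: split the larger input into chunks, binary-search the matching split
    // point in the smaller input, and hand each resulting pair of sub-ranges to an independent
    // __leaf_merge that writes into a disjoint slice of the output.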
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
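    // __chunk_count_ sub-merges are delimited by __chunk_count_ + 1 boundary records (fenceposts).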
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          try {
#endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#endif
        }(),
        __destroy);

    if (!__ranges)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger one
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

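    // Computes the end of the next chunk: advance by __chunk_size through the longer range, then
    // binary-search the matching split point in the shorter range. Each record stores the chunk's
    // end iterators and the position just past its slice of the output.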
    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index = __chunk == 0 ? 0
                                  : (__chunk * __partitions.__chunk_size_) +
                                        (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
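      // Seed the chunk's accumulator by combining the first two transformed elements, then reduce
      // the remaining ones; a single-element chunk is just the transformed element.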
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
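    // The value of *__first travels down the chain of moves and is restored at the end, so the
    // input range is left intact while every buffer element gets constructed.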
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

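    // Bottom-up merge: repeatedly merge pairs of adjacent sorted chunks, ping-ponging between the
    // input range and the scratch buffer. When the chunk count is odd, the first two chunks are
    // first merged in place so the remaining chunks pair up evenly.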
    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

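      // Each doubled chunk [__chunk_first, __chunk_last) is split at __chunk_last - __old_chunk_size:
      // the left part is already sorted from the previous pass and the right part is one old chunk.
      // They are merged into the other storage at the same offset, preserving chunk positions.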
      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  static constexpr size_t __lane_size = 64;
};

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H