//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <cstddef>
#include <new>
#include <optional>

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;

template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
} // namespace __libdispatch
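
// Illustrative example of the chunk layout used by __libdispatch::__dispatch_parallel_for above (the concrete
// values produced by __partition_chunks are an implementation detail; the numbers here are made up): partitioning
// a 1000-element range into __chunk_count_ == 11, __chunk_size_ == 96 and __first_chunk_size_ == 40 gives chunk 0
// the interval [0, 40), and chunk k > 0 the interval starting at k * 96 + (40 - 96), so chunk 1 covers [40, 136)
// and chunk 10 covers [904, 1000); the chunks tile the range exactly once.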

template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };
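
  // The parallel merge below cuts the larger of the two input ranges into chunks and, at each chunk boundary,
  // uses std::lower_bound to find the matching split point in the smaller range. Consecutive __merge_range
  // entries (__chunk_count_ + 1 of them) then delimit independent sub-merges, each with its own output position,
  // which __dispatch_apply runs concurrently.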

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          try {
#endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#endif
        }(),
        __destroy);

    if (!__ranges)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters  = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }
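
  // The transform-reduce below gives every chunk its own slot in a temporary buffer. Each chunk seeds its
  // accumulator from its first one or two transformed elements (so no per-chunk initial value is needed) and
  // folds the remaining elements into that slot; the chunks themselves run concurrently via __dispatch_apply.
  // The per-chunk results are then combined with __init on the calling thread using std::reduce.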

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index           = __chunk == 0 ? 0
                                            : (__chunk * __partitions.__chunk_size_) +
                                                  (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }
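
  // The stable sort below is a bottom-up parallel merge sort: every chunk is sorted concurrently with
  // __leaf_sort, then adjacent sorted runs are merged pairwise in parallel, ping-ponging the elements between
  // the input range and the temporary buffer until a single run remains. When the current number of runs is
  // odd, the leading run is first merged in place with its neighbour so that the remaining runs pair up evenly.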

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  static constexpr size_t __lane_size = 64;
};

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H