1// -*- C++ -*- 2//===--------------------------- barrier ----------------------------------===// 3// 4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5// See https://llvm.org/LICENSE.txt for license information. 6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7// 8//===----------------------------------------------------------------------===// 9 10#ifndef _LIBCPP_BARRIER 11#define _LIBCPP_BARRIER 12 13/* 14 barrier synopsis 15 16namespace std 17{ 18 19 template<class CompletionFunction = see below> 20 class barrier 21 { 22 public: 23 using arrival_token = see below; 24 25 static constexpr ptrdiff_t max() noexcept; 26 27 constexpr explicit barrier(ptrdiff_t phase_count, 28 CompletionFunction f = CompletionFunction()); 29 ~barrier(); 30 31 barrier(const barrier&) = delete; 32 barrier& operator=(const barrier&) = delete; 33 34 [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1); 35 void wait(arrival_token&& arrival) const; 36 37 void arrive_and_wait(); 38 void arrive_and_drop(); 39 40 private: 41 CompletionFunction completion; // exposition only 42 }; 43 44} 45 46*/ 47 48#include <__availability> 49#include <__config> 50#include <atomic> 51#ifndef _LIBCPP_HAS_NO_TREE_BARRIER 52# include <memory> 53#endif 54 55#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 56#pragma GCC system_header 57#endif 58 59#ifdef _LIBCPP_HAS_NO_THREADS 60# error <barrier> is not supported on this single threaded system 61#endif 62 63_LIBCPP_PUSH_MACROS 64#include <__undef_macros> 65 66#if _LIBCPP_STD_VER >= 14 67 68_LIBCPP_BEGIN_NAMESPACE_STD 69 70struct __empty_completion 71{ 72 inline _LIBCPP_INLINE_VISIBILITY 73 void operator()() noexcept 74 { 75 } 76}; 77 78#ifndef _LIBCPP_HAS_NO_TREE_BARRIER 79 80/* 81 82The default implementation of __barrier_base is a classic tree barrier. 83 84It looks different from literature pseudocode for two main reasons: 85 1. Threads that call into std::barrier functions do not provide indices, 86 so a numbering step is added before the actual barrier algorithm, 87 appearing as an N+1 round to the N rounds of the tree barrier. 88 2. A great deal of attention has been paid to avoid cache line thrashing 89 by flattening the tree structure into cache-line sized arrays, that 90 are indexed in an efficient way. 91 92*/ 93 94using __barrier_phase_t = uint8_t; 95 96class __barrier_algorithm_base; 97 98_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 99__barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected); 100 101_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 102bool __arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier, 103 __barrier_phase_t __old_phase); 104 105_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 106void __destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier); 107 108template<class _CompletionF> 109class __barrier_base { 110 ptrdiff_t __expected; 111 unique_ptr<__barrier_algorithm_base, 112 void (*)(__barrier_algorithm_base*)> __base; 113 __atomic_base<ptrdiff_t> __expected_adjustment; 114 _CompletionF __completion; 115 __atomic_base<__barrier_phase_t> __phase; 116 117public: 118 using arrival_token = __barrier_phase_t; 119 120 static constexpr ptrdiff_t max() noexcept { 121 return numeric_limits<ptrdiff_t>::max(); 122 } 123 124 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 125 __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) 126 : __expected(__expected), __base(__construct_barrier_algorithm_base(this->__expected), 127 &__destroy_barrier_algorithm_base), 128 __expected_adjustment(0), __completion(move(__completion)), __phase(0) 129 { 130 } 131 [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 132 arrival_token arrive(ptrdiff_t update) 133 { 134 auto const __old_phase = __phase.load(memory_order_relaxed); 135 for(; update; --update) 136 if(__arrive_barrier_algorithm_base(__base.get(), __old_phase)) { 137 __completion(); 138 __expected += __expected_adjustment.load(memory_order_relaxed); 139 __expected_adjustment.store(0, memory_order_relaxed); 140 __phase.store(__old_phase + 2, memory_order_release); 141 __phase.notify_all(); 142 } 143 return __old_phase; 144 } 145 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 146 void wait(arrival_token&& __old_phase) const 147 { 148 auto const __test_fn = [this, __old_phase]() -> bool { 149 return __phase.load(memory_order_acquire) != __old_phase; 150 }; 151 __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy()); 152 } 153 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 154 void arrive_and_drop() 155 { 156 __expected_adjustment.fetch_sub(1, memory_order_relaxed); 157 (void)arrive(1); 158 } 159}; 160 161#else 162 163/* 164 165The alternative implementation of __barrier_base is a central barrier. 166 167Two versions of this algorithm are provided: 168 1. A fairly straightforward implementation of the litterature for the 169 general case where the completion function is not empty. 170 2. An optimized implementation that exploits 2's complement arithmetic 171 and well-defined overflow in atomic arithmetic, to handle the phase 172 roll-over for free. 173 174*/ 175 176template<class _CompletionF> 177class __barrier_base { 178 179 __atomic_base<ptrdiff_t> __expected; 180 __atomic_base<ptrdiff_t> __arrived; 181 _CompletionF __completion; 182 __atomic_base<bool> __phase; 183public: 184 using arrival_token = bool; 185 186 static constexpr ptrdiff_t max() noexcept { 187 return numeric_limits<ptrdiff_t>::max(); 188 } 189 190 _LIBCPP_INLINE_VISIBILITY 191 __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) 192 : __expected(__expected), __arrived(__expected), __completion(move(__completion)), __phase(false) 193 { 194 } 195 [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 196 arrival_token arrive(ptrdiff_t update) 197 { 198 auto const __old_phase = __phase.load(memory_order_relaxed); 199 auto const __result = __arrived.fetch_sub(update, memory_order_acq_rel) - update; 200 auto const new_expected = __expected.load(memory_order_relaxed); 201 if(0 == __result) { 202 __completion(); 203 __arrived.store(new_expected, memory_order_relaxed); 204 __phase.store(!__old_phase, memory_order_release); 205 __phase.notify_all(); 206 } 207 return __old_phase; 208 } 209 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 210 void wait(arrival_token&& __old_phase) const 211 { 212 __phase.wait(__old_phase, memory_order_acquire); 213 } 214 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 215 void arrive_and_drop() 216 { 217 __expected.fetch_sub(1, memory_order_relaxed); 218 (void)arrive(1); 219 } 220}; 221 222template<> 223class __barrier_base<__empty_completion> { 224 225 static constexpr uint64_t __expected_unit = 1ull; 226 static constexpr uint64_t __arrived_unit = 1ull << 32; 227 static constexpr uint64_t __expected_mask = __arrived_unit - 1; 228 static constexpr uint64_t __phase_bit = 1ull << 63; 229 static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; 230 231 __atomic_base<uint64_t> __phase_arrived_expected; 232 233 static _LIBCPP_INLINE_VISIBILITY 234 constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT 235 { 236 return ((uint64_t(1u << 31) - __count) << 32) 237 | (uint64_t(1u << 31) - __count); 238 } 239 240public: 241 using arrival_token = uint64_t; 242 243 static constexpr ptrdiff_t max() noexcept { 244 return ptrdiff_t(1u << 31) - 1; 245 } 246 247 _LIBCPP_INLINE_VISIBILITY 248 explicit inline __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) 249 : __phase_arrived_expected(__init(__count)) 250 { 251 } 252 [[nodiscard]] inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 253 arrival_token arrive(ptrdiff_t update) 254 { 255 auto const __inc = __arrived_unit * update; 256 auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); 257 if((__old ^ (__old + __inc)) & __phase_bit) { 258 __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); 259 __phase_arrived_expected.notify_all(); 260 } 261 return __old & __phase_bit; 262 } 263 inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 264 void wait(arrival_token&& __phase) const 265 { 266 auto const __test_fn = [=]() -> bool { 267 uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); 268 return ((__current & __phase_bit) != __phase); 269 }; 270 __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy()); 271 } 272 inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 273 void arrive_and_drop() 274 { 275 __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); 276 (void)arrive(1); 277 } 278}; 279 280#endif //_LIBCPP_HAS_NO_TREE_BARRIER 281 282template<class _CompletionF = __empty_completion> 283class barrier { 284 285 __barrier_base<_CompletionF> __b; 286public: 287 using arrival_token = typename __barrier_base<_CompletionF>::arrival_token; 288 289 static constexpr ptrdiff_t max() noexcept { 290 return __barrier_base<_CompletionF>::max(); 291 } 292 293 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 294 barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) 295 : __b(__count, _VSTD::move(__completion)) { 296 } 297 298 barrier(barrier const&) = delete; 299 barrier& operator=(barrier const&) = delete; 300 301 [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 302 arrival_token arrive(ptrdiff_t update = 1) 303 { 304 return __b.arrive(update); 305 } 306 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 307 void wait(arrival_token&& __phase) const 308 { 309 __b.wait(_VSTD::move(__phase)); 310 } 311 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 312 void arrive_and_wait() 313 { 314 wait(arrive()); 315 } 316 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 317 void arrive_and_drop() 318 { 319 __b.arrive_and_drop(); 320 } 321}; 322 323_LIBCPP_END_NAMESPACE_STD 324 325#endif // _LIBCPP_STD_VER >= 14 326 327_LIBCPP_POP_MACROS 328 329#endif //_LIBCPP_BARRIER 330