1// -*- C++ -*- 2//===----------------------------------------------------------------------===// 3// 4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5// See https://llvm.org/LICENSE.txt for license information. 6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7// 8//===----------------------------------------------------------------------===// 9 10#ifndef _LIBCPP_BARRIER 11#define _LIBCPP_BARRIER 12 13/* 14 barrier synopsis 15 16namespace std 17{ 18 19 template<class CompletionFunction = see below> 20 class barrier 21 { 22 public: 23 using arrival_token = see below; 24 25 static constexpr ptrdiff_t max() noexcept; 26 27 constexpr explicit barrier(ptrdiff_t phase_count, 28 CompletionFunction f = CompletionFunction()); 29 ~barrier(); 30 31 barrier(const barrier&) = delete; 32 barrier& operator=(const barrier&) = delete; 33 34 [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1); 35 void wait(arrival_token&& arrival) const; 36 37 void arrive_and_wait(); 38 void arrive_and_drop(); 39 40 private: 41 CompletionFunction completion; // exposition only 42 }; 43 44} 45 46*/ 47 48#include <__availability> 49#include <__config> 50#include <__thread/timed_backoff_policy.h> 51#include <atomic> 52#ifndef _LIBCPP_HAS_NO_TREE_BARRIER 53# include <memory> 54#endif 55 56#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 57#pragma GCC system_header 58#endif 59 60#ifdef _LIBCPP_HAS_NO_THREADS 61# error <barrier> is not supported on this single threaded system 62#endif 63 64_LIBCPP_PUSH_MACROS 65#include <__undef_macros> 66 67#if _LIBCPP_STD_VER >= 14 68 69_LIBCPP_BEGIN_NAMESPACE_STD 70 71struct __empty_completion 72{ 73 inline _LIBCPP_INLINE_VISIBILITY 74 void operator()() noexcept 75 { 76 } 77}; 78 79#ifndef _LIBCPP_HAS_NO_TREE_BARRIER 80 81/* 82 83The default implementation of __barrier_base is a classic tree barrier. 84 85It looks different from literature pseudocode for two main reasons: 86 1. Threads that call into std::barrier functions do not provide indices, 87 so a numbering step is added before the actual barrier algorithm, 88 appearing as an N+1 round to the N rounds of the tree barrier. 89 2. A great deal of attention has been paid to avoid cache line thrashing 90 by flattening the tree structure into cache-line sized arrays, that 91 are indexed in an efficient way. 92 93*/ 94 95using __barrier_phase_t = uint8_t; 96 97class __barrier_algorithm_base; 98 99_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 100__barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected); 101 102_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 103bool __arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier, 104 __barrier_phase_t __old_phase); 105 106_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 107void __destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier); 108 109template<class _CompletionF> 110class __barrier_base { 111 ptrdiff_t __expected; 112 unique_ptr<__barrier_algorithm_base, 113 void (*)(__barrier_algorithm_base*)> __base; 114 __atomic_base<ptrdiff_t> __expected_adjustment; 115 _CompletionF __completion; 116 __atomic_base<__barrier_phase_t> __phase; 117 118public: 119 using arrival_token = __barrier_phase_t; 120 121 static constexpr ptrdiff_t max() noexcept { 122 return numeric_limits<ptrdiff_t>::max(); 123 } 124 125 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 126 __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) 127 : __expected(__expected), __base(__construct_barrier_algorithm_base(this->__expected), 128 &__destroy_barrier_algorithm_base), 129 __expected_adjustment(0), __completion(move(__completion)), __phase(0) 130 { 131 } 132 [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 133 arrival_token arrive(ptrdiff_t update) 134 { 135 auto const __old_phase = __phase.load(memory_order_relaxed); 136 for(; update; --update) 137 if(__arrive_barrier_algorithm_base(__base.get(), __old_phase)) { 138 __completion(); 139 __expected += __expected_adjustment.load(memory_order_relaxed); 140 __expected_adjustment.store(0, memory_order_relaxed); 141 __phase.store(__old_phase + 2, memory_order_release); 142 __phase.notify_all(); 143 } 144 return __old_phase; 145 } 146 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 147 void wait(arrival_token&& __old_phase) const 148 { 149 auto const __test_fn = [this, __old_phase]() -> bool { 150 return __phase.load(memory_order_acquire) != __old_phase; 151 }; 152 __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy()); 153 } 154 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 155 void arrive_and_drop() 156 { 157 __expected_adjustment.fetch_sub(1, memory_order_relaxed); 158 (void)arrive(1); 159 } 160}; 161 162#else 163 164/* 165 166The alternative implementation of __barrier_base is a central barrier. 167 168Two versions of this algorithm are provided: 169 1. A fairly straightforward implementation of the litterature for the 170 general case where the completion function is not empty. 171 2. An optimized implementation that exploits 2's complement arithmetic 172 and well-defined overflow in atomic arithmetic, to handle the phase 173 roll-over for free. 174 175*/ 176 177template<class _CompletionF> 178class __barrier_base { 179 180 __atomic_base<ptrdiff_t> __expected; 181 __atomic_base<ptrdiff_t> __arrived; 182 _CompletionF __completion; 183 __atomic_base<bool> __phase; 184public: 185 using arrival_token = bool; 186 187 static constexpr ptrdiff_t max() noexcept { 188 return numeric_limits<ptrdiff_t>::max(); 189 } 190 191 _LIBCPP_INLINE_VISIBILITY 192 __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) 193 : __expected(__expected), __arrived(__expected), __completion(move(__completion)), __phase(false) 194 { 195 } 196 [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 197 arrival_token arrive(ptrdiff_t update) 198 { 199 auto const __old_phase = __phase.load(memory_order_relaxed); 200 auto const __result = __arrived.fetch_sub(update, memory_order_acq_rel) - update; 201 auto const new_expected = __expected.load(memory_order_relaxed); 202 if(0 == __result) { 203 __completion(); 204 __arrived.store(new_expected, memory_order_relaxed); 205 __phase.store(!__old_phase, memory_order_release); 206 __phase.notify_all(); 207 } 208 return __old_phase; 209 } 210 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 211 void wait(arrival_token&& __old_phase) const 212 { 213 __phase.wait(__old_phase, memory_order_acquire); 214 } 215 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 216 void arrive_and_drop() 217 { 218 __expected.fetch_sub(1, memory_order_relaxed); 219 (void)arrive(1); 220 } 221}; 222 223template<> 224class __barrier_base<__empty_completion> { 225 226 static constexpr uint64_t __expected_unit = 1ull; 227 static constexpr uint64_t __arrived_unit = 1ull << 32; 228 static constexpr uint64_t __expected_mask = __arrived_unit - 1; 229 static constexpr uint64_t __phase_bit = 1ull << 63; 230 static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; 231 232 __atomic_base<uint64_t> __phase_arrived_expected; 233 234 static _LIBCPP_INLINE_VISIBILITY 235 constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT 236 { 237 return ((uint64_t(1u << 31) - __count) << 32) 238 | (uint64_t(1u << 31) - __count); 239 } 240 241public: 242 using arrival_token = uint64_t; 243 244 static constexpr ptrdiff_t max() noexcept { 245 return ptrdiff_t(1u << 31) - 1; 246 } 247 248 _LIBCPP_INLINE_VISIBILITY 249 explicit inline __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) 250 : __phase_arrived_expected(__init(__count)) 251 { 252 } 253 [[nodiscard]] inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 254 arrival_token arrive(ptrdiff_t update) 255 { 256 auto const __inc = __arrived_unit * update; 257 auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); 258 if((__old ^ (__old + __inc)) & __phase_bit) { 259 __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); 260 __phase_arrived_expected.notify_all(); 261 } 262 return __old & __phase_bit; 263 } 264 inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 265 void wait(arrival_token&& __phase) const 266 { 267 auto const __test_fn = [=]() -> bool { 268 uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); 269 return ((__current & __phase_bit) != __phase); 270 }; 271 __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy()); 272 } 273 inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 274 void arrive_and_drop() 275 { 276 __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); 277 (void)arrive(1); 278 } 279}; 280 281#endif //_LIBCPP_HAS_NO_TREE_BARRIER 282 283template<class _CompletionF = __empty_completion> 284class barrier { 285 286 __barrier_base<_CompletionF> __b; 287public: 288 using arrival_token = typename __barrier_base<_CompletionF>::arrival_token; 289 290 static constexpr ptrdiff_t max() noexcept { 291 return __barrier_base<_CompletionF>::max(); 292 } 293 294 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 295 barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) 296 : __b(__count, _VSTD::move(__completion)) { 297 } 298 299 barrier(barrier const&) = delete; 300 barrier& operator=(barrier const&) = delete; 301 302 [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 303 arrival_token arrive(ptrdiff_t update = 1) 304 { 305 return __b.arrive(update); 306 } 307 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 308 void wait(arrival_token&& __phase) const 309 { 310 __b.wait(_VSTD::move(__phase)); 311 } 312 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 313 void arrive_and_wait() 314 { 315 wait(arrive()); 316 } 317 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 318 void arrive_and_drop() 319 { 320 __b.arrive_and_drop(); 321 } 322}; 323 324_LIBCPP_END_NAMESPACE_STD 325 326#endif // _LIBCPP_STD_VER >= 14 327 328_LIBCPP_POP_MACROS 329 330#endif //_LIBCPP_BARRIER 331