/*
 * kmp_barrier.h
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_BARRIER_H
#define KMP_BARRIER_H

#include "kmp.h"
#include "kmp_i18n.h"

#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
// Use _mm_malloc/_mm_free when the SSE intrinsics header is available.
#include <xmmintrin.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
// C11 aligned_alloc takes (alignment, size), the reverse of our macro's order.
#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size)
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
  void *ptr;
  int n = posix_memalign(&ptr, alignment, size);
  if (n != 0) {
    if (ptr)
      free(ptr);
    return nullptr;
  }
  return ptr;
}
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE__ALIGNED_MALLOC
#include <malloc.h> // for _aligned_malloc/_aligned_free
#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
// Fall back to the default allocator; the requested alignment is not
// guaranteed in this case.
#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
#endif
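// Illustrative sketch (not part of this header's interface): whichever branch
// above was selected, callers pair KMP_ALIGNED_ALLOCATE with KMP_ALIGNED_FREE
// and pass a power-of-two alignment, e.g.:
//
//   void *p = KMP_ALIGNED_ALLOCATE(1024, 4 * CACHE_LINE);
//   if (p) {
//     // ... use p, aligned to a four-cache-line boundary ...
//     KMP_ALIGNED_FREE(p);
//   }
//
// The two names must stay paired: a pointer obtained from _mm_malloc or
// _aligned_malloc cannot be released with plain free().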
// Use four cache lines: the MLC (mid-level cache) tends to prefetch the next
// or previous cache line, creating a possible false conflict between cores,
// so padding to four lines is the only way to guarantee that no such prefetch
// can happen.
#ifndef KMP_FOURLINE_ALIGN_CACHE
#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
#endif

#define KMP_OPTIMIZE_FOR_REDUCTIONS 0

class distributedBarrier {
  // Each barrier flag lives in its own four-cache-line block, so accesses by
  // different threads never contend, even with adjacent-line prefetching.
  struct flags_s {
    kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
  };

  struct go_s {
    std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
  };

  struct iter_s {
    kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
  };

  struct sleep_s {
    std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
  };
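  // Illustrative note (assuming a 64-byte CACHE_LINE): KMP_FOURLINE_ALIGN_CACHE
  // forces each struct above to 256 bytes, so adjacent array elements sit at
  // least three full cache lines apart and an adjacent-line prefetch on one
  // element can never pull in another. A hypothetical compile-time check:
  //
  //   static_assert(sizeof(go_s) == 4 * CACHE_LINE,
  //                 "each go flag must occupy four full cache lines");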
  void init(size_t nthr);
  void resize(size_t nthr);
  void computeGo(size_t n);
  void computeVarsForN(size_t n);

public:
  enum {
    MAX_ITERS = 3,
    MAX_GOS = 8,
    IDEAL_GOS = 4,
    IDEAL_CONTENTION = 16,
  };

  flags_s *flags[MAX_ITERS];
  go_s *go;
  iter_s *iter;
  sleep_s *sleep;

  size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
  size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
  // number of go signals, each requiring one write per iteration
  size_t KMP_ALIGN_CACHE num_gos;
  // number of groups of gos
  size_t KMP_ALIGN_CACHE num_groups;
  // threads per go signal
  size_t KMP_ALIGN_CACHE threads_per_go;
  bool KMP_ALIGN_CACHE fix_threads_per_go;
  // threads per group
  size_t KMP_ALIGN_CACHE threads_per_group;
  // number of go signals in a group
  size_t KMP_ALIGN_CACHE gos_per_group;
  void *team_icvs;

  distributedBarrier() = delete;
  ~distributedBarrier() = delete;

  // Used instead of a constructor so the object itself gets aligned storage
  static distributedBarrier *allocate(int nThreads) {
    distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
        sizeof(distributedBarrier), 4 * CACHE_LINE);
    if (!d) {
      KMP_FATAL(MemoryAllocFailed);
    }
    d->num_threads = 0;
    d->max_threads = 0;
    for (int i = 0; i < MAX_ITERS; ++i)
      d->flags[i] = NULL;
    d->go = NULL;
    d->iter = NULL;
    d->sleep = NULL;
    d->team_icvs = NULL;
    d->fix_threads_per_go = false;
    // calculate gos and groups ONCE on base size
    d->computeGo(nThreads);
    d->init(nThreads);
    return d;
  }

  static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }

  void update_num_threads(size_t nthr) { init(nthr); }

  bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
  size_t get_num_threads() { return num_threads; }
  kmp_uint64 go_release();
  void go_reset();
};

#endif // KMP_BARRIER_H
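// Illustrative usage sketch (an assumption about the call sites, not code from
// this header): with the constructor and destructor deleted, instances can only
// be created and destroyed through the static helpers, which keep the object
// itself on a four-cache-line boundary:
//
//   distributedBarrier *b = distributedBarrier::allocate(nproc);
//   // ... team runs barriers; re-initialize if the team size changes ...
//   b->update_num_threads(new_nproc);
//   distributedBarrier::deallocate(b);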