/*
 * kmp_barrier.h
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_BARRIER_H
#define KMP_BARRIER_H

#include "kmp.h"
#include "kmp_i18n.h"

#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
#include <xmmintrin.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
#define KMP_ALIGN_UP(val, alignment)                                           \
  (((val) + (alignment)-1) / (alignment) * (alignment))
#define KMP_ALIGNED_ALLOCATE(size, alignment)                                  \
  aligned_alloc(alignment, KMP_ALIGN_UP(size, alignment))
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
  void *ptr;
  int n = posix_memalign(&ptr, alignment, size);
  if (n != 0) {
    if (ptr)
      free(ptr);
    return nullptr;
  }
  return ptr;
}
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE__ALIGNED_MALLOC
#include <malloc.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
#endif

// Use four cache lines: MLC tends to prefetch the next or previous cache line,
// creating a possible false conflict between cores, so this is the only way to
// guarantee that no such prefetch can happen.
#ifndef KMP_FOURLINE_ALIGN_CACHE
#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
#endif

#define KMP_OPTIMIZE_FOR_REDUCTIONS 0

// Shared state for the distributed barrier. Every flag, go, iteration, and
// sleep field is padded to four cache lines (see above) so that adjacent-line
// prefetches cannot create conflicts between cores.
class distributedBarrier {
  struct flags_s {
    kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
  };

  struct go_s {
    std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
  };

  struct iter_s {
    kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
  };

  struct sleep_s {
    std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
  };

  void init(size_t nthr);
  void resize(size_t nthr);
  void computeGo(size_t n);
  void computeVarsForN(size_t n);

public:
  enum {
    MAX_ITERS = 3,
    MAX_GOS = 8,
    IDEAL_GOS = 4,
    IDEAL_CONTENTION = 16,
  };

  flags_s *flags[MAX_ITERS];
  go_s *go;
  iter_s *iter;
  sleep_s *sleep;

  size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
  size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
  // number of go signals each requiring one write per iteration
  size_t KMP_ALIGN_CACHE num_gos;
  // number of groups of gos
  size_t KMP_ALIGN_CACHE num_groups;
  // threads per go signal
  size_t KMP_ALIGN_CACHE threads_per_go;
  bool KMP_ALIGN_CACHE fix_threads_per_go;
  // threads per group
  size_t KMP_ALIGN_CACHE threads_per_group;
  // number of go signals in a group
  size_t KMP_ALIGN_CACHE gos_per_group;
  void *team_icvs;

  distributedBarrier() = delete;
  ~distributedBarrier() = delete;

  // Used instead of constructor to create aligned data
  static distributedBarrier *allocate(int nThreads) {
    distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
        sizeof(distributedBarrier), 4 * CACHE_LINE);
    if (!d) {
      KMP_FATAL(MemoryAllocFailed);
    }
    d->num_threads = 0;
    d->max_threads = 0;
    for (int i = 0; i < MAX_ITERS; ++i)
      d->flags[i] = NULL;
    d->go = NULL;
    d->iter = NULL;
    d->sleep = NULL;
    d->team_icvs = NULL;
    d->fix_threads_per_go = false;
    // calculate gos and groups ONCE on base size
    d->computeGo(nThreads);
    d->init(nThreads);
    return d;
  }

  static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }

  void update_num_threads(size_t nthr) { init(nthr); }

  bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
  size_t get_num_threads() { return num_threads; }
  kmp_uint64 go_release();
  void go_reset();
};

#endif // KMP_BARRIER_H
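
// Illustrative lifetime sketch (hypothetical caller, not a prescribed API
// contract): allocate() stands in for the deleted constructor so the object
// itself gets the same 4-cache-line alignment as its padded members, and
// deallocate() must be used to release it:
//
//   distributedBarrier *b = distributedBarrier::allocate(nproc);
//   ...
//   if (b->need_resize(new_nproc))      // arrays sized for fewer threads?
//     ; // resizing happens inside init(), re-run via update_num_threads()
//   b->update_num_threads(new_nproc);
//   ...
//   distributedBarrier::deallocate(b);  // releases the aligned allocation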