/*
 * kmp_barrier.h
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_BARRIER_H
#define KMP_BARRIER_H

#include "kmp.h"
#include "kmp_i18n.h"

#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
#include <xmmintrin.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
#define KMP_ALIGN_UP(val, alignment)                                           \
  (((val) + (alignment)-1) / (alignment) * (alignment))
#define KMP_ALIGNED_ALLOCATE(size, alignment)                                  \
  aligned_alloc(alignment, KMP_ALIGN_UP(size, alignment))
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
  void *ptr;
  int n = posix_memalign(&ptr, alignment, size);
  if (n != 0) {
    if (ptr)
      free(ptr);
    return nullptr;
  }
  return ptr;
}
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE__ALIGNED_MALLOC
#include <malloc.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
#endif

// Use four cache lines: the MLC tends to prefetch the next or previous cache
// line, creating possible false sharing between cores, so this is the only way
// to guarantee that no such prefetch can happen.
#ifndef KMP_FOURLINE_ALIGN_CACHE
#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
#endif

#define KMP_OPTIMIZE_FOR_REDUCTIONS 0

class distributedBarrier {
  struct flags_s {
    kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
  };

  struct go_s {
    std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
  };

  struct iter_s {
    kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
  };

  struct sleep_s {
    std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
  };

  void init(size_t nthr);
  void resize(size_t nthr);
  void computeGo(size_t n);
  void computeVarsForN(size_t n);

public:
  enum {
    MAX_ITERS = 3,
    MAX_GOS = 8,
    IDEAL_GOS = 4,
    IDEAL_CONTENTION = 16,
  };

  flags_s *flags[MAX_ITERS];
  go_s *go;
  iter_s *iter;
  sleep_s *sleep;

  size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
  size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
  // number of go signals, each requiring one write per iteration
  size_t KMP_ALIGN_CACHE num_gos;
  // number of groups of gos
  size_t KMP_ALIGN_CACHE num_groups;
  // threads per go signal
  size_t KMP_ALIGN_CACHE threads_per_go;
  bool KMP_ALIGN_CACHE fix_threads_per_go;
  // threads per group
  size_t KMP_ALIGN_CACHE threads_per_group;
  // number of go signals in a group
  size_t KMP_ALIGN_CACHE gos_per_group;
  void *team_icvs;

  distributedBarrier() = delete;
  ~distributedBarrier() = delete;

  // Used instead of a constructor so the object itself can be created with
  // the required alignment
  static distributedBarrier *allocate(int nThreads) {
    distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
        sizeof(distributedBarrier), 4 * CACHE_LINE);
    if (!d) {
      KMP_FATAL(MemoryAllocFailed);
    }
    d->num_threads = 0;
    d->max_threads = 0;
    for (int i = 0; i < MAX_ITERS; ++i)
      d->flags[i] = NULL;
    d->go = NULL;
    d->iter = NULL;
    d->sleep = NULL;
    d->team_icvs = NULL;
    d->fix_threads_per_go = false;
    // calculate gos and groups ONCE on base size
    d->computeGo(nThreads);
    d->init(nThreads);
    return d;
  }
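
  // A minimal usage sketch (hypothetical caller code, kept as a comment so
  // the header is unchanged): because the constructor and destructor are
  // deleted, callers pair the static allocate()/deallocate() helpers instead
  // of new/delete, which keeps the object aligned to 4 * CACHE_LINE. The name
  // `team_size` below is an assumed example variable, not part of this API:
  //
  //   distributedBarrier *b = distributedBarrier::allocate(team_size);
  //   // ... use b for barrier release/reset phases ...
  //   distributedBarrier::deallocate(b);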
  static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }

  void update_num_threads(size_t nthr) { init(nthr); }

  bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
  size_t get_num_threads() { return num_threads; }
  kmp_uint64 go_release();
  void go_reset();
};

#endif // KMP_BARRIER_H
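
// A minimal sketch (illustration only, left in a comment so nothing extra is
// compiled into this header) of the round-up arithmetic behind KMP_ALIGN_UP
// above: C11 aligned_alloc requires the size to be an integral multiple of
// the alignment, so the requested size is rounded up first. `align_up` is a
// hypothetical stand-in for the macro:
//
//   static size_t align_up(size_t val, size_t alignment) {
//     return (val + alignment - 1) / alignment * alignment;
//   }
//   // align_up(100, 64) == 128; align_up(128, 64) == 128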