xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_barrier.h (revision 349cc55c9796c4596a5b9904cd3281af295f878f)
1*349cc55cSDimitry Andric /*
2*349cc55cSDimitry Andric  * kmp_barrier.h
3*349cc55cSDimitry Andric  */
4*349cc55cSDimitry Andric 
5*349cc55cSDimitry Andric //===----------------------------------------------------------------------===//
6*349cc55cSDimitry Andric //
7*349cc55cSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8*349cc55cSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
9*349cc55cSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10*349cc55cSDimitry Andric //
11*349cc55cSDimitry Andric //===----------------------------------------------------------------------===//
12*349cc55cSDimitry Andric 
13*349cc55cSDimitry Andric #ifndef KMP_BARRIER_H
14*349cc55cSDimitry Andric #define KMP_BARRIER_H
15*349cc55cSDimitry Andric 
16*349cc55cSDimitry Andric #include "kmp.h"
17*349cc55cSDimitry Andric #include "kmp_i18n.h"
18*349cc55cSDimitry Andric 
#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
#include <xmmintrin.h>
// Intel SSE aligned allocator: memory obtained from _mm_malloc must be
// released with _mm_free.
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
// C11 aligned_alloc takes (alignment, size) -- note the argument order is
// swapped relative to KMP_ALIGNED_ALLOCATE(size, alignment).
#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size)
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
// POSIX fallback: returns nullptr on allocation failure.
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
  void *ptr = nullptr;
  // On failure posix_memalign() leaves *memptr with an unspecified value, so
  // the pointer must not be freed or otherwise used; just report the failure.
  // (The previous code freed the possibly-uninitialized pointer here, which
  // is undefined behavior.)
  if (posix_memalign(&ptr, alignment, size) != 0)
    return nullptr;
  return ptr;
}
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE__ALIGNED_MALLOC
// Windows CRT aligned allocator.
#include <malloc.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
// No aligned allocator available: fall back to the runtime's plain
// allocator. NOTE(review): the requested alignment is NOT honored on this
// path; callers relying on 4*CACHE_LINE alignment only get whatever
// KMP_INTERNAL_MALLOC provides.
#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
#endif
46*349cc55cSDimitry Andric 
// Use four cache lines: MLC tends to prefetch the next or previous cache line
// creating a possible fake conflict between cores, so this is the only way to
// guarantee that no such prefetch can happen.
#ifndef KMP_FOURLINE_ALIGN_CACHE
#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
#endif

// Compile-time switch used by the barrier implementation; 0 disables the
// reduction-oriented code path variant.
#define KMP_OPTIMIZE_FOR_REDUCTIONS 0
55*349cc55cSDimitry Andric 
// Data structure for the "distributed" barrier algorithm. Threads are
// organized into groups, each group sharing a set of "go" signals; the
// counts of gos/groups/threads-per-go are computed once in computeGo() and
// adjusted on resize. Instances are never constructed or destroyed directly:
// both the default constructor and the destructor are deleted, and objects
// are created/released only through allocate()/deallocate() so that the
// storage is 4*CACHE_LINE aligned.
class distributedBarrier {
  // Each of the four structs below wraps a single field padded out to four
  // cache lines (see KMP_FOURLINE_ALIGN_CACHE above) so that per-thread
  // flags never share (or neighbor) a cache line across cores.

  // Per-thread flag: how many threads this thread still needs to wait for.
  struct flags_s {
    kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
  };

  // Release ("go") signal, written once per barrier iteration per signal.
  struct go_s {
    std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
  };

  // Per-thread barrier iteration counter.
  struct iter_s {
    kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
  };

  // Per-thread sleep state flag.
  struct sleep_s {
    std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
  };

  // Internal setup helpers; defined in the corresponding .cpp file.
  void init(size_t nthr);
  void resize(size_t nthr);
  void computeGo(size_t n);
  void computeVarsForN(size_t n);

public:
  enum {
    MAX_ITERS = 3, // number of flags[] arrays kept (rotating iterations)
    MAX_GOS = 8, // upper bound on go signals per group
    IDEAL_GOS = 4, // preferred number of go signals per group
    IDEAL_CONTENTION = 16, // preferred threads contending per go signal
  };

  // One flags array per tracked iteration; entries are allocated in init().
  flags_s *flags[MAX_ITERS];
  go_s *go;
  iter_s *iter;
  sleep_s *sleep;

  size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
  size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
  // number of go signals each requiring one write per iteration
  size_t KMP_ALIGN_CACHE num_gos;
  // number of groups of gos
  size_t KMP_ALIGN_CACHE num_groups;
  // threads per go signal
  size_t KMP_ALIGN_CACHE threads_per_go;
  bool KMP_ALIGN_CACHE fix_threads_per_go; // if true, threads_per_go is fixed
  // threads per group
  size_t KMP_ALIGN_CACHE threads_per_group;
  // number of go signals in a group
  size_t KMP_ALIGN_CACHE gos_per_group;
  void *team_icvs; // opaque storage for the team's internal control variables

  // Lifetime is managed exclusively by allocate()/deallocate() below.
  distributedBarrier() = delete;
  ~distributedBarrier() = delete;

  // Used instead of constructor to create aligned data
  static distributedBarrier *allocate(int nThreads) {
    distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
        sizeof(distributedBarrier), 4 * CACHE_LINE);
    if (!d) {
      // KMP_FATAL does not return, so no cleanup path is needed here.
      KMP_FATAL(MemoryAllocFailed);
    }
    // Raw storage: every member must be explicitly initialized here since no
    // constructor runs.
    d->num_threads = 0;
    d->max_threads = 0;
    for (int i = 0; i < MAX_ITERS; ++i)
      d->flags[i] = NULL;
    d->go = NULL;
    d->iter = NULL;
    d->sleep = NULL;
    d->team_icvs = NULL;
    d->fix_threads_per_go = false;
    // calculate gos and groups ONCE on base size
    d->computeGo(nThreads);
    d->init(nThreads);
    return d;
  }

  // Releases the aligned storage; no destructor runs (it is deleted), and
  // the arrays owned by the barrier are presumably released by the runtime
  // elsewhere -- NOTE(review): confirm the .cpp side frees flags/go/iter/
  // sleep before deallocate() is called.
  static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }

  // Re-runs init() for a new thread count (may resize internal arrays).
  void update_num_threads(size_t nthr) { init(nthr); }

  // True when the current arrays are too small for new_nthr threads.
  bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
  size_t get_num_threads() { return num_threads; }
  kmp_uint64 go_release();
  void go_reset();
};
140*349cc55cSDimitry Andric 
141*349cc55cSDimitry Andric #endif // KMP_BARRIER_H