/*
 * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_io.h"
#include "kmp_wrapper_malloc.h"

#if KMP_HWLOC_ENABLED
#if HWLOC_API_VERSION > 0x00020300
#define KMP_HWLOC_LOCATION_TYPE_CPUSET HWLOC_LOCATION_TYPE_CPUSET
#elif HWLOC_API_VERSION == 0x00020300
#define KMP_HWLOC_LOCATION_TYPE_CPUSET                                         \
  hwloc_location::HWLOC_LOCATION_TYPE_CPUSET
#else
enum hwloc_memattr_id_e {
  HWLOC_MEMATTR_ID_BANDWIDTH,
  HWLOC_MEMATTR_ID_CAPACITY
};
#endif
#endif // KMP_HWLOC_ENABLED

// Disable bget when it is not used
#if KMP_USE_BGET

/* Thread private buffer management code */

typedef int (*bget_compact_t)(size_t, int);
typedef void *(*bget_acquire_t)(size_t);
typedef void (*bget_release_t)(void *);

/* NOTE: bufsize must be a signed datatype */

#if KMP_OS_WINDOWS
#if KMP_ARCH_X86 || KMP_ARCH_ARM
typedef kmp_int32 bufsize;
#else
typedef kmp_int64 bufsize;
#endif
#else
typedef ssize_t bufsize;
#endif // KMP_OS_WINDOWS

/* The three modes of operation are fifo search, lifo search, and best fit. */

typedef enum bget_mode {
  bget_mode_fifo = 0,
  bget_mode_lifo = 1,
  bget_mode_best = 2
} bget_mode_t;

static void bpool(kmp_info_t *th, void *buffer, bufsize len);
static void *bget(kmp_info_t *th, bufsize size);
static void *bgetz(kmp_info_t *th, bufsize size);
static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
static void brel(kmp_info_t *th, void *buf);
static void bectl(kmp_info_t *th, bget_compact_t compact,
                  bget_acquire_t acquire, bget_release_t release,
                  bufsize pool_incr);

/* BGET CONFIGURATION */
/* Buffer allocation size quantum: all buffers allocated are a
   multiple of this size.  This MUST be a power of two. */

/* On some architectures, malloc() does not ensure 16-byte alignment;
   Solaris/sparc and x86 are among them. */

#if KMP_ARCH_X86 || KMP_ARCH_SPARC || !KMP_HAVE_QUAD

#define SizeQuant 8
#define AlignType double

#else

#define SizeQuant 16
#define AlignType _Quad

#endif
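
/* A consequence of the configuration above (illustrative note): since
   SizeQuant is a power of two and both the common header and every block
   size are rounded up to a multiple of it, any pointer bget() returns is
   SizeQuant-aligned; the KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0)
   checks below verify exactly this. */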

// Define this symbol to enable the bstats() function which calculates the
// total free space in the buffer pool, the largest available buffer, and the
// total space currently allocated.
#define BufStats 1

#ifdef KMP_DEBUG

// Define this symbol to enable the bpoold() function which dumps the buffers
// in a buffer pool.
#define BufDump 1

// Define this symbol to enable the bpoolv() function for validating a buffer
// pool.
#define BufValid 1

// Define this symbol to enable the bufdump() function which allows dumping the
// contents of an allocated or free buffer.
#define DumpData 1

#ifdef NOT_USED_NOW

// Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants
// who attempt to use pointers into released buffers.
#define FreeWipe 1

// Use a best fit algorithm when searching for space for an allocation request.
// This uses memory more efficiently, but allocation will be much slower.
#define BestFit 1

#endif /* NOT_USED_NOW */
#endif /* KMP_DEBUG */

static bufsize bget_bin_size[] = {
    0,
    //    1 << 6,    /* .5 Cache line */
    1 << 7, /* 1 Cache line, new */
    1 << 8, /* 2 Cache lines */
    1 << 9, /* 4 Cache lines, new */
    1 << 10, /* 8 Cache lines */
    1 << 11, /* 16 Cache lines, new */
    1 << 12, 1 << 13, /* new */
    1 << 14, 1 << 15, /* new */
    1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /*  1MB */
    1 << 21, /*  2MB */
    1 << 22, /*  4MB */
    1 << 23, /*  8MB */
    1 << 24, /* 16MB */
    1 << 25, /* 32MB */
};

#define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))

struct bfhead;

//  Declare the interface, including the requested buffer size type, bufsize.

/* Queue links */
typedef struct qlinks {
  struct bfhead *flink; /* Forward link */
  struct bfhead *blink; /* Backward link */
} qlinks_t;

/* Header in allocated and free buffers */
typedef struct bhead2 {
  kmp_info_t *bthr; /* The thread which owns the buffer pool */
  bufsize prevfree; /* Relative link back to previous free buffer in memory or
                       0 if previous buffer is allocated.  */
  bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
} bhead2_t;

/* Make sure the bhead structure is a multiple of SizeQuant in size. */
typedef union bhead {
  KMP_ALIGN(SizeQuant)
  AlignType b_align;
  char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
  bhead2_t bb;
} bhead_t;
#define BH(p) ((bhead_t *)(p))

/*  Header in directly allocated buffers (by acqfcn) */
typedef struct bdhead {
  bufsize tsize; /* Total size, including overhead */
  bhead_t bh; /* Common header */
} bdhead_t;
#define BDH(p) ((bdhead_t *)(p))

/* Header in free buffers */
typedef struct bfhead {
  bhead_t bh; /* Common allocated/free header */
  qlinks_t ql; /* Links on free list */
} bfhead_t;
#define BFH(p) ((bfhead_t *)(p))
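
/* Layout sketch (illustrative): bget() hands out the address just past the
   bhead_t, so the qlinks_t of a free buffer occupies the first bytes of what
   is the user area while the buffer is allocated:

     allocated buffer             free buffer
     +-------------------+        +-------------------+
     | bhead_t bsize < 0 |        | bhead_t bsize > 0 |
     +-------------------+        +-------------------+
     | user data ...     |        | qlinks_t (links)  |
     |                   |        | unused space ...  |
     +-------------------+        +-------------------+
*/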

typedef struct thr_data {
  bfhead_t freelist[MAX_BGET_BINS];
#if BufStats
  size_t totalloc; /* Total space currently allocated */
  long numget, numrel; /* Number of bget() and brel() calls */
  long numpblk; /* Number of pool blocks */
  long numpget, numprel; /* Number of block gets and rels */
  long numdget, numdrel; /* Number of direct gets and rels */
#endif /* BufStats */

  /* Automatic expansion block management functions */
  bget_compact_t compfcn;
  bget_acquire_t acqfcn;
  bget_release_t relfcn;

  bget_mode_t mode; /* what allocation mode to use? */

  bufsize exp_incr; /* Expansion block size */
  bufsize pool_len; /* 0: no bpool calls have been made
                       -1: not all pool blocks are the same size
                       >0: (common) block size for all bpool calls made so far
                    */
  bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */
} thr_data_t;

/*  Minimum allocation quantum: */
#define QLSize (sizeof(qlinks_t))
#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)
#define MaxSize                                                                \
  (bufsize)(                                                                   \
      ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))
// Maximum for the requested size.
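
/* Worked example (illustrative): with a 64-bit bufsize and SizeQuant == 16,
   MaxSize == ~(0x8000000000000000 | 0xF) == 0x7FFFFFFFFFFFFFF0, the largest
   positive bufsize that is still a multiple of SizeQuant. */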

/* End sentinel: value placed in bsize field of dummy block delimiting
   end of pool block.  The most negative number which will  fit  in  a
   bufsize, defined in a way that the compiler will accept. */

#define ESent                                                                  \
  ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))
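
/* Worked example (illustrative): for a 64-bit bufsize, ESent expands to
   -((2^62 - 1) * 2) - 2 == -(2^63 - 2) - 2 == -2^63, the most negative
   64-bit value, built this way so no intermediate constant overflows. */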

/* Thread Data management routines */
static int bget_get_bin(bufsize size) {
  // binary chop bins
  int lo = 0, hi = MAX_BGET_BINS - 1;

  KMP_DEBUG_ASSERT(size > 0);

  while ((hi - lo) > 1) {
    int mid = (lo + hi) >> 1;
    if (size < bget_bin_size[mid])
      hi = mid - 1;
    else
      lo = mid;
  }

  KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));

  return lo;
}
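
/* Illustrative trace: for size == 300 the chop narrows [0, MAX_BGET_BINS - 1]
   down to lo == 2, the 1 << 8 (256-byte) bin -- roughly the largest bin whose
   size does not exceed the request, so a search that starts there and walks
   upward skips lists whose buffers are certainly too small. */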

static void set_thr_data(kmp_info_t *th) {
  int i;
  thr_data_t *data;

  data = (thr_data_t *)((!th->th.th_local.bget_data)
                            ? __kmp_allocate(sizeof(*data))
                            : th->th.th_local.bget_data);

  memset(data, '\0', sizeof(*data));

  for (i = 0; i < MAX_BGET_BINS; ++i) {
    data->freelist[i].ql.flink = &data->freelist[i];
    data->freelist[i].ql.blink = &data->freelist[i];
  }

  th->th.th_local.bget_data = data;
  th->th.th_local.bget_list = 0;
#if !USE_CMP_XCHG_FOR_BGET
#ifdef USE_QUEUING_LOCK_FOR_BGET
  __kmp_init_lock(&th->th.th_local.bget_lock);
#else
  __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
#endif /* USE_QUEUING_LOCK_FOR_BGET */
#endif /* ! USE_CMP_XCHG_FOR_BGET */
}

static thr_data_t *get_thr_data(kmp_info_t *th) {
  thr_data_t *data;

  data = (thr_data_t *)th->th.th_local.bget_data;

  KMP_DEBUG_ASSERT(data != 0);

  return data;
}

/* Walk the free list and release the enqueued buffers */
static void __kmp_bget_dequeue(kmp_info_t *th) {
  void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);

  if (p != 0) {
#if USE_CMP_XCHG_FOR_BGET
    {
      volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
      while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
                                        CCAST(void *, old_value), nullptr)) {
        KMP_CPU_PAUSE();
        old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
      }
      p = CCAST(void *, old_value);
    }
#else /* ! USE_CMP_XCHG_FOR_BGET */
#ifdef USE_QUEUING_LOCK_FOR_BGET
    __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
#else
    __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
#endif /* USE_QUEUING_LOCK_FOR_BGET */

    p = (void *)th->th.th_local.bget_list;
    th->th.th_local.bget_list = 0;

#ifdef USE_QUEUING_LOCK_FOR_BGET
    __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
#else
    __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
#endif
#endif /* USE_CMP_XCHG_FOR_BGET */

    /* Check again to make sure the list is not empty */
    while (p != 0) {
      void *buf = p;
      bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));

      KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
      KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
                       (kmp_uintptr_t)th); // clear possible mark
      KMP_DEBUG_ASSERT(b->ql.blink == 0);

      p = (void *)b->ql.flink;

      brel(th, buf);
    }
  }
}

/* Chain together the free buffers by using the thread owner field */
static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
#ifdef USE_QUEUING_LOCK_FOR_BGET
                               ,
                               kmp_int32 rel_gtid
#endif
) {
  bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));

  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
  KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
                   (kmp_uintptr_t)th); // clear possible mark

  b->ql.blink = 0;

  KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
                __kmp_gtid_from_thread(th)));

#if USE_CMP_XCHG_FOR_BGET
  {
    volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
    /* the next pointer must be set before setting bget_list to buf to avoid
       exposing a broken list to other threads, even for an instant. */
    b->ql.flink = BFH(CCAST(void *, old_value));

    while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
                                      CCAST(void *, old_value), buf)) {
      KMP_CPU_PAUSE();
      old_value = TCR_PTR(th->th.th_local.bget_list);
      /* the next pointer must be set before setting bget_list to buf to avoid
         exposing a broken list to other threads, even for an instant. */
      b->ql.flink = BFH(CCAST(void *, old_value));
    }
  }
#else /* ! USE_CMP_XCHG_FOR_BGET */
#ifdef USE_QUEUING_LOCK_FOR_BGET
  __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
#else
  __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
#endif

  b->ql.flink = BFH(th->th.th_local.bget_list);
  th->th.th_local.bget_list = (void *)buf;

#ifdef USE_QUEUING_LOCK_FOR_BGET
  __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
#else
  __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
#endif
#endif /* USE_CMP_XCHG_FOR_BGET */
}
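
/* Cross-thread frees in one sketch: when brel() sees a buffer owned by a
   different thread it calls __kmp_bget_enqueue() to push the buffer onto the
   owner's bget_list (a CAS loop in the default USE_CMP_XCHG_FOR_BGET build);
   the owner later drains that list in __kmp_bget_dequeue() and releases each
   buffer with an ordinary brel() against its own pool. */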

/* insert buffer back onto a new freelist */
static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
  int bin;

  KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
  KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);

  bin = bget_get_bin(b->bh.bb.bsize);

  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
                   &thr->freelist[bin]);
  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
                   &thr->freelist[bin]);

  b->ql.flink = &thr->freelist[bin];
  b->ql.blink = thr->freelist[bin].ql.blink;

  thr->freelist[bin].ql.blink = b;
  b->ql.blink->ql.flink = b;
}

/* unlink the buffer from the old freelist */
static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
  KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
  KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);

  b->ql.blink->ql.flink = b->ql.flink;
  b->ql.flink->ql.blink = b->ql.blink;
}

/*  GET STATS -- check info on free list */
static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
  thr_data_t *thr = get_thr_data(th);
  int bin;

  *total_free = *max_free = 0;

  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
    bfhead_t *b, *best;

    best = &thr->freelist[bin];
    b = best->ql.flink;

    while (b != &thr->freelist[bin]) {
      *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
      if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
        best = b;

      /* Link to next buffer */
      b = b->ql.flink;
    }

    if (*max_free < best->bh.bb.bsize)
      *max_free = best->bh.bb.bsize;
  }

  if (*max_free > (bufsize)sizeof(bhead_t))
    *max_free -= sizeof(bhead_t);
}

/*  BGET  --  Allocate a buffer.  */
static void *bget(kmp_info_t *th, bufsize requested_size) {
  thr_data_t *thr = get_thr_data(th);
  bufsize size = requested_size;
  bfhead_t *b;
  void *buf;
  int compactseq = 0;
  int use_blink = 0;
  /* For BestFit */
  bfhead_t *best;

  if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
    return NULL;
  }

  __kmp_bget_dequeue(th); /* Release any queued buffers */

  if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
    size = SizeQ;
  }
#if defined(SizeQuant) && (SizeQuant > 1)
  size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
#endif
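
  /* Illustrative arithmetic (comment only, nothing executed): with
     SizeQuant == 16 and 64-bit pointers, a 10-byte request is first raised
     to SizeQ == 16 (room for the queue links), and (16 + 15) & ~15 == 16;
     a 17-byte request rounds up as (17 + 15) & ~15 == 32. */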

  size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
  KMP_DEBUG_ASSERT(size >= 0);
  KMP_DEBUG_ASSERT(size % SizeQuant == 0);

  use_blink = (thr->mode == bget_mode_lifo);

  /* If a compact function was provided in the call to bectl(), wrap
     a loop around the allocation process  to  allow  compaction  to
     intervene in case we don't find a suitable buffer in the chain. */

  for (;;) {
    int bin;

    for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
      /* Link to next buffer */
      b = (use_blink ? thr->freelist[bin].ql.blink
                     : thr->freelist[bin].ql.flink);

      if (thr->mode == bget_mode_best) {
        best = &thr->freelist[bin];

        /* Scan the free list searching for the first buffer big enough
           to hold the requested size buffer. */
        while (b != &thr->freelist[bin]) {
          if (b->bh.bb.bsize >= (bufsize)size) {
            if ((best == &thr->freelist[bin]) ||
                (b->bh.bb.bsize < best->bh.bb.bsize)) {
              best = b;
            }
          }

          /* Link to next buffer */
          b = (use_blink ? b->ql.blink : b->ql.flink);
        }
        b = best;
      }

      while (b != &thr->freelist[bin]) {
        if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {

          // Buffer is big enough to satisfy the request. Allocate it to the
          // caller. We must decide whether the buffer is large enough to split
          // into the part given to the caller and a free buffer that remains
          // on the free list, or whether the entire buffer should be removed
          // from the free list and given to the caller in its entirety. We
          // only split the buffer if enough room remains for a header plus the
          // minimum quantum of allocation.
          if ((b->bh.bb.bsize - (bufsize)size) >
              (bufsize)(SizeQ + (sizeof(bhead_t)))) {
            bhead_t *ba, *bn;

            ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
            bn = BH(((char *)ba) + size);

            KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);

            /* Subtract size from length of free block. */
            b->bh.bb.bsize -= (bufsize)size;

            /* Link allocated buffer to the previous free buffer. */
            ba->bb.prevfree = b->bh.bb.bsize;

            /* Plug negative size into user buffer. */
            ba->bb.bsize = -size;

            /* Mark this buffer as owned by this thread. */
            TCW_PTR(ba->bb.bthr,
                    th); // not an allocated address (do not mark it)
            /* Mark buffer after this one not preceded by free block. */
            bn->bb.prevfree = 0;

            // unlink buffer from old freelist, and reinsert into new freelist
            __kmp_bget_remove_from_freelist(b);
            __kmp_bget_insert_into_freelist(thr, b);
#if BufStats
            thr->totalloc += (size_t)size;
            thr->numget++; /* Increment number of bget() calls */
#endif
            buf = (void *)((((char *)ba) + sizeof(bhead_t)));
            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
            return buf;
          } else {
            bhead_t *ba;

            ba = BH(((char *)b) + b->bh.bb.bsize);

            KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);

            /* The buffer isn't big enough to split.  Give  the  whole
               shebang to the caller and remove it from the free list. */

            __kmp_bget_remove_from_freelist(b);
#if BufStats
            thr->totalloc += (size_t)b->bh.bb.bsize;
            thr->numget++; /* Increment number of bget() calls */
#endif
            /* Negate size to mark buffer allocated. */
            b->bh.bb.bsize = -(b->bh.bb.bsize);

            /* Mark this buffer as owned by this thread. */
            TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
            /* Zero the back pointer in the next buffer in memory
               to indicate that this buffer is allocated. */
            ba->bb.prevfree = 0;

            /* Give user buffer starting at queue links. */
            buf = (void *)&(b->ql);
            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
            return buf;
          }
        }

        /* Link to next buffer */
        b = (use_blink ? b->ql.blink : b->ql.flink);
      }
    }

    /* We failed to find a buffer. If there's a compact function defined,
       notify it of the size requested. If it returns TRUE, try the allocation
       again. */

    if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
      break;
    }
  }

  /* No buffer available with requested size free. */

  /* Don't give up yet -- look in the reserve supply. */
  if (thr->acqfcn != 0) {
    if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
      /* Request is too large to fit in a single expansion block.
         Try to satisfy it by a direct buffer acquisition. */
      bdhead_t *bdh;

      size += sizeof(bdhead_t) - sizeof(bhead_t);

      KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));

      /* richryan */
      bdh = BDH((*thr->acqfcn)((bufsize)size));
      if (bdh != NULL) {

        // Mark the buffer special by setting size field of its header to zero.
        bdh->bh.bb.bsize = 0;

        /* Mark this buffer as owned by this thread. */
        TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
        // because direct buffer never goes to free list
        bdh->bh.bb.prevfree = 0;
        bdh->tsize = size;
#if BufStats
        thr->totalloc += (size_t)size;
        thr->numget++; /* Increment number of bget() calls */
        thr->numdget++; /* Direct bget() call count */
#endif
        buf = (void *)(bdh + 1);
        KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
        return buf;
      }

    } else {

      /*  Try to obtain a new expansion block */
      void *newpool;

      KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));

      /* richryan */
      newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
      KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
      if (newpool != NULL) {
        bpool(th, newpool, thr->exp_incr);
        buf = bget(
            th, requested_size); /* This can't, I say, can't get into a loop. */
        return buf;
      }
    }
  }

  /*  Still no buffer available */

  return NULL;
}

/*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
               the  entire  contents  of  the buffer to zero, not just the
               region requested by the caller. */

static void *bgetz(kmp_info_t *th, bufsize size) {
  char *buf = (char *)bget(th, size);

  if (buf != NULL) {
    bhead_t *b;
    bufsize rsize;

    b = BH(buf - sizeof(bhead_t));
    rsize = -(b->bb.bsize);
    if (rsize == 0) {
      bdhead_t *bd;

      bd = BDH(buf - sizeof(bdhead_t));
      rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
    } else {
      rsize -= sizeof(bhead_t);
    }

    KMP_DEBUG_ASSERT(rsize >= size);

    (void)memset(buf, 0, (bufsize)rsize);
  }
  return ((void *)buf);
}

/*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
               simply in terms of brel()  and  bget().   It  could  be
               enhanced to allow the buffer to grow into adjacent free
               blocks and to avoid moving data unnecessarily.  */

static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
  void *nbuf;
  bufsize osize; /* Old size of buffer */
  bhead_t *b;

  nbuf = bget(th, size);
  if (nbuf == NULL) { /* Acquire new buffer */
    return NULL;
  }
  if (buf == NULL) {
    return nbuf;
  }
  b = BH(((char *)buf) - sizeof(bhead_t));
  osize = -b->bb.bsize;
  if (osize == 0) {
    /*  Buffer acquired directly through acqfcn. */
    bdhead_t *bd;

    bd = BDH(((char *)buf) - sizeof(bdhead_t));
    osize = bd->tsize - (bufsize)sizeof(bdhead_t);
  } else {
    osize -= sizeof(bhead_t);
  }

  KMP_DEBUG_ASSERT(osize > 0);

  (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
                   (size_t)((size < osize) ? size : osize));
  brel(th, buf);

  return nbuf;
}

/*  BREL  --  Release a buffer.  */
static void brel(kmp_info_t *th, void *buf) {
  thr_data_t *thr = get_thr_data(th);
  bfhead_t *b, *bn;
  kmp_info_t *bth;

  KMP_DEBUG_ASSERT(buf != NULL);
  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);

  b = BFH(((char *)buf) - sizeof(bhead_t));

  if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
    bdhead_t *bdh;

    bdh = BDH(((char *)buf) - sizeof(bdhead_t));
    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
#if BufStats
    thr->totalloc -= (size_t)bdh->tsize;
    thr->numdrel++; /* Number of direct releases */
    thr->numrel++; /* Increment number of brel() calls */
#endif /* BufStats */
#ifdef FreeWipe
    (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
#endif /* FreeWipe */

    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));

    KMP_DEBUG_ASSERT(thr->relfcn != 0);
    (*thr->relfcn)((void *)bdh); /* Release it directly. */
    return;
  }

  bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
                       ~1); // clear possible mark before comparison
  if (bth != th) {
    /* Add this buffer to be released by the owning thread later */
    __kmp_bget_enqueue(bth, buf
#ifdef USE_QUEUING_LOCK_FOR_BGET
                       ,
                       __kmp_gtid_from_thread(th)
#endif
    );
    return;
  }

  /* Buffer size must be negative, indicating that the buffer is allocated. */
  if (b->bh.bb.bsize >= 0) {
    bn = NULL;
  }
  KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);

  /*  Back pointer in next buffer must be zero, indicating the same thing: */

  KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);

#if BufStats
  thr->numrel++; /* Increment number of brel() calls */
  thr->totalloc += (size_t)b->bh.bb.bsize;
#endif

  /* If the back link is nonzero, the previous buffer is free.  */

  if (b->bh.bb.prevfree != 0) {
    /* The previous buffer is free. Consolidate this buffer with it by adding
       the length of this buffer to the previous free buffer. Note that we
       subtract the size in the buffer being released, since it's negative to
       indicate that the buffer is allocated. */
    bufsize size = b->bh.bb.bsize;

    /* Make the previous buffer the one we're working on. */
    KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
                     b->bh.bb.prevfree);
    b = BFH(((char *)b) - b->bh.bb.prevfree);
    b->bh.bb.bsize -= size;

    /* unlink the buffer from the old freelist */
    __kmp_bget_remove_from_freelist(b);
  } else {
    /* The previous buffer is allocated. Mark this buffer size as positive
       (i.e. free) and fall through to place the buffer on the free list as an
       isolated free block. */
    b->bh.bb.bsize = -b->bh.bb.bsize;
  }

  /* insert buffer back onto a new freelist */
  __kmp_bget_insert_into_freelist(thr, b);

  /* Now we look at the next buffer in memory, located by advancing from
     the  start  of  this  buffer  by its size, to see if that buffer is
     free.  If it is, we combine  this  buffer  with  the  next  one  in
     memory, dechaining the second buffer from the free list. */
  bn = BFH(((char *)b) + b->bh.bb.bsize);
  if (bn->bh.bb.bsize > 0) {

    /* The buffer is free.  Remove it from the free list and add
       its size to that of our buffer. */
    KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
                     bn->bh.bb.bsize);

    __kmp_bget_remove_from_freelist(bn);

    b->bh.bb.bsize += bn->bh.bb.bsize;

    /* unlink the buffer from the old freelist, and reinsert it into the new
     * freelist */
    __kmp_bget_remove_from_freelist(b);
    __kmp_bget_insert_into_freelist(thr, b);

    /* Finally,  advance  to   the  buffer  that   follows  the  newly
       consolidated free block.  We must set its  backpointer  to  the
       head  of  the  consolidated free block.  We know the next block
       must be an allocated block because the process of recombination
       guarantees  that  two  free  blocks will never be contiguous in
       memory.  */
    bn = BFH(((char *)b) + b->bh.bb.bsize);
  }
#ifdef FreeWipe
  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
               (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
#endif
  KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);

  /* The next buffer is allocated.  Set the backpointer in it  to  point
     to this buffer; the previous free buffer in memory. */

  bn->bh.bb.prevfree = b->bh.bb.bsize;

  /*  If  a  block-release function is defined, and this free buffer
      constitutes the entire block, release it.  Note that  pool_len
      is  defined  in  such a way that the test will fail unless all
      pool blocks are the same size.  */
  if (thr->relfcn != 0 &&
      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
#if BufStats
    if (thr->numpblk !=
        1) { /* Do not release the last buffer until finalization time */
#endif

      KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
                       b->bh.bb.bsize);

      /*  Unlink the buffer from the free list  */
      __kmp_bget_remove_from_freelist(b);

      KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));

      (*thr->relfcn)(b);
#if BufStats
      thr->numprel++; /* Nr of expansion block releases */
      thr->numpblk--; /* Total number of blocks */
      KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);

      // avoid leaving stale last_pool pointer around if it is being dealloced
      if (thr->last_pool == b)
        thr->last_pool = 0;
    } else {
      thr->last_pool = b;
    }
#endif /* BufStats */
  }
}

/*  BECTL  --  Establish automatic pool expansion control  */
static void bectl(kmp_info_t *th, bget_compact_t compact,
                  bget_acquire_t acquire, bget_release_t release,
                  bufsize pool_incr) {
  thr_data_t *thr = get_thr_data(th);

  thr->compfcn = compact;
  thr->acqfcn = acquire;
  thr->relfcn = release;
  thr->exp_incr = pool_incr;
}

/*  BPOOL  --  Add a region of memory to the buffer pool.  */
static void bpool(kmp_info_t *th, void *buf, bufsize len) {
  /*    int bin = 0; */
  thr_data_t *thr = get_thr_data(th);
  bfhead_t *b = BFH(buf);
  bhead_t *bn;

  __kmp_bget_dequeue(th); /* Release any queued buffers */

#ifdef SizeQuant
  len &= ~((bufsize)(SizeQuant - 1));
#endif
  if (thr->pool_len == 0) {
    thr->pool_len = len;
  } else if (len != thr->pool_len) {
    thr->pool_len = -1;
  }
#if BufStats
  thr->numpget++; /* Number of block acquisitions */
  thr->numpblk++; /* Number of blocks total */
  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
#endif /* BufStats */

  /* Since the block is initially occupied by a single free  buffer,
     it  had  better  not  be  (much) larger than the largest buffer
     whose size we can store in bhead.bb.bsize. */
  KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));

  /* Clear  the  backpointer at  the start of the block to indicate that
     there  is  no  free  block  prior  to  this   one.    That   blocks
     recombination when the first block in memory is released. */
  b->bh.bb.prevfree = 0;

  /* Create a dummy allocated buffer at the end of the pool.  This dummy
     buffer is seen when a buffer at the end of the pool is released and
     blocks  recombination  of  the last buffer with the dummy buffer at
     the end.  The length in the dummy buffer  is  set  to  the  largest
     negative  number  to  denote  the  end  of  the pool for diagnostic
     routines (this specific value is  not  counted  on  by  the  actual
     allocation and release functions). */
  len -= sizeof(bhead_t);
  b->bh.bb.bsize = (bufsize)len;
  /* Set the owner of this buffer */
  TCW_PTR(b->bh.bb.bthr,
          (kmp_info_t *)((kmp_uintptr_t)th |
                         1)); // mark the buffer as allocated address

  /* Chain the new block to the free list. */
  __kmp_bget_insert_into_freelist(thr, b);

#ifdef FreeWipe
  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
               (size_t)(len - sizeof(bfhead_t)));
#endif
  bn = BH(((char *)b) + len);
  bn->bb.prevfree = (bufsize)len;
  /* Definition of ESent assumes two's complement! */
  KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));

  bn->bb.bsize = ESent;
}
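
/* Pool layout sketch after bpool() (illustrative): the block becomes one
   free buffer whose bsize covers everything up to a dummy bhead_t at the
   very end; the dummy's bsize == ESent (negative, i.e. "allocated") stops
   coalescing past the end of the block, and its prevfree backlinks to the
   free buffer in front of it. */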

/*  BFREED  --  Dump the free lists for this thread. */
static void bfreed(kmp_info_t *th) {
  int bin = 0, count = 0;
  int gtid = __kmp_gtid_from_thread(th);
  thr_data_t *thr = get_thr_data(th);

#if BufStats
  __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
                       " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
                       " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
                       " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
                       " drel=%" KMP_INT64_SPEC "\n",
                       gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
                       (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
                       (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
                       (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
#endif

  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
    bfhead_t *b;

    for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
         b = b->ql.flink) {
      bufsize bs = b->bh.bb.bsize;

      KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
      KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
      KMP_DEBUG_ASSERT(bs > 0);

      count += 1;

      __kmp_printf_no_lock(
          "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
          (long)bs);
#ifdef FreeWipe
      {
        char *lerr = ((char *)b) + sizeof(bfhead_t);
        if ((bs > sizeof(bfhead_t)) &&
            ((*lerr != 0x55) ||
             (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
              0))) {
          __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
                               "free block have been overstored.)\n",
                               gtid);
        }
      }
#endif
    }
  }

  if (count == 0)
    __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
}

void __kmp_initialize_bget(kmp_info_t *th) {
  KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));

  set_thr_data(th);

  bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
        (bufsize)__kmp_malloc_pool_incr);
}

void __kmp_finalize_bget(kmp_info_t *th) {
  thr_data_t *thr;
  bfhead_t *b;

  KMP_DEBUG_ASSERT(th != 0);

#if BufStats
  thr = (thr_data_t *)th->th.th_local.bget_data;
  KMP_DEBUG_ASSERT(thr != NULL);
  b = thr->last_pool;

  /*  If a block-release function is defined, and this free buffer constitutes
      the entire block, release it. Note that pool_len is defined in such a way
      that the test will fail unless all pool blocks are the same size.  */

  // Deallocate the last pool if one exists because we no longer do it in brel()
  if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
                     b->bh.bb.bsize);

    /*  Unlink the buffer from the free list  */
    __kmp_bget_remove_from_freelist(b);

    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));

    (*thr->relfcn)(b);
    thr->numprel++; /* Nr of expansion block releases */
    thr->numpblk--; /* Total number of blocks */
    KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
  }
#endif /* BufStats */

  /* Deallocate bget_data */
  if (th->th.th_local.bget_data != NULL) {
    __kmp_free(th->th.th_local.bget_data);
    th->th.th_local.bget_data = NULL;
  }
}

void kmpc_set_poolsize(size_t size) {
  bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
        (bget_release_t)free, (bufsize)size);
}

size_t kmpc_get_poolsize(void) {
  thr_data_t *p;

  p = get_thr_data(__kmp_get_thread());

  return p->exp_incr;
}

void kmpc_set_poolmode(int mode) {
  thr_data_t *p;

  if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
      mode == bget_mode_best) {
    p = get_thr_data(__kmp_get_thread());
    p->mode = (bget_mode_t)mode;
  }
}

int kmpc_get_poolmode(void) {
  thr_data_t *p;

  p = get_thr_data(__kmp_get_thread());

  return p->mode;
}

void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
  kmp_info_t *th = __kmp_get_thread();
  bufsize a, b;

  __kmp_bget_dequeue(th); /* Release any queued buffers */

  bcheck(th, &a, &b);

  *maxmem = a;
  *allmem = b;
}

void kmpc_poolprint(void) {
  kmp_info_t *th = __kmp_get_thread();

  __kmp_bget_dequeue(th); /* Release any queued buffers */

  bfreed(th);
}

#endif // #if KMP_USE_BGET

void *kmpc_malloc(size_t size) {
  void *ptr;
  ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
  if (ptr != NULL) {
    // save allocated pointer just before one returned to user
    *(void **)ptr = ptr;
    ptr = (void **)ptr + 1;
  }
  return ptr;
}
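
/* Illustrative layout for kmpc_malloc()/kmpc_free(): the block obtained from
   bget() is one word larger than the user asked for; that word stores the
   block's own address and sits immediately below the pointer handed out:

     bget() result      user pointer
          |                  |
          v                  v
          +----------------+-----------------+
          | saved pointer  | user data ...   |
          +----------------+-----------------+

   kmpc_free() recovers the original block with *((void **)ptr - 1). */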

#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)

void *kmpc_aligned_malloc(size_t size, size_t alignment) {
  void *ptr;
  void *ptr_allocated;
  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
  if (!IS_POWER_OF_TWO(alignment)) {
    // AC: do we need to issue a warning here?
    errno = EINVAL;
    return NULL;
  }
  size = size + sizeof(void *) + alignment;
  ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
  if (ptr_allocated != NULL) {
    // save allocated pointer just before one returned to user
    ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
                   ~(alignment - 1));
    *((void **)ptr - 1) = ptr_allocated;
  } else {
    ptr = NULL;
  }
  return ptr;
}
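
/* Worked example (illustrative): suppose bget() returns ptr_allocated ==
   0x1008 and alignment == 64. Then (0x1008 + 8 + 64) & ~63 == 0x1040, which
   is 64-byte aligned and leaves at least one word below it (at 0x1038) to
   stash ptr_allocated for kmpc_free(). */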

void *kmpc_calloc(size_t nelem, size_t elsize) {
  void *ptr;
  ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
  if (ptr != NULL) {
    // save allocated pointer just before one returned to user
    *(void **)ptr = ptr;
    ptr = (void **)ptr + 1;
  }
  return ptr;
}

void *kmpc_realloc(void *ptr, size_t size) {
  void *result = NULL;
  if (ptr == NULL) {
    // If pointer is NULL, realloc behaves like malloc.
    result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
    // save allocated pointer just before one returned to user
    if (result != NULL) {
      *(void **)result = result;
      result = (void **)result + 1;
    }
  } else if (size == 0) {
    // If size is 0, realloc behaves like free. The thread must have been
    // registered by an earlier kmpc_malloc() or kmpc_calloc() call, so it is
    // safe to call __kmp_get_thread() here instead of __kmp_entry_thread().
    KMP_ASSERT(*((void **)ptr - 1));
    brel(__kmp_get_thread(), *((void **)ptr - 1));
  } else {
    result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
                   (bufsize)(size + sizeof(ptr)));
    if (result != NULL) {
      *(void **)result = result;
      result = (void **)result + 1;
    }
  }
  return result;
}

// NOTE: the library must have already been initialized by a previous allocate
void kmpc_free(void *ptr) {
  if (!__kmp_init_serial) {
    return;
  }
  if (ptr != NULL) {
    kmp_info_t *th = __kmp_get_thread();
    __kmp_bget_dequeue(th); /* Release any queued buffers */
    // extract allocated pointer and free it
    KMP_ASSERT(*((void **)ptr - 1));
    brel(th, *((void **)ptr - 1));
  }
}

void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
  void *ptr;
  KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
                (int)size KMP_SRC_LOC_PARM));
  ptr = bget(th, (bufsize)size);
  KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
  return ptr;
}

void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
                           size_t elsize KMP_SRC_LOC_DECL) {
  void *ptr;
  KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
                (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
  ptr = bgetz(th, (bufsize)(nelem * elsize));
  KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
  return ptr;
}

void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
                            size_t size KMP_SRC_LOC_DECL) {
  KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
                ptr, (int)size KMP_SRC_LOC_PARM));
  ptr = bgetr(th, ptr, (bufsize)size);
  KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
  return ptr;
}

void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
  KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
                ptr KMP_SRC_LOC_PARM));
  if (ptr != NULL) {
    __kmp_bget_dequeue(th); /* Release any queued buffers */
    brel(th, ptr);
  }
  KE_TRACE(30, ("<- __kmp_thread_free()\n"));
}

/* OMP 5.0 Memory Management support */
static const char *kmp_mk_lib_name;
static void *h_memkind;
/* memkind experimental API: */
// memkind_alloc
static void *(*kmp_mk_alloc)(void *k, size_t sz);
// memkind_free
static void (*kmp_mk_free)(void *kind, void *ptr);
// memkind_check_available
static int (*kmp_mk_check)(void *kind);
// kinds we are going to use
static void **mk_default;
static void **mk_interleave;
static void **mk_hbw;
static void **mk_hbw_interleave;
static void **mk_hbw_preferred;
static void **mk_hugetlb;
static void **mk_hbw_hugetlb;
static void **mk_hbw_preferred_hugetlb;
static void **mk_dax_kmem;
static void **mk_dax_kmem_all;
static void **mk_dax_kmem_preferred;
static void *(*kmp_target_alloc_host)(size_t size, int device);
static void *(*kmp_target_alloc_shared)(size_t size, int device);
static void *(*kmp_target_alloc_device)(size_t size, int device);
static void *(*kmp_target_lock_mem)(void *ptr, size_t size, int device);
static void *(*kmp_target_unlock_mem)(void *ptr, int device);
static void *(*kmp_target_free_host)(void *ptr, int device);
static void *(*kmp_target_free_shared)(void *ptr, int device);
static void *(*kmp_target_free_device)(void *ptr, int device);
static bool __kmp_target_mem_available;

#define KMP_IS_TARGET_MEM_SPACE(MS)                                            \
  (MS == llvm_omp_target_host_mem_space ||                                     \
   MS == llvm_omp_target_shared_mem_space ||                                   \
   MS == llvm_omp_target_device_mem_space)

#define KMP_IS_TARGET_MEM_ALLOC(MA)                                            \
  (MA == llvm_omp_target_host_mem_alloc ||                                     \
   MA == llvm_omp_target_shared_mem_alloc ||                                   \
   MA == llvm_omp_target_device_mem_alloc)

#define KMP_IS_PREDEF_MEM_SPACE(MS)                                            \
  (MS == omp_null_mem_space || MS == omp_default_mem_space ||                  \
   MS == omp_large_cap_mem_space || MS == omp_const_mem_space ||               \
   MS == omp_high_bw_mem_space || MS == omp_low_lat_mem_space ||               \
   KMP_IS_TARGET_MEM_SPACE(MS))

/// Support OMP 6.0 target memory management
/// Expected offload runtime entries.
///
/// Returns the number of resources and a list of unique resource IDs in
/// "resources". The runtime needs to invoke this twice: first to get the
/// number of resources, then, after allocating space for the resource IDs,
/// to let the offload runtime write the resource IDs into "resources".
/// int __tgt_get_mem_resources(int num_devices, const int *devices,
///                             int host_access, omp_memspace_handle_t memspace,
///                             int *resources);
///
/// Redirects omp_alloc call to offload runtime.
/// void *__tgt_omp_alloc(size_t size, omp_allocator_handle_t allocator);
///
/// Redirects omp_free call to offload runtime.
/// void __tgt_omp_free(void *ptr, omp_allocator_handle_t);
class kmp_tgt_allocator_t {
  bool supported = false;
  using get_mem_resources_t = int (*)(int, const int *, int,
                                      omp_memspace_handle_t, int *);
  using omp_alloc_t = void *(*)(size_t, omp_allocator_handle_t);
  using omp_free_t = void (*)(void *, omp_allocator_handle_t);
  get_mem_resources_t tgt_get_mem_resources = nullptr;
  omp_alloc_t tgt_omp_alloc = nullptr;
  omp_free_t tgt_omp_free = nullptr;

public:
  /// Initialize interface with offload runtime
  void init() {
    tgt_get_mem_resources =
        (get_mem_resources_t)KMP_DLSYM("__tgt_get_mem_resources");
    tgt_omp_alloc = (omp_alloc_t)KMP_DLSYM("__tgt_omp_alloc");
    tgt_omp_free = (omp_free_t)KMP_DLSYM("__tgt_omp_free");
    supported = tgt_get_mem_resources && tgt_omp_alloc && tgt_omp_free;
  }
  /// Obtain resource information from offload runtime. We assume offload
  /// runtime backends maintain a list of unique resource IDs.
  int get_mem_resources(int ndevs, const int *devs, int host,
                        omp_memspace_handle_t memspace, int *resources) {
    if (supported)
      return tgt_get_mem_resources(ndevs, devs, host, memspace, resources);
    return 0;
  }
  /// Invoke offload runtime's memory allocation routine
  void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
    if (supported)
      return tgt_omp_alloc(size, allocator);
    return nullptr;
  }
  /// Invoke offload runtime's memory deallocation routine
  void omp_free(void *ptr, omp_allocator_handle_t allocator) {
    if (supported)
      tgt_omp_free(ptr, allocator);
  }
} __kmp_tgt_allocator;

extern "C" int omp_get_num_devices(void);

/// Maintain a list of target memory spaces that are identified with the
/// requested information. There will be only one unique memory space object
/// that matches the input.
class kmp_tgt_memspace_list_t {
  kmp_memspace_t *memspace_list = nullptr;
  KMP_LOCK_INIT(mtx);
  /// Find memory space that matches the provided input
  kmp_memspace_t *find(int num_resources, const int *resources,
                       omp_memspace_handle_t memspace) {
    kmp_memspace_t *ms = memspace_list;
    while (ms) {
      if (ms->num_resources == num_resources && ms->memspace == memspace &&
          !memcmp(ms->resources, resources, sizeof(int) * num_resources))
        break;
      ms = ms->next;
    }
    return ms;
  }
  /// Return memory space for the provided input. It tries to find an existing
  /// memory space that exactly matches the provided input, or creates one if
  /// not found.
  omp_memspace_handle_t get(int num_resources, const int *resources,
                            omp_memspace_handle_t memspace) {
    int gtid = __kmp_entry_gtid();
    __kmp_acquire_lock(&mtx, gtid);
    // Sort absolute IDs in the resource list
    int *sorted_resources = (int *)__kmp_allocate(sizeof(int) * num_resources);
    KMP_MEMCPY(sorted_resources, resources, num_resources * sizeof(int));
    qsort(sorted_resources, (size_t)num_resources, sizeof(int),
          [](const void *a, const void *b) {
            const int val_a = *(const int *)a;
            const int val_b = *(const int *)b;
            return (val_a > val_b) ? 1 : ((val_a < val_b) ? -1 : 0);
          });
    kmp_memspace_t *ms = find(num_resources, sorted_resources, memspace);
    if (ms) {
      __kmp_free(sorted_resources);
      __kmp_release_lock(&mtx, gtid);
      return ms;
    }
    ms = (kmp_memspace_t *)__kmp_allocate(sizeof(kmp_memspace_t));
    ms->memspace = memspace;
    ms->num_resources = num_resources;
    ms->resources = sorted_resources;
    ms->next = memspace_list;
    memspace_list = ms;
    __kmp_release_lock(&mtx, gtid);
    return ms;
  }

public:
  /// Initialize memory space list
  void init() { __kmp_init_lock(&mtx); }
  /// Release resources for the memory space list
  void fini() {
    kmp_memspace_t *ms = memspace_list;
    while (ms) {
      if (ms->resources)
        __kmp_free(ms->resources);
      kmp_memspace_t *tmp = ms;
      ms = ms->next;
      __kmp_free(tmp);
    }
    __kmp_destroy_lock(&mtx);
  }
  /// Return memory space for the provided input
  omp_memspace_handle_t get_memspace(int num_devices, const int *devices,
                                     int host_access,
                                     omp_memspace_handle_t memspace) {
    int actual_num_devices = num_devices;
    int *actual_devices = const_cast<int *>(devices);
    if (actual_num_devices == 0) {
      actual_num_devices = omp_get_num_devices();
      if (actual_num_devices <= 0)
        return omp_null_mem_space;
    }
    if (actual_devices == NULL) {
      // Prepare list of all devices in this case.
      actual_devices = (int *)__kmp_allocate(sizeof(int) * actual_num_devices);
      for (int i = 0; i < actual_num_devices; i++)
        actual_devices[i] = i;
    }
    // Get the number of available resources first
    int num_resources = __kmp_tgt_allocator.get_mem_resources(
        actual_num_devices, actual_devices, host_access, memspace, NULL);
    if (num_resources <= 0) {
      // No available resources; release the device list if we allocated it.
      if (!devices && actual_devices)
        __kmp_free(actual_devices);
      return omp_null_mem_space;
    }

    omp_memspace_handle_t ms = omp_null_mem_space;
    if (num_resources > 0) {
      int *resources = (int *)__kmp_allocate(sizeof(int) * num_resources);
      // Let offload runtime write the resource IDs
      num_resources = __kmp_tgt_allocator.get_mem_resources(
          actual_num_devices, actual_devices, host_access, memspace, resources);
      ms = get(num_resources, resources, memspace);
      __kmp_free(resources);
    }
    if (!devices && actual_devices)
      __kmp_free(actual_devices);
    return ms;
  }
  /// Return sub memory space from the parent memory space
  omp_memspace_handle_t get_memspace(int num_resources, const int *resources,
                                     omp_memspace_handle_t parent) {
    kmp_memspace_t *ms = (kmp_memspace_t *)parent;
    return get(num_resources, resources, ms->memspace);
  }
} __kmp_tgt_memspace_list;
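
/* Usage sketch (illustrative, assuming an offload runtime that provides the
   __tgt_* entry points above): a call like
     __kmp_tgt_memspace_list.get_memspace(0, NULL, 0, omp_default_mem_space)
   expands to "all devices", asks the offload runtime once for the resource
   count and a second time for the resource IDs, and then returns the single
   cached kmp_memspace_t that matches those sorted IDs. */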
1451 
1452 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
chk_kind(void *** pkind)1453 static inline void chk_kind(void ***pkind) {
1454   KMP_DEBUG_ASSERT(pkind);
1455   if (*pkind) // symbol found
1456     if (kmp_mk_check(**pkind)) // kind not available or error
1457       *pkind = NULL;
1458 }
1459 #endif
1460 
__kmp_init_memkind()1461 void __kmp_init_memkind() {
1462 // as of 2018-07-31 memkind does not support Windows*, exclude it for now
1463 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
1464   // use of statically linked memkind is problematic, as it depends on libnuma
1465   kmp_mk_lib_name = "libmemkind.so";
1466   h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY);
1467   if (h_memkind) {
1468     kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available");
1469     kmp_mk_alloc =
1470         (void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc");
1471     kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free");
1472     mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT");
1473     if (kmp_mk_check && kmp_mk_alloc && kmp_mk_free && mk_default &&
1474         !kmp_mk_check(*mk_default)) {
1475       __kmp_memkind_available = 1;
1476       mk_interleave = (void **)dlsym(h_memkind, "MEMKIND_INTERLEAVE");
1477       chk_kind(&mk_interleave);
1478       mk_hbw = (void **)dlsym(h_memkind, "MEMKIND_HBW");
1479       chk_kind(&mk_hbw);
1480       mk_hbw_interleave = (void **)dlsym(h_memkind, "MEMKIND_HBW_INTERLEAVE");
1481       chk_kind(&mk_hbw_interleave);
1482       mk_hbw_preferred = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED");
1483       chk_kind(&mk_hbw_preferred);
1484       mk_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HUGETLB");
1485       chk_kind(&mk_hugetlb);
1486       mk_hbw_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_HUGETLB");
1487       chk_kind(&mk_hbw_hugetlb);
1488       mk_hbw_preferred_hugetlb =
1489           (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB");
1490       chk_kind(&mk_hbw_preferred_hugetlb);
1491       mk_dax_kmem = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM");
1492       chk_kind(&mk_dax_kmem);
1493       mk_dax_kmem_all = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_ALL");
1494       chk_kind(&mk_dax_kmem_all);
1495       mk_dax_kmem_preferred =
1496           (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_PREFERRED");
1497       chk_kind(&mk_dax_kmem_preferred);
1498       KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n"));
1499       return; // success
1500     }
1501     dlclose(h_memkind); // failure
1502   }
1503 #else // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN)
1504   kmp_mk_lib_name = "";
1505 #endif // KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
1506   h_memkind = NULL;
1507   kmp_mk_check = NULL;
1508   kmp_mk_alloc = NULL;
1509   kmp_mk_free = NULL;
1510   mk_default = NULL;
1511   mk_interleave = NULL;
1512   mk_hbw = NULL;
1513   mk_hbw_interleave = NULL;
1514   mk_hbw_preferred = NULL;
1515   mk_hugetlb = NULL;
1516   mk_hbw_hugetlb = NULL;
1517   mk_hbw_preferred_hugetlb = NULL;
1518   mk_dax_kmem = NULL;
1519   mk_dax_kmem_all = NULL;
1520   mk_dax_kmem_preferred = NULL;
1521 }
1522 
1523 void __kmp_fini_memkind() {
1524 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
1525   if (__kmp_memkind_available)
1526     KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n"));
1527   if (h_memkind) {
1528     dlclose(h_memkind);
1529     h_memkind = NULL;
1530   }
1531   kmp_mk_check = NULL;
1532   kmp_mk_alloc = NULL;
1533   kmp_mk_free = NULL;
1534   mk_default = NULL;
1535   mk_interleave = NULL;
1536   mk_hbw = NULL;
1537   mk_hbw_interleave = NULL;
1538   mk_hbw_preferred = NULL;
1539   mk_hugetlb = NULL;
1540   mk_hbw_hugetlb = NULL;
1541   mk_hbw_preferred_hugetlb = NULL;
1542   mk_dax_kmem = NULL;
1543   mk_dax_kmem_all = NULL;
1544   mk_dax_kmem_preferred = NULL;
1545 #endif
1546 }
1547 
1548 #if KMP_HWLOC_ENABLED
1549 static bool __kmp_is_hwloc_membind_supported(hwloc_membind_policy_t policy) {
1550 #if HWLOC_API_VERSION >= 0x00020300
1551   const hwloc_topology_support *support;
1552   support = hwloc_topology_get_support(__kmp_hwloc_topology);
1553   if (support) {
1554     if (policy == HWLOC_MEMBIND_BIND)
1555       return (support->membind->alloc_membind &&
1556               support->membind->bind_membind);
1557     if (policy == HWLOC_MEMBIND_INTERLEAVE)
1558       return (support->membind->alloc_membind &&
1559               support->membind->interleave_membind);
1560   }
1561   return false;
1562 #else
1563   return false;
1564 #endif // HWLOC_API_VERSION >= 0x00020300
1565 }
1566 
1567 void *__kmp_hwloc_alloc_membind(hwloc_memattr_id_e attr, size_t size,
1568                                 hwloc_membind_policy_t policy) {
1569 #if HWLOC_API_VERSION >= 0x00020300
1570   void *ptr = NULL;
1571   hwloc_obj_t node;
1572   struct hwloc_location initiator;
1573   int ret;
1574   // TODO: We should make this more efficient by getting rid of the OS syscall
1575   // 'hwloc_bitmap_alloc' and 'hwloc_get_cpubind' to get affinity and instead
1576   // use th_affin_mask field when it's capable of getting the underlying
1577   // mask implementation.
1578   hwloc_cpuset_t mask = hwloc_bitmap_alloc();
1579   ret = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
1580   if (ret < 0) {
1581     hwloc_bitmap_free(mask);
1582     return ptr;
1583   }
1584   initiator.type = KMP_HWLOC_LOCATION_TYPE_CPUSET;
1585   initiator.location.cpuset = mask;
1586   ret = hwloc_memattr_get_best_target(__kmp_hwloc_topology, attr, &initiator, 0,
1587                                       &node, NULL);
  if (ret < 0) {
    hwloc_bitmap_free(mask); // avoid leaking the cpuset on failure
    return ptr;
  }
  hwloc_bitmap_free(mask); // the initiator cpuset is no longer needed
1591   return hwloc_alloc_membind(__kmp_hwloc_topology, size, node->nodeset, policy,
1592                              HWLOC_MEMBIND_BYNODESET);
1593 #else
1594   return NULL;
1595 #endif
1596 }
1597 
1598 void *__kmp_hwloc_membind_policy(omp_memspace_handle_t ms, size_t size,
1599                                  hwloc_membind_policy_t policy) {
1600 #if HWLOC_API_VERSION >= 0x00020300
1601   void *ptr = NULL;
1602   if (ms == omp_high_bw_mem_space) {
1603     ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH, size, policy);
1604   } else if (ms == omp_large_cap_mem_space) {
1605     ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY, size, policy);
1606   } else {
1607     ptr = hwloc_alloc(__kmp_hwloc_topology, size);
1608   }
1609   return ptr;
1610 #else
1611   return NULL;
1612 #endif
1613 }
1614 #endif // KMP_HWLOC_ENABLED
1615 
1616 void __kmp_init_target_mem() {
1617   *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");
1618   *(void **)(&kmp_target_alloc_shared) =
1619       KMP_DLSYM("llvm_omp_target_alloc_shared");
1620   *(void **)(&kmp_target_alloc_device) =
1621       KMP_DLSYM("llvm_omp_target_alloc_device");
1622   *(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host");
1623   *(void **)(&kmp_target_free_shared) =
1624       KMP_DLSYM("llvm_omp_target_free_shared");
1625   *(void **)(&kmp_target_free_device) =
1626       KMP_DLSYM("llvm_omp_target_free_device");
1627   __kmp_target_mem_available =
1628       kmp_target_alloc_host && kmp_target_alloc_shared &&
1629       kmp_target_alloc_device && kmp_target_free_host &&
1630       kmp_target_free_shared && kmp_target_free_device;
1631   // lock/pin and unlock/unpin target calls
1632   *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
1633   *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
1634   __kmp_tgt_allocator.init();
1635   __kmp_tgt_memspace_list.init();
1636 }
1637 
1638 /// Finalize target memory support
1639 void __kmp_fini_target_mem() { __kmp_tgt_memspace_list.fini(); }
1640 
1641 omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
1642                                              int ntraits,
1643                                              omp_alloctrait_t traits[]) {
1644   kmp_allocator_t *al;
1645   int i;
1646   al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
1647   al->memspace = ms; // consulted below for memspace-specific paths
1648 
1649   // Assign default values if applicable
1650   al->alignment = 1;
1651   al->pinned = false;
1652   al->partition = omp_atv_environment;
1653   al->pin_device = -1;
1654   al->preferred_device = -1;
1655   al->target_access = omp_atv_single;
1656   al->atomic_scope = omp_atv_device;
1657 
1658   for (i = 0; i < ntraits; ++i) {
1659     switch (traits[i].key) {
1660     case omp_atk_sync_hint:
1661     case omp_atk_access:
1662       break;
1663     case omp_atk_pinned:
1664       al->pinned = true;
1665       break;
1666     case omp_atk_alignment:
1667       __kmp_type_convert(traits[i].value, &(al->alignment));
1668       KMP_ASSERT(IS_POWER_OF_TWO(al->alignment));
1669       break;
1670     case omp_atk_pool_size:
1671       al->pool_size = traits[i].value;
1672       break;
1673     case omp_atk_fallback:
1674       al->fb = (omp_alloctrait_value_t)traits[i].value;
1675       KMP_DEBUG_ASSERT(
1676           al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb ||
1677           al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb);
1678       break;
1679     case omp_atk_fb_data:
1680       al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);
1681       break;
1682     case omp_atk_partition:
1683 #if KMP_HWLOC_ENABLED
1684       al->membind = (omp_alloctrait_value_t)traits[i].value;
1685       KMP_DEBUG_ASSERT(al->membind == omp_atv_environment ||
1686                        al->membind == omp_atv_nearest ||
1687                        al->membind == omp_atv_blocked ||
1688                        al->membind == omp_atv_interleaved);
1689 #endif // KMP_HWLOC_ENABLED
1690       al->memkind = RCAST(void **, traits[i].value);
1691       break;
1692     case omp_atk_pin_device:
1693       __kmp_type_convert(traits[i].value, &(al->pin_device));
1694       break;
1695     case omp_atk_preferred_device:
1696       __kmp_type_convert(traits[i].value, &(al->preferred_device));
1697       break;
1698     case omp_atk_target_access:
1699       al->target_access = (omp_alloctrait_value_t)traits[i].value;
1700       break;
1701     case omp_atk_atomic_scope:
1702       al->atomic_scope = (omp_alloctrait_value_t)traits[i].value;
1703       break;
1704     case omp_atk_part_size:
1705       __kmp_type_convert(traits[i].value, &(al->part_size));
1706       break;
1707     default:
1708       KMP_ASSERT2(0, "Unexpected allocator trait");
1709     }
1710   }
1711 
1712   if (al->memspace > kmp_max_mem_space) {
1713     // Memory space has been allocated for targets.
1714     return (omp_allocator_handle_t)al;
1715   }
1716 
1717   KMP_DEBUG_ASSERT(KMP_IS_PREDEF_MEM_SPACE(al->memspace));
1718 
1719   if (al->fb == 0) {
1720     // set default allocator
1721     al->fb = omp_atv_default_mem_fb;
1722     al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1723   } else if (al->fb == omp_atv_allocator_fb) {
1724     KMP_ASSERT(al->fb_data != NULL);
1725   } else if (al->fb == omp_atv_default_mem_fb) {
1726     al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1727   }
1728   if (__kmp_memkind_available) {
1729     // Let's use memkind library if available
1730     if (ms == omp_high_bw_mem_space) {
1731       if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) {
1732         al->memkind = mk_hbw_interleave;
1733       } else if (mk_hbw_preferred) {
1734         // AC: do not try to use MEMKIND_HBW for now, because memkind library
1735         // cannot reliably detect exhaustion of HBW memory.
1736         // It could be possible using hbw_verify_memory_region() but memkind
1737         // manual says: "Using this function in production code may result in
1738         // serious performance penalty".
1739         al->memkind = mk_hbw_preferred;
1740       } else {
1741         // HBW is requested but not available --> return NULL allocator
1742         __kmp_free(al);
1743         return omp_null_allocator;
1744       }
1745     } else if (ms == omp_large_cap_mem_space) {
1746       if (mk_dax_kmem_all) {
1747         // All pmem nodes are visited
1748         al->memkind = mk_dax_kmem_all;
1749       } else if (mk_dax_kmem) {
1750         // Only closest pmem node is visited
1751         al->memkind = mk_dax_kmem;
1752       } else {
1753         __kmp_free(al);
1754         return omp_null_allocator;
1755       }
1756     } else {
1757       if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) {
1758         al->memkind = mk_interleave;
1759       } else {
1760         al->memkind = mk_default;
1761       }
1762     }
1763   } else if (KMP_IS_TARGET_MEM_SPACE(ms) && !__kmp_target_mem_available) {
1764     __kmp_free(al);
1765     return omp_null_allocator;
1766   } else {
1767     if (!__kmp_hwloc_available &&
1768         (ms == omp_high_bw_mem_space || ms == omp_large_cap_mem_space)) {
1769       // cannot detect HBW or large-capacity memory without hwloc or memkind
1770       __kmp_free(al);
1771       return omp_null_allocator;
1772     }
1773   }
1774   return (omp_allocator_handle_t)al;
1775 }
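// A minimal usage sketch (illustrative; not part of this file): user code
// reaches __kmpc_init_allocator through the standard OpenMP 5.x allocator
// API, for example:
//
//   omp_alloctrait_t traits[] = {{omp_atk_alignment, 64},
//                                {omp_atk_pool_size, 1 << 20},
//                                {omp_atk_fallback, omp_atv_null_fb}};
//   omp_allocator_handle_t a =
//       omp_init_allocator(omp_default_mem_space, 3, traits);
//   void *p = omp_alloc(1024, a); // 64-byte aligned; NULL once pool exhausted
//   omp_free(p, a);
//   omp_destroy_allocator(a);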
1776 
1777 void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) {
1778   if (allocator > kmp_max_mem_alloc)
1779     __kmp_free(allocator);
1780 }
1781 
1782 void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) {
1783   if (allocator == omp_null_allocator)
1784     allocator = omp_default_mem_alloc;
1785   __kmp_threads[gtid]->th.th_def_allocator = allocator;
1786 }
1787 
1788 omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) {
1789   return __kmp_threads[gtid]->th.th_def_allocator;
1790 }
1791 
1792 omp_memspace_handle_t __kmp_get_devices_memspace(int ndevs, const int *devs,
1793                                                  omp_memspace_handle_t memspace,
1794                                                  int host) {
1795   if (!__kmp_init_serial)
1796     __kmp_serial_initialize();
1797   // Only accept valid device description and predefined memory space
1798   if (ndevs < 0 || (ndevs > 0 && !devs) || memspace > kmp_max_mem_space)
1799     return omp_null_mem_space;
1800 
1801   return __kmp_tgt_memspace_list.get_memspace(ndevs, devs, host, memspace);
1802 }
1803 
1804 omp_allocator_handle_t
1805 __kmp_get_devices_allocator(int ndevs, const int *devs,
1806                             omp_memspace_handle_t memspace, int host) {
1807   if (!__kmp_init_serial)
1808     __kmp_serial_initialize();
1809   // Only accept valid device description and predefined memory space
1810   if (ndevs < 0 || (ndevs > 0 && !devs) || memspace > kmp_max_mem_space)
1811     return omp_null_allocator;
1812 
1813   omp_memspace_handle_t mspace =
1814       __kmp_get_devices_memspace(ndevs, devs, memspace, host);
1815   if (mspace == omp_null_mem_space)
1816     return omp_null_allocator;
1817 
1818   return __kmpc_init_allocator(__kmp_entry_gtid(), mspace, 0, NULL);
1819 }
1820 
1821 int __kmp_get_memspace_num_resources(omp_memspace_handle_t memspace) {
1822   if (!__kmp_init_serial)
1823     __kmp_serial_initialize();
1824   if (memspace == omp_null_mem_space)
1825     return 0;
1826   if (memspace < kmp_max_mem_space)
1827     return 1; // return 1 for predefined memory space
1828   kmp_memspace_t *ms = (kmp_memspace_t *)memspace;
1829   return ms->num_resources;
1830 }
1831 
1832 omp_memspace_handle_t __kmp_get_submemspace(omp_memspace_handle_t memspace,
1833                                             int num_resources, int *resources) {
1834   if (!__kmp_init_serial)
1835     __kmp_serial_initialize();
1836   if (memspace == omp_null_mem_space || memspace < kmp_max_mem_space)
1837     return memspace; // return input memory space for predefined memory space
1838   kmp_memspace_t *ms = (kmp_memspace_t *)memspace;
1839   if (num_resources == 0 || ms->num_resources < num_resources || !resources)
1840     return omp_null_mem_space; // input memory space cannot satisfy the request
1841 
1842   // The stored resource IDs are absolute IDs known only to the offload
1843   // backend; the returned memory space preserves this property.
1844   int *resources_abs = (int *)__kmp_allocate(sizeof(int) * num_resources);
1845 
1846   // Collect absolute resource ID from the relative ID
1847   for (int i = 0; i < num_resources; i++)
1848     resources_abs[i] = ms->resources[resources[i]];
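  // Example (illustrative): if ms->resources == {7, 9, 12} and the caller
  // passes resources == {0, 2}, the sub memory space stores the absolute
  // IDs {7, 12}.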
1849 
1850   omp_memspace_handle_t submemspace = __kmp_tgt_memspace_list.get_memspace(
1851       num_resources, resources_abs, memspace);
1852   __kmp_free(resources_abs);
1853 
1854   return submemspace;
1855 }
1856 
1857 typedef struct kmp_mem_desc { // Memory block descriptor
1858   void *ptr_alloc; // Pointer returned by allocator
1859   size_t size_a; // Size of allocated memory block (initial+descriptor+align)
1860   size_t size_orig; // Original size requested
1861   void *ptr_align; // Pointer to aligned memory, returned
1862   kmp_allocator_t *allocator; // allocator
1863 } kmp_mem_desc_t;
1864 constexpr size_t alignment = SizeQuant;
1865 
1866 // external interfaces are wrappers over internal implementation
1867 void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
1868   KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
1869   void *ptr = __kmp_alloc(gtid, 0, size, allocator);
1870   KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid));
1871   return ptr;
1872 }
1873 
1874 void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size,
1875                            omp_allocator_handle_t allocator) {
1876   KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn,
1877                 (int)size, allocator));
1878   void *ptr = __kmp_alloc(gtid, algn, size, allocator);
1879   KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid));
1880   return ptr;
1881 }
1882 
1883 void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
1884                     omp_allocator_handle_t allocator) {
1885   KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb,
1886                 (int)size, allocator));
1887   void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator);
1888   KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid));
1889   return ptr;
1890 }
1891 
1892 void *__kmpc_realloc(int gtid, void *ptr, size_t size,
1893                      omp_allocator_handle_t allocator,
1894                      omp_allocator_handle_t free_allocator) {
1895   KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size,
1896                 allocator, free_allocator));
1897   void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator);
1898   KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid));
1899   return nptr;
1900 }
1901 
1902 void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
1903   KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
1904   ___kmpc_free(gtid, ptr, allocator);
1905   KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator));
1906   return;
1907 }
1908 
1909 // internal implementation, called from inside the library
1910 void *__kmp_alloc(int gtid, size_t algn, size_t size,
1911                   omp_allocator_handle_t allocator) {
1912   void *ptr = NULL;
1913   kmp_allocator_t *al;
1914   KMP_DEBUG_ASSERT(__kmp_init_serial);
1915   if (size == 0)
1916     return NULL;
1917   if (allocator == omp_null_allocator)
1918     allocator = __kmp_threads[gtid]->th.th_def_allocator;
1919   kmp_int32 default_device =
1920       __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1921 
1922   al = RCAST(kmp_allocator_t *, allocator);
1923 
1924   int sz_desc = sizeof(kmp_mem_desc_t);
1925   kmp_mem_desc_t desc;
1926   kmp_uintptr_t addr; // address returned by allocator
1927   kmp_uintptr_t addr_align; // address to return to caller
1928   kmp_uintptr_t addr_descr; // address of memory block descriptor
1929   size_t align = alignment; // default alignment
1930   if (allocator > kmp_max_mem_alloc && al->alignment > align)
1931     align = al->alignment; // alignment required by allocator trait
1932   if (align < algn)
1933     align = algn; // max of default alignment, allocator trait and parameter
1934   desc.size_orig = size;
1935   desc.size_a = size + sz_desc + align;
1936   bool is_pinned = false;
1937   if (allocator > kmp_max_mem_alloc)
1938     is_pinned = al->pinned;
1939 
1940   // Use default allocator if hwloc and libmemkind are not available
1941   int use_default_allocator =
1942       (!__kmp_hwloc_available && !__kmp_memkind_available);
1943 
1944   if (al > kmp_max_mem_alloc && al->memspace > kmp_max_mem_space) {
1945     // Memspace has been allocated for targets.
1946     return __kmp_tgt_allocator.omp_alloc(size, allocator);
1947   }
1948 
1949   if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
1950     // Use size input directly as the memory may not be accessible on host.
1951     // Use default device for now.
1952     if (__kmp_target_mem_available) {
1953       kmp_int32 device =
1954           __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1955       if (allocator == llvm_omp_target_host_mem_alloc)
1956         ptr = kmp_target_alloc_host(size, device);
1957       else if (allocator == llvm_omp_target_shared_mem_alloc)
1958         ptr = kmp_target_alloc_shared(size, device);
1959       else // allocator == llvm_omp_target_device_mem_alloc
1960         ptr = kmp_target_alloc_device(size, device);
1961       return ptr;
1962     } else {
1963       KMP_INFORM(TargetMemNotAvailable);
1964     }
1965   }
1966 
1967   if (allocator >= kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)) {
1968     if (__kmp_target_mem_available) {
1969       kmp_int32 device =
1970           __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1971       if (al->memspace == llvm_omp_target_host_mem_space)
1972         ptr = kmp_target_alloc_host(size, device);
1973       else if (al->memspace == llvm_omp_target_shared_mem_space)
1974         ptr = kmp_target_alloc_shared(size, device);
1975       else // al->memspace == llvm_omp_target_device_mem_space
1976         ptr = kmp_target_alloc_device(size, device);
1977       return ptr;
1978     } else {
1979       KMP_INFORM(TargetMemNotAvailable);
1980     }
1981   }
1982 
1983 #if KMP_HWLOC_ENABLED
1984   if (__kmp_hwloc_available) {
1985     if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_BIND)) {
1986       if (allocator < kmp_max_mem_alloc) {
1987         // pre-defined allocator
1988         if (allocator == omp_high_bw_mem_alloc) {
1989           ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH,
1990                                           desc.size_a, HWLOC_MEMBIND_BIND);
1991           if (ptr == NULL)
1992             use_default_allocator = true;
1993         } else if (allocator == omp_large_cap_mem_alloc) {
1994           ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY,
1995                                           desc.size_a, HWLOC_MEMBIND_BIND);
1996           if (ptr == NULL)
1997             use_default_allocator = true;
1998         } else {
1999           use_default_allocator = true;
2000         }
2001         if (use_default_allocator) {
2002           ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2003         }
2004       } else if (al->pool_size > 0) {
2005         // custom allocator with pool size requested
2006         kmp_uint64 used =
2007             KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
2008         if (used + desc.size_a > al->pool_size) {
2009           // not enough space, need to go fallback path
2010           KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2011           if (al->fb == omp_atv_default_mem_fb) {
2012             al = (kmp_allocator_t *)omp_default_mem_alloc;
2013             ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2014           } else if (al->fb == omp_atv_abort_fb) {
2015             KMP_ASSERT(0); // abort fallback requested
2016           } else if (al->fb == omp_atv_allocator_fb) {
2017             KMP_ASSERT(al != al->fb_data);
2018             al = al->fb_data;
2019             return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2020           } // else ptr == NULL;
2021         } else {
2022           // pool has enough space
2023           if (al->membind == omp_atv_interleaved) {
2024             if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_INTERLEAVE)) {
2025               ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2026                                                HWLOC_MEMBIND_INTERLEAVE);
2027             }
2028           } else if (al->membind == omp_atv_environment) {
2029             ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2030                                              HWLOC_MEMBIND_DEFAULT);
2031           } else {
2032             ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2033           }
2034           if (ptr == NULL) {
2035             if (al->fb == omp_atv_default_mem_fb) {
2036               al = (kmp_allocator_t *)omp_default_mem_alloc;
2037               ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2038             } else if (al->fb == omp_atv_abort_fb) {
2039               KMP_ASSERT(0); // abort fallback requested
2040             } else if (al->fb == omp_atv_allocator_fb) {
2041               KMP_ASSERT(al != al->fb_data);
2042               al = al->fb_data;
2043               return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2044             }
2045           }
2046         }
2047       } else {
2048         // custom allocator, pool size not requested
2049         if (al->membind == omp_atv_interleaved) {
2050           if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_INTERLEAVE)) {
2051             ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2052                                              HWLOC_MEMBIND_INTERLEAVE);
2053           }
2054         } else if (al->membind == omp_atv_environment) {
2055           ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a,
2056                                            HWLOC_MEMBIND_DEFAULT);
2057         } else {
2058           ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2059         }
2060         if (ptr == NULL) {
2061           if (al->fb == omp_atv_default_mem_fb) {
2062             al = (kmp_allocator_t *)omp_default_mem_alloc;
2063             ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2064           } else if (al->fb == omp_atv_abort_fb) {
2065             KMP_ASSERT(0); // abort fallback requested
2066           } else if (al->fb == omp_atv_allocator_fb) {
2067             KMP_ASSERT(al != al->fb_data);
2068             al = al->fb_data;
2069             return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2070           }
2071         }
2072       }
2073     } else { // alloc membind not supported, use hwloc_alloc
2074       ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a);
2075     }
2076   } else {
2077 #endif // KMP_HWLOC_ENABLED
2078     if (__kmp_memkind_available) {
2079       if (allocator < kmp_max_mem_alloc) {
2080         // pre-defined allocator
2081         if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) {
2082           ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a);
2083         } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
2084           ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a);
2085         } else {
2086           ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2087         }
2088       } else if (al->pool_size > 0) {
2089         // custom allocator with pool size requested
2090         kmp_uint64 used =
2091             KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
2092         if (used + desc.size_a > al->pool_size) {
2093           // not enough space, need to go fallback path
2094           KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2095           if (al->fb == omp_atv_default_mem_fb) {
2096             al = (kmp_allocator_t *)omp_default_mem_alloc;
2097             ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2098           } else if (al->fb == omp_atv_abort_fb) {
2099             KMP_ASSERT(0); // abort fallback requested
2100           } else if (al->fb == omp_atv_allocator_fb) {
2101             KMP_ASSERT(al != al->fb_data);
2102             al = al->fb_data;
2103             ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2104             if (is_pinned && kmp_target_lock_mem)
2105               kmp_target_lock_mem(ptr, size, default_device);
2106             return ptr;
2107           } // else ptr == NULL;
2108         } else {
2109           // pool has enough space
2110           ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
2111           if (ptr == NULL) {
2112             if (al->fb == omp_atv_default_mem_fb) {
2113               al = (kmp_allocator_t *)omp_default_mem_alloc;
2114               ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2115             } else if (al->fb == omp_atv_abort_fb) {
2116               KMP_ASSERT(0); // abort fallback requested
2117             } else if (al->fb == omp_atv_allocator_fb) {
2118               KMP_ASSERT(al != al->fb_data);
2119               al = al->fb_data;
2120               ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2121               if (is_pinned && kmp_target_lock_mem)
2122                 kmp_target_lock_mem(ptr, size, default_device);
2123               return ptr;
2124             }
2125           }
2126         }
2127       } else {
2128         // custom allocator, pool size not requested
2129         ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
2130         if (ptr == NULL) {
2131           if (al->fb == omp_atv_default_mem_fb) {
2132             al = (kmp_allocator_t *)omp_default_mem_alloc;
2133             ptr = kmp_mk_alloc(*mk_default, desc.size_a);
2134           } else if (al->fb == omp_atv_abort_fb) {
2135             KMP_ASSERT(0); // abort fallback requested
2136           } else if (al->fb == omp_atv_allocator_fb) {
2137             KMP_ASSERT(al != al->fb_data);
2138             al = al->fb_data;
2139             ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2140             if (is_pinned && kmp_target_lock_mem)
2141               kmp_target_lock_mem(ptr, size, default_device);
2142             return ptr;
2143           }
2144         }
2145       }
2146     } else if (allocator < kmp_max_mem_alloc) {
2147       // pre-defined allocator
2148       if (allocator == omp_high_bw_mem_alloc) {
2149         KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");
2150       } else if (allocator == omp_large_cap_mem_alloc) {
2151         KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc");
2152       } else if (allocator == omp_const_mem_alloc) {
2153         KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc");
2154       } else if (allocator == omp_low_lat_mem_alloc) {
2155         KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc");
2156       } else if (allocator == omp_cgroup_mem_alloc) {
2157         KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc");
2158       } else if (allocator == omp_pteam_mem_alloc) {
2159         KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc");
2160       } else if (allocator == omp_thread_mem_alloc) {
2161         KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc");
2162       } else { // default allocator requested
2163         use_default_allocator = true;
2164       }
2165       if (use_default_allocator) {
2166         ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2167         use_default_allocator = false;
2168       }
2169     } else if (al->pool_size > 0) {
2170       // custom allocator with pool size requested
2171       kmp_uint64 used =
2172           KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
2173       if (used + desc.size_a > al->pool_size) {
2174         // not enough space, need to go fallback path
2175         KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2176         if (al->fb == omp_atv_default_mem_fb) {
2177           al = (kmp_allocator_t *)omp_default_mem_alloc;
2178           ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2179         } else if (al->fb == omp_atv_abort_fb) {
2180           KMP_ASSERT(0); // abort fallback requested
2181         } else if (al->fb == omp_atv_allocator_fb) {
2182           KMP_ASSERT(al != al->fb_data);
2183           al = al->fb_data;
2184           ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
2185           if (is_pinned && kmp_target_lock_mem)
2186             kmp_target_lock_mem(ptr, size, default_device);
2187           return ptr;
2188         } // else ptr == NULL
2189       } else {
2190         // pool has enough space
2191         ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2192         if (ptr == NULL && al->fb == omp_atv_abort_fb) {
2193           KMP_ASSERT(0); // abort fallback requested
2194         } // no sense in looking for another fallback; it would use the
2195         // same internal alloc
2196       }
2197     } else {
2198       // custom allocator, pool size not requested
2199       ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
2200       if (ptr == NULL && al->fb == omp_atv_abort_fb) {
2201         KMP_ASSERT(0); // abort fallback requested
2202       } // no sense in looking for another fallback; it would use the same internal alloc
2203     }
2204 #if KMP_HWLOC_ENABLED
2205   }
2206 #endif // KMP_HWLOC_ENABLED
2207   KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a));
2208   if (ptr == NULL)
2209     return NULL;
2210 
2211   if (is_pinned && kmp_target_lock_mem)
2212     kmp_target_lock_mem(ptr, desc.size_a, default_device);
2213 
2214   addr = (kmp_uintptr_t)ptr;
2215   addr_align = (addr + sz_desc + align - 1) & ~(align - 1);
2216   addr_descr = addr_align - sz_desc;
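  // Worked example (illustrative values; assumes sz_desc == 40 and
  // align == 64): for a raw pointer addr == 0x1008,
  //   addr_align = (0x1008 + 40 + 63) & ~63 == 0x1040
  //   addr_descr = 0x1040 - 40 == 0x1018
  // so the descriptor sits immediately below the pointer handed back to the
  // caller, where ___kmpc_free() can find it.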
2217 
2218   desc.ptr_alloc = ptr;
2219   desc.ptr_align = (void *)addr_align;
2220   desc.allocator = al;
2221   *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents
2222   KMP_MB();
2223 
2224   return desc.ptr_align;
2225 }
2226 
2227 void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size,
2228                    omp_allocator_handle_t allocator) {
2229   void *ptr = NULL;
2230   kmp_allocator_t *al;
2231   KMP_DEBUG_ASSERT(__kmp_init_serial);
2232 
2233   if (allocator == omp_null_allocator)
2234     allocator = __kmp_threads[gtid]->th.th_def_allocator;
2235 
2236   al = RCAST(kmp_allocator_t *, allocator);
2237 
2238   if (nmemb == 0 || size == 0)
2239     return ptr;
2240 
2241   if ((SIZE_MAX - sizeof(kmp_mem_desc_t)) / size < nmemb) {
2242     if (al->fb == omp_atv_abort_fb) {
2243       KMP_ASSERT(0);
2244     }
2245     return ptr;
2246   }
2247 
2248   ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator);
2249 
2250   if (ptr) {
2251     memset(ptr, 0x00, nmemb * size);
2252   }
2253   return ptr;
2254 }
2255 
2256 void *__kmp_realloc(int gtid, void *ptr, size_t size,
2257                     omp_allocator_handle_t allocator,
2258                     omp_allocator_handle_t free_allocator) {
2259   void *nptr = NULL;
2260   KMP_DEBUG_ASSERT(__kmp_init_serial);
2261 
2262   if (size == 0) {
2263     if (ptr != NULL)
2264       ___kmpc_free(gtid, ptr, free_allocator);
2265     return nptr;
2266   }
2267 
2268   nptr = __kmp_alloc(gtid, 0, size, allocator);
2269 
2270   if (nptr != NULL && ptr != NULL) {
2271     kmp_mem_desc_t desc;
2272     kmp_uintptr_t addr_align; // address to return to caller
2273     kmp_uintptr_t addr_descr; // address of memory block descriptor
2274 
2275     addr_align = (kmp_uintptr_t)ptr;
2276     addr_descr = addr_align - sizeof(kmp_mem_desc_t);
2277     desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
2278 
2279     KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
2280     KMP_DEBUG_ASSERT(desc.size_orig > 0);
2281     KMP_DEBUG_ASSERT(desc.size_orig < desc.size_a);
2282     KMP_MEMCPY((char *)nptr, (char *)ptr,
2283                (size_t)((size < desc.size_orig) ? size : desc.size_orig));
2284   }
2285 
2286   if (nptr != NULL) {
2287     ___kmpc_free(gtid, ptr, free_allocator);
2288   }
2289 
2290   return nptr;
2291 }
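// Resulting omp_realloc semantics, sketched from the code above (illustrative):
//
//   void *p = omp_alloc(100, a);
//   p = omp_realloc(p, 200, a, a); // copies min(200, 100) == 100 bytes
//   p = omp_realloc(p, 0, a, a);   // frees p and returns NULL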
2292 
2293 void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
2294   if (ptr == NULL)
2295     return;
2296 
2297   kmp_allocator_t *al;
2298   omp_allocator_handle_t oal;
2299   al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
2300   kmp_mem_desc_t desc;
2301   kmp_uintptr_t addr_align; // address to return to caller
2302   kmp_uintptr_t addr_descr; // address of memory block descriptor
2303 
2304   if (al > kmp_max_mem_alloc && al->memspace > kmp_max_mem_space) {
2305     __kmp_tgt_allocator.omp_free(ptr, allocator);
2306     return;
2307   }
2308 
2309   if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
2310                                      (allocator > kmp_max_mem_alloc &&
2311                                       KMP_IS_TARGET_MEM_SPACE(al->memspace)))) {
2312     kmp_int32 device =
2313         __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
2314     if (allocator == llvm_omp_target_host_mem_alloc) {
2315       kmp_target_free_host(ptr, device);
2316     } else if (allocator == llvm_omp_target_shared_mem_alloc) {
2317       kmp_target_free_shared(ptr, device);
2318     } else if (allocator == llvm_omp_target_device_mem_alloc) {
2319       kmp_target_free_device(ptr, device);
2320     }
2321     return;
2322   }
2323 
2324   addr_align = (kmp_uintptr_t)ptr;
2325   addr_descr = addr_align - sizeof(kmp_mem_desc_t);
2326   desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
2327 
2328   KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
2329   if (allocator) {
2330     KMP_DEBUG_ASSERT(desc.allocator == al || desc.allocator == al->fb_data);
2331   }
2332   al = desc.allocator;
2333   oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
2334   KMP_DEBUG_ASSERT(al);
2335 
2336   if (allocator > kmp_max_mem_alloc && kmp_target_unlock_mem && al->pinned) {
2337     kmp_int32 device =
2338         __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
2339     kmp_target_unlock_mem(desc.ptr_alloc, device);
2340   }
2341 
2342 #if KMP_HWLOC_ENABLED
2343   if (__kmp_hwloc_available) {
2344     if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
2345       kmp_uint64 used =
2346           KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2347       (void)used; // to suppress compiler warning
2348       KMP_DEBUG_ASSERT(used >= desc.size_a);
2349     }
2350     hwloc_free(__kmp_hwloc_topology, desc.ptr_alloc, desc.size_a);
2351   } else {
2352 #endif // KMP_HWLOC_ENABLED
2353     if (__kmp_memkind_available) {
2354       if (oal < kmp_max_mem_alloc) {
2355         // pre-defined allocator
2356         if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
2357           kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
2358         } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
2359           kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
2360         } else {
2361           kmp_mk_free(*mk_default, desc.ptr_alloc);
2362         }
2363       } else {
2364         if (al->pool_size > 0) { // custom allocator with pool size requested
2365           kmp_uint64 used =
2366               KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2367           (void)used; // to suppress compiler warning
2368           KMP_DEBUG_ASSERT(used >= desc.size_a);
2369         }
2370         kmp_mk_free(*al->memkind, desc.ptr_alloc);
2371       }
2372     } else {
2373       if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
2374         kmp_uint64 used =
2375             KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
2376         (void)used; // to suppress compiler warning
2377         KMP_DEBUG_ASSERT(used >= desc.size_a);
2378       }
2379       __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
2380     }
2381 #if KMP_HWLOC_ENABLED
2382   }
2383 #endif // KMP_HWLOC_ENABLED
2384 }
2385 
2386 /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
2387    memory leaks, but it may be useful for debugging memory corruption, use of
2388    freed pointers, etc. */
2389 /* #define LEAK_MEMORY */
2390 struct kmp_mem_descr { // Memory block descriptor.
2391   void *ptr_allocated; // Pointer returned by malloc(), subject for free().
2392   size_t size_allocated; // Size of allocated memory block.
2393   void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
2394   size_t size_aligned; // Size of aligned memory block.
2395 };
2396 typedef struct kmp_mem_descr kmp_mem_descr_t;
2397 
2398 /* Allocate memory on requested boundary, fill allocated memory with 0x00.
2399    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
2400    error. Must use __kmp_free when freeing memory allocated by this routine! */
2401 static void *___kmp_allocate_align(size_t size,
2402                                    size_t alignment KMP_SRC_LOC_DECL) {
2403   /* __kmp_allocate() allocates (via malloc()) a bigger memory block than
2404      requested in order to return a properly aligned pointer. The original
2405      pointer returned by malloc() and the size of the allocated block are
2406      saved in a descriptor just before the aligned pointer. This information
2407      is used by __kmp_free() -- it must pass free() the original pointer.
2408 
2409           +---------+------------+-----------------------------------+---------+
2410           | padding | descriptor |           aligned block           | padding |
2411           +---------+------------+-----------------------------------+---------+
2412           ^                      ^
2413           |                      |
2414           |                      +- Aligned pointer returned to caller
2415           +- Pointer returned by malloc()
2416 
2417       Aligned block is filled with zeros, paddings are filled with 0xEF. */
2418 
2419   kmp_mem_descr_t descr;
2420   kmp_uintptr_t addr_allocated; // Address returned by malloc().
2421   kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
2422   kmp_uintptr_t addr_descr; // Address of memory block descriptor.
2423 
2424   KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
2425                 (int)size, (int)alignment KMP_SRC_LOC_PARM));
2426 
2427   KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment must not be too large
2428   KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
2429   // Make sure kmp_uintptr_t is enough to store addresses.
2430 
2431   descr.size_aligned = size;
2432   descr.size_allocated =
2433       descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;
2434 
2435 #if KMP_DEBUG
2436   descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
2437 #else
2438   descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
2439 #endif
2440   KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
2441                 descr.ptr_allocated));
2442   if (descr.ptr_allocated == NULL) {
2443     KMP_FATAL(OutOfHeapMemory);
2444   }
2445 
2446   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
2447   addr_aligned =
2448       (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
2449   addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
2450 
2451   descr.ptr_aligned = (void *)addr_aligned;
2452 
2453   KE_TRACE(26, ("   ___kmp_allocate_align: "
2454                 "ptr_allocated=%p, size_allocated=%d, "
2455                 "ptr_aligned=%p, size_aligned=%d\n",
2456                 descr.ptr_allocated, (int)descr.size_allocated,
2457                 descr.ptr_aligned, (int)descr.size_aligned));
2458 
2459   KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
2460   KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
2461   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
2462                    addr_allocated + descr.size_allocated);
2463   KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
2464 #ifdef KMP_DEBUG
2465   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
2466   // Fill the allocated memory block with 0xEF.
2467 #endif
2468   memset(descr.ptr_aligned, 0x00, descr.size_aligned);
2469   // Fill the aligned memory block (which is intended for use by the caller)
2470   // with 0x00. Do not put this filling under a KMP_DEBUG condition! Many
2471   // callers expect zeroed memory. (Padding bytes remain filled with 0xEF in
2472   // the debugging library.)
2474   *((kmp_mem_descr_t *)addr_descr) = descr;
2475 
2476   KMP_MB();
2477 
2478   KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
2479   return descr.ptr_aligned;
2480 } // func ___kmp_allocate_align
2481 
2482 /* Allocate memory on cache line boundary, fill allocated memory with 0x00.
2483    Do not call this func directly! Use __kmp_allocate macro instead.
2484    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
2485    error. Must use __kmp_free when freeing memory allocated by this routine! */
2486 void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
2487   void *ptr;
2488   KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
2489                 (int)size KMP_SRC_LOC_PARM));
2490   ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
2491   KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
2492   return ptr;
2493 } // func ___kmp_allocate
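// Typical internal usage (sketch; the __kmp_allocate macro wraps this function
// and supplies the KMP_SRC_LOC_* source-location arguments):
//
//   kmp_memspace_t *ms =
//       (kmp_memspace_t *)__kmp_allocate(sizeof(kmp_memspace_t)); // zeroed
//   ...
//   __kmp_free(ms);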
2494 
2495 /* Allocate memory on page boundary, fill allocated memory with 0x00.
2496    Do not call this func directly! Use the __kmp_page_allocate macro instead.
2497    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
2498    error. Must use __kmp_free when freeing memory allocated by this routine! */
2499 void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
2500   int page_size = 8 * 1024;
2501   void *ptr;
2502 
2503   KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
2504                 (int)size KMP_SRC_LOC_PARM));
2505   ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
2506   KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
2507   return ptr;
2508 } // ___kmp_page_allocate
2509 
2510 /* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
2511    In debug mode, fill the memory block with 0xEF before call to free(). */
2512 void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
2513   kmp_mem_descr_t descr;
2514 #if KMP_DEBUG
2515   kmp_uintptr_t addr_allocated; // Address returned by malloc().
2516   kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
2517 #endif
2518   KE_TRACE(25,
2519            ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
2520   KMP_ASSERT(ptr != NULL);
2521 
2522   descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));
2523 
2524   KE_TRACE(26, ("   __kmp_free:     "
2525                 "ptr_allocated=%p, size_allocated=%d, "
2526                 "ptr_aligned=%p, size_aligned=%d\n",
2527                 descr.ptr_allocated, (int)descr.size_allocated,
2528                 descr.ptr_aligned, (int)descr.size_aligned));
2529 #if KMP_DEBUG
2530   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
2531   addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
2532   KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
2533   KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
2534   KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
2535   KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
2536   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
2537                    addr_allocated + descr.size_allocated);
2538   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
2539 // Fill memory block with 0xEF, it helps catch using freed memory.
2540 #endif
2541 
2542 #ifndef LEAK_MEMORY
2543   KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
2544 #ifdef KMP_DEBUG
2545   _free_src_loc(descr.ptr_allocated, _file_, _line_);
2546 #else
2547   free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
2548 #endif
2549 #endif
2550   KMP_MB();
2551   KE_TRACE(25, ("<- __kmp_free() returns\n"));
2552 } // func ___kmp_free
2553 
2554 #if USE_FAST_MEMORY == 3
2555 // Allocate fast memory by first scanning the thread's free lists.
2556 // If a chunk of the right size exists, grab it off the free list;
2557 // otherwise allocate normally via bget.
2558 
2559 // AC: How to choose the limit? Just get 16 for now...
2560 #define KMP_FREE_LIST_LIMIT 16
2561 
2562 // Always use 128 bytes for determining buckets for caching memory blocks
2563 #define DCACHE_LINE 128
2564 
2565 void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
2566   void *ptr;
2567   size_t num_lines, idx;
2568   int index;
2569   void *alloc_ptr;
2570   size_t alloc_size;
2571   kmp_mem_descr_t *descr;
2572 
2573   KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
2574                 __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
2575 
2576   num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
2577   idx = num_lines - 1;
2578   KMP_DEBUG_ASSERT(idx >= 0);
2579   if (idx < 2) {
2580     index = 0; // idx is [ 0, 1 ], use first free list
2581     num_lines = 2; // 1 or 2 cache lines, or less than one cache line
2582   } else if ((idx >>= 2) == 0) {
2583     index = 1; // idx is [ 2, 3 ], use second free list
2584     num_lines = 4; // 3, 4 cache lines
2585   } else if ((idx >>= 2) == 0) {
2586     index = 2; // idx is [ 4, 15 ], use third free list
2587     num_lines = 16; // 5, 6, ..., 16 cache lines
2588   } else if ((idx >>= 2) == 0) {
2589     index = 3; // idx is [ 16, 63 ], use fourth free list
2590     num_lines = 64; // 17, 18, ..., 64 cache lines
2591   } else {
2592     goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
2593   }
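  // Bucket mapping summary (sketch; DCACHE_LINE == 128 bytes):
  //   request <= 2 lines -> index 0, rounded up to  2 lines ( 256 B)
  //   3..4 lines         -> index 1, rounded up to  4 lines ( 512 B)
  //   5..16 lines        -> index 2, rounded up to 16 lines (  2 KB)
  //   17..64 lines       -> index 3, rounded up to 64 lines (  8 KB)
  //   65+ lines (> 8 KB) -> no free list; allocated via bget directly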
2594 
2595   ptr = this_thr->th.th_free_lists[index].th_free_list_self;
2596   if (ptr != NULL) {
2597     // pop the head of no-sync free list
2598     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
2599     KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
2600                                                       sizeof(kmp_mem_descr_t)))
2601                                      ->ptr_aligned);
2602     goto end;
2603   }
2604   ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
2605   if (ptr != NULL) {
2606     // no-sync free list is empty, use sync free list (filled in by other
2607     // threads only)
2608     // pop the head of the sync free list, push NULL instead
2609     while (!KMP_COMPARE_AND_STORE_PTR(
2610         &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
2611       KMP_CPU_PAUSE();
2612       ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
2613     }
2614     // push the rest of chain into no-sync free list (can be NULL if there was
2615     // the only block)
2616     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
2617     KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
2618                                                       sizeof(kmp_mem_descr_t)))
2619                                      ->ptr_aligned);
2620     goto end;
2621   }
2622 
2623 alloc_call:
2624   // haven't found block in the free lists, thus allocate it
2625   size = num_lines * DCACHE_LINE;
2626 
2627   alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
2628   KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling bget with "
2629                 "alloc_size %d\n",
2630                 __kmp_gtid_from_thread(this_thr), alloc_size));
2631   alloc_ptr = bget(this_thr, (bufsize)alloc_size);
2632 
2633   // align ptr to DCACHE_LINE
2634   ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
2635                   DCACHE_LINE) &
2636                  ~(DCACHE_LINE - 1));
2637   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
2638 
2639   descr->ptr_allocated = alloc_ptr; // remember allocated pointer
2640   // we don't need size_allocated
2641   descr->ptr_aligned = (void *)this_thr; // remember allocating thread
2642   // (it is already saved in the bget buffer,
2643   // but we may want to use another allocator in the future)
2644   descr->size_aligned = size;
2645 
2646 end:
2647   KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
2648                 __kmp_gtid_from_thread(this_thr), ptr));
2649   return ptr;
2650 } // func __kmp_fast_allocate
2651 
2652 // Free fast memory and place it on the thread's free list if it is of
2653 // the correct size.
2654 void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
2655   kmp_mem_descr_t *descr;
2656   kmp_info_t *alloc_thr;
2657   size_t size;
2658   size_t idx;
2659   int index;
2660 
2661   KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
2662                 __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
2663   KMP_ASSERT(ptr != NULL);
2664 
2665   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
2666 
2667   KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
2668                 (int)descr->size_aligned));
2669 
2670   size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
2671 
2672   idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
2673   if (idx == size) {
2674     index = 0; // 2 cache lines
2675   } else if ((idx <<= 1) == size) {
2676     index = 1; // 4 cache lines
2677   } else if ((idx <<= 2) == size) {
2678     index = 2; // 16 cache lines
2679   } else if ((idx <<= 2) == size) {
2680     index = 3; // 64 cache lines
2681   } else {
2682     KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
2683     goto free_call; // 65 or more cache lines ( > 8KB )
2684   }
2685 
2686   alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
2687   if (alloc_thr == this_thr) {
2688     // push block to self no-sync free list, linking previous head (LIFO)
2689     *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
2690     this_thr->th.th_free_lists[index].th_free_list_self = ptr;
2691   } else {
2692     void *head = this_thr->th.th_free_lists[index].th_free_list_other;
2693     if (head == NULL) {
2694       // Create new free list
2695       this_thr->th.th_free_lists[index].th_free_list_other = ptr;
2696       *((void **)ptr) = NULL; // mark the tail of the list
2697       descr->size_allocated = (size_t)1; // head of the list keeps its length
2698     } else {
2699       // need to check the existing "other" list's owner thread and queue size
2700       kmp_mem_descr_t *dsc =
2701           (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
2702       // allocating thread, same for all queue nodes
2703       kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
2704       size_t q_sz =
2705           dsc->size_allocated + 1; // new size in case we add current task
2706       if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
2707         // we can add current task to "other" list, no sync needed
2708         *((void **)ptr) = head;
2709         descr->size_allocated = q_sz;
2710         this_thr->th.th_free_lists[index].th_free_list_other = ptr;
      } else {
        // either the queue's owning thread has changed or the size limit was
        // exceeded: return the old queue to the allocating thread (q_th)
        // synchronously, and start a new list for alloc_thr's blocks
        void *old_ptr;
        void *tail = head;
        void *next = *((void **)head);
        while (next != NULL) {
          KMP_DEBUG_ASSERT(
              // queue size should decrease by 1 each step through the list
              ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
                      ->size_allocated +
                  1 ==
              ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
                  ->size_allocated);
          tail = next; // remember the tail node
          next = *((void **)next);
        }
        KMP_DEBUG_ASSERT(q_th != NULL);
        // push block to owner's sync free list
        old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
        /* the next pointer must be set before setting free_list to ptr to avoid
           exposing a broken list to other threads, even for an instant. */
        *((void **)tail) = old_ptr;

        while (!KMP_COMPARE_AND_STORE_PTR(
            &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
          KMP_CPU_PAUSE();
          old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
          *((void **)tail) = old_ptr;
        }
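
        /* The loop above performs a lock-free LIFO push of the whole chain
           [head .. tail] onto the owner's sync list, in the style of a
           Treiber stack. A minimal sketch of the same pattern using C++11
           atomics instead of the runtime's TCR_PTR /
           KMP_COMPARE_AND_STORE_PTR primitives (illustrative only):

             #include <atomic>

             void push_chain(std::atomic<void *> &top, void *head,
                             void *tail) {
               void *old_top = top.load(std::memory_order_relaxed);
               do {
                 // re-link tail on every retry so the chain always ends at
                 // the current top and a reader never sees a broken list
                 *(void **)tail = old_top;
               } while (!top.compare_exchange_weak(
                   old_top, head, std::memory_order_release,
                   std::memory_order_relaxed));
             }
         */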

        // start a new list of not-self blocks
        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
        *((void **)ptr) = NULL;
        descr->size_allocated = (size_t)1; // head of queue keeps its length
      }
    }
  }
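
  /* Summary of the three per-bucket destinations used above:
     - th_free_list_self:  blocks this thread owns and freed itself;
       plain LIFO push, no synchronization.
     - th_free_list_other: blocks owned by one other thread, batched locally
       (the head's descriptor tracks the count) until KMP_FREE_LIST_LIMIT is
       reached or the owner changes; no synchronization.
     - th_free_list_sync:  the owning thread's shared list, pushed to with
       CAS and drained by the owner on its allocation path. */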
  goto end;

free_call:
  KE_TRACE(25, ("__kmp_fast_free: T#%d Calling brel for size %d\n",
                __kmp_gtid_from_thread(this_thr), (int)size));
  __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
  brel(this_thr, descr->ptr_allocated);

end:
  KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));

} // func __kmp_fast_free
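
/* Typical call-site pattern (a sketch; it assumes the __kmp_fast_allocate /
   __kmp_fast_free wrapper macros declared in kmp.h, which append the
   caller's source location via KMP_SRC_LOC_CURR):

     kmp_info_t *thr = __kmp_thread_from_gtid(gtid);
     void *p = __kmp_fast_allocate(thr, 128); // served from a size bucket
     ...
     __kmp_fast_free(thr, p); // any thread may free the block; cross-thread
                              // frees are batched and returned to the owner
*/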

// Initialize the thread free lists related to fast memory
// Only do this when a thread is initially created.
void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
  KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));

  memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
}

// Free the memory in the thread free lists related to fast memory
// Only do this when a thread is being reaped (destroyed).
void __kmp_free_fast_memory(kmp_info_t *th) {
  // Assuming BGET is the underlying allocator, walk through its structures...
  int bin;
  thr_data_t *thr = get_thr_data(th);
  void **lst = NULL;

  KE_TRACE(
      5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));

  __kmp_bget_dequeue(th); // Release any queued buffers

  // Dig through the free lists and extract all allocated blocks
  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
    bfhead_t *b = thr->freelist[bin].ql.flink;
    while (b != &thr->freelist[bin]) {
      if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is allocated
        // link it into lst (overwrite bthr, but keep flink for now)
        *((void **)b) = lst;
        lst = (void **)b; // push b onto lst
      }
      b = b->ql.flink; // get the next buffer
    }
  }
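  /* At this point lst heads an intrusive list threaded through the harvested
     buffers themselves: each buffer's first word (which held the tagged bthr
     pointer) now stores the previously found buffer, so the teardown needs
     no extra memory. The low-bit test above is a tagged-pointer trick:
     thread pointers are at least 2-byte aligned, so bit 0 is free to mark a
     buffer as allocated. */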
  while (lst != NULL) {
    void *next = *lst;
    KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
                  lst, next, th, __kmp_gtid_from_thread(th)));
    (*thr->relfcn)(lst);
#if BufStats
    // count blocks to prevent problems in __kmp_finalize_bget()
    thr->numprel++; /* Nr of expansion block releases */
    thr->numpblk--; /* Total number of blocks */
#endif
    lst = (void **)next;
  }

  KE_TRACE(
      5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
}

#endif // USE_FAST_MEMORY