xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_alloc.cpp (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 /*
2  * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_io.h"
15 #include "kmp_wrapper_malloc.h"
16 
17 // Disable bget when it is not used
18 #if KMP_USE_BGET
19 
20 /* Thread private buffer management code */
21 
/* Callbacks that drive automatic pool expansion; bectl() installs them.
     compact: called with the padded request size and a retry counter when no
              free buffer fits; a non-zero result makes bget() search again.
     acquire: called to obtain a new block of the given size; NULL on failure.
     release: called to hand a block obtained from `acquire` back. */
typedef int (*bget_compact_t)(size_t request, int sequence);
typedef void *(*bget_acquire_t)(size_t nbytes);
typedef void (*bget_release_t)(void *block);
25 
/* bufsize is the size type used throughout bget. It MUST be signed: the sign
   of a header's size field distinguishes free buffers from allocated ones. */

#if !KMP_OS_WINDOWS
typedef ssize_t bufsize;
#elif KMP_ARCH_X86 || KMP_ARCH_ARM
typedef kmp_int32 bufsize;
#else
typedef kmp_int64 bufsize;
#endif // !KMP_OS_WINDOWS
37 
/* Free-list search policy: FIFO, LIFO, or best-fit. */

typedef enum bget_mode {
  bget_mode_fifo = 0, /* value of a zero-filled thr_data_t, i.e. the default */
  bget_mode_lifo,
  bget_mode_best
} bget_mode_t;
45 
46 static void bpool(kmp_info_t *th, void *buffer, bufsize len);
47 static void *bget(kmp_info_t *th, bufsize size);
48 static void *bgetz(kmp_info_t *th, bufsize size);
49 static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
50 static void brel(kmp_info_t *th, void *buf);
51 static void bectl(kmp_info_t *th, bget_compact_t compact,
52                   bget_acquire_t acquire, bget_release_t release,
53                   bufsize pool_incr);
54 
/* BGET CONFIGURATION */
/* SizeQuant is the allocation quantum: every buffer size is rounded up to a
   multiple of it, so it MUST be a power of two. AlignType is the member type
   placed in the header union alongside that alignment request. */

/* The 8-byte quantum is used on IA-32 architecture (where, with Linux* OS,
   malloc() does not ensure 16 byte alignment) and whenever _Quad is not
   available; everything else uses 16 bytes. */

#if !KMP_ARCH_X86 && KMP_HAVE_QUAD

#define SizeQuant 16
#define AlignType _Quad

#else

#define SizeQuant 8
#define AlignType double

#endif
73 
// BufStats: keep allocation statistics (total allocated space and call
// counters) in the per-thread data; also enables the bstats() function.
#define BufStats 1

#if defined(KMP_DEBUG)

// BufDump: enable bpoold(), which dumps the buffers in a buffer pool.
#define BufDump 1

// BufValid: enable bpoolv(), which validates a buffer pool.
#define BufValid 1

// DumpData: enable bufdump(), which dumps the contents of an allocated or
// free buffer.
#define DumpData 1

#if defined(NOT_USED_NOW)

// FreeWipe: overwrite released buffers with a fixed garbage pattern so that
// use of a stale pointer is more likely to be noticed.
#define FreeWipe 1

// BestFit: search for the best-fitting free buffer. Uses memory more
// efficiently, but allocation is much slower.
#define BestFit 1

#endif /* NOT_USED_NOW */
#endif /* KMP_DEBUG */
105 
106 static bufsize bget_bin_size[] = {
107     0,
108     //    1 << 6,    /* .5 Cache line */
109     1 << 7, /* 1 Cache line, new */
110     1 << 8, /* 2 Cache lines */
111     1 << 9, /* 4 Cache lines, new */
112     1 << 10, /* 8 Cache lines */
113     1 << 11, /* 16 Cache lines, new */
114     1 << 12, 1 << 13, /* new */
115     1 << 14, 1 << 15, /* new */
116     1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /*  1MB */
117     1 << 21, /*  2MB */
118     1 << 22, /*  4MB */
119     1 << 23, /*  8MB */
120     1 << 24, /* 16MB */
121     1 << 25, /* 32MB */
122 };
123 
124 #define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))
125 
struct bfhead;

/* Doubly-linked free-list links stored in every free buffer. */
typedef struct qlinks {
  struct bfhead *flink, *blink; /* forward link, backward link */
} qlinks_t;
135 
136 /* Header in allocated and free buffers */
137 typedef struct bhead2 {
138   kmp_info_t *bthr; /* The thread which owns the buffer pool */
139   bufsize prevfree; /* Relative link back to previous free buffer in memory or
140                        0 if previous buffer is allocated.  */
141   bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
142 } bhead2_t;
143 
144 /* Make sure the bhead structure is a multiple of SizeQuant in size. */
145 typedef union bhead {
146   KMP_ALIGN(SizeQuant)
147   AlignType b_align;
148   char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
149   bhead2_t bb;
150 } bhead_t;
151 #define BH(p) ((bhead_t *)(p))
152 
153 /*  Header in directly allocated buffers (by acqfcn) */
154 typedef struct bdhead {
155   bufsize tsize; /* Total size, including overhead */
156   bhead_t bh; /* Common header */
157 } bdhead_t;
158 #define BDH(p) ((bdhead_t *)(p))
159 
160 /* Header in free buffers */
161 typedef struct bfhead {
162   bhead_t bh; /* Common allocated/free header */
163   qlinks_t ql; /* Links on free list */
164 } bfhead_t;
165 #define BFH(p) ((bfhead_t *)(p))
166 
167 typedef struct thr_data {
168   bfhead_t freelist[MAX_BGET_BINS];
169 #if BufStats
170   size_t totalloc; /* Total space currently allocated */
171   long numget, numrel; /* Number of bget() and brel() calls */
172   long numpblk; /* Number of pool blocks */
173   long numpget, numprel; /* Number of block gets and rels */
174   long numdget, numdrel; /* Number of direct gets and rels */
175 #endif /* BufStats */
176 
177   /* Automatic expansion block management functions */
178   bget_compact_t compfcn;
179   bget_acquire_t acqfcn;
180   bget_release_t relfcn;
181 
182   bget_mode_t mode; /* what allocation mode to use? */
183 
184   bufsize exp_incr; /* Expansion block size */
185   bufsize pool_len; /* 0: no bpool calls have been made
186                        -1: not all pool blocks are the same size
187                        >0: (common) block size for all bpool calls made so far
188                     */
189   bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */
190 } thr_data_t;
191 
/* Minimum allocation quantum: a buffer must be able to hold the free-list
   links, and is never smaller than SizeQuant. */
#define QLSize (sizeof(qlinks_t))
#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)

/* Largest size that may be requested: all bits of bufsize set except the sign
   bit and the low bits that lie below SizeQuant. */
#define MaxSize                                                                \
  (bufsize)(                                                                   \
      ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))

/* End sentinel: stored in the bsize field of the dummy header that terminates
   a pool block. It is the most negative value a bufsize can hold, spelled so
   that no intermediate step overflows. */
#define ESent                                                                  \
  ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * CHAR_BIT - 2)) - 1) * \
               2) -                                                            \
             2))
206 
207 /* Thread Data management routines */
208 static int bget_get_bin(bufsize size) {
209   // binary chop bins
210   int lo = 0, hi = MAX_BGET_BINS - 1;
211 
212   KMP_DEBUG_ASSERT(size > 0);
213 
214   while ((hi - lo) > 1) {
215     int mid = (lo + hi) >> 1;
216     if (size < bget_bin_size[mid])
217       hi = mid - 1;
218     else
219       lo = mid;
220   }
221 
222   KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));
223 
224   return lo;
225 }
226 
227 static void set_thr_data(kmp_info_t *th) {
228   int i;
229   thr_data_t *data;
230 
231   data = (thr_data_t *)((!th->th.th_local.bget_data)
232                             ? __kmp_allocate(sizeof(*data))
233                             : th->th.th_local.bget_data);
234 
235   memset(data, '\0', sizeof(*data));
236 
237   for (i = 0; i < MAX_BGET_BINS; ++i) {
238     data->freelist[i].ql.flink = &data->freelist[i];
239     data->freelist[i].ql.blink = &data->freelist[i];
240   }
241 
242   th->th.th_local.bget_data = data;
243   th->th.th_local.bget_list = 0;
244 #if !USE_CMP_XCHG_FOR_BGET
245 #ifdef USE_QUEUING_LOCK_FOR_BGET
246   __kmp_init_lock(&th->th.th_local.bget_lock);
247 #else
248   __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
249 #endif /* USE_LOCK_FOR_BGET */
250 #endif /* ! USE_CMP_XCHG_FOR_BGET */
251 }
252 
253 static thr_data_t *get_thr_data(kmp_info_t *th) {
254   thr_data_t *data;
255 
256   data = (thr_data_t *)th->th.th_local.bget_data;
257 
258   KMP_DEBUG_ASSERT(data != 0);
259 
260   return data;
261 }
262 
263 /* Walk the free list and release the enqueued buffers */
264 static void __kmp_bget_dequeue(kmp_info_t *th) {
265   void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
266 
267   if (p != 0) {
268 #if USE_CMP_XCHG_FOR_BGET
269     {
270       volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
271       while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
272                                         CCAST(void *, old_value), nullptr)) {
273         KMP_CPU_PAUSE();
274         old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
275       }
276       p = CCAST(void *, old_value);
277     }
278 #else /* ! USE_CMP_XCHG_FOR_BGET */
279 #ifdef USE_QUEUING_LOCK_FOR_BGET
280     __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
281 #else
282     __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
283 #endif /* USE_QUEUING_LOCK_FOR_BGET */
284 
285     p = (void *)th->th.th_local.bget_list;
286     th->th.th_local.bget_list = 0;
287 
288 #ifdef USE_QUEUING_LOCK_FOR_BGET
289     __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
290 #else
291     __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
292 #endif
293 #endif /* USE_CMP_XCHG_FOR_BGET */
294 
295     /* Check again to make sure the list is not empty */
296     while (p != 0) {
297       void *buf = p;
298       bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
299 
300       KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
301       KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
302                        (kmp_uintptr_t)th); // clear possible mark
303       KMP_DEBUG_ASSERT(b->ql.blink == 0);
304 
305       p = (void *)b->ql.flink;
306 
307       brel(th, buf);
308     }
309   }
310 }
311 
312 /* Chain together the free buffers by using the thread owner field */
313 static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
314 #ifdef USE_QUEUING_LOCK_FOR_BGET
315                                ,
316                                kmp_int32 rel_gtid
317 #endif
318 ) {
319   bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
320 
321   KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
322   KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
323                    (kmp_uintptr_t)th); // clear possible mark
324 
325   b->ql.blink = 0;
326 
327   KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
328                 __kmp_gtid_from_thread(th)));
329 
330 #if USE_CMP_XCHG_FOR_BGET
331   {
332     volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
333     /* the next pointer must be set before setting bget_list to buf to avoid
334        exposing a broken list to other threads, even for an instant. */
335     b->ql.flink = BFH(CCAST(void *, old_value));
336 
337     while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
338                                       CCAST(void *, old_value), buf)) {
339       KMP_CPU_PAUSE();
340       old_value = TCR_PTR(th->th.th_local.bget_list);
341       /* the next pointer must be set before setting bget_list to buf to avoid
342          exposing a broken list to other threads, even for an instant. */
343       b->ql.flink = BFH(CCAST(void *, old_value));
344     }
345   }
346 #else /* ! USE_CMP_XCHG_FOR_BGET */
347 #ifdef USE_QUEUING_LOCK_FOR_BGET
348   __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
349 #else
350   __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
351 #endif
352 
353   b->ql.flink = BFH(th->th.th_local.bget_list);
354   th->th.th_local.bget_list = (void *)buf;
355 
356 #ifdef USE_QUEUING_LOCK_FOR_BGET
357   __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
358 #else
359   __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
360 #endif
361 #endif /* USE_CMP_XCHG_FOR_BGET */
362 }
363 
364 /* insert buffer back onto a new freelist */
365 static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
366   int bin;
367 
368   KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
369   KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);
370 
371   bin = bget_get_bin(b->bh.bb.bsize);
372 
373   KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
374                    &thr->freelist[bin]);
375   KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
376                    &thr->freelist[bin]);
377 
378   b->ql.flink = &thr->freelist[bin];
379   b->ql.blink = thr->freelist[bin].ql.blink;
380 
381   thr->freelist[bin].ql.blink = b;
382   b->ql.blink->ql.flink = b;
383 }
384 
385 /* unlink the buffer from the old freelist */
386 static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
387   KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
388   KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
389 
390   b->ql.blink->ql.flink = b->ql.flink;
391   b->ql.flink->ql.blink = b->ql.blink;
392 }
393 
394 /*  GET STATS -- check info on free list */
395 static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
396   thr_data_t *thr = get_thr_data(th);
397   int bin;
398 
399   *total_free = *max_free = 0;
400 
401   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
402     bfhead_t *b, *best;
403 
404     best = &thr->freelist[bin];
405     b = best->ql.flink;
406 
407     while (b != &thr->freelist[bin]) {
408       *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
409       if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
410         best = b;
411 
412       /* Link to next buffer */
413       b = b->ql.flink;
414     }
415 
416     if (*max_free < best->bh.bb.bsize)
417       *max_free = best->bh.bb.bsize;
418   }
419 
420   if (*max_free > (bufsize)sizeof(bhead_t))
421     *max_free -= sizeof(bhead_t);
422 }
423 
424 /*  BGET  --  Allocate a buffer.  */
425 static void *bget(kmp_info_t *th, bufsize requested_size) {
426   thr_data_t *thr = get_thr_data(th);
427   bufsize size = requested_size;
428   bfhead_t *b;
429   void *buf;
430   int compactseq = 0;
431   int use_blink = 0;
432   /* For BestFit */
433   bfhead_t *best;
434 
435   if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
436     return NULL;
437   }
438 
439   __kmp_bget_dequeue(th); /* Release any queued buffers */
440 
441   if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
442     size = SizeQ;
443   }
444 #if defined(SizeQuant) && (SizeQuant > 1)
445   size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
446 #endif
447 
448   size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
449   KMP_DEBUG_ASSERT(size >= 0);
450   KMP_DEBUG_ASSERT(size % SizeQuant == 0);
451 
452   use_blink = (thr->mode == bget_mode_lifo);
453 
454   /* If a compact function was provided in the call to bectl(), wrap
455      a loop around the allocation process  to  allow  compaction  to
456      intervene in case we don't find a suitable buffer in the chain. */
457 
458   for (;;) {
459     int bin;
460 
461     for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
462       /* Link to next buffer */
463       b = (use_blink ? thr->freelist[bin].ql.blink
464                      : thr->freelist[bin].ql.flink);
465 
466       if (thr->mode == bget_mode_best) {
467         best = &thr->freelist[bin];
468 
469         /* Scan the free list searching for the first buffer big enough
470            to hold the requested size buffer. */
471         while (b != &thr->freelist[bin]) {
472           if (b->bh.bb.bsize >= (bufsize)size) {
473             if ((best == &thr->freelist[bin]) ||
474                 (b->bh.bb.bsize < best->bh.bb.bsize)) {
475               best = b;
476             }
477           }
478 
479           /* Link to next buffer */
480           b = (use_blink ? b->ql.blink : b->ql.flink);
481         }
482         b = best;
483       }
484 
485       while (b != &thr->freelist[bin]) {
486         if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {
487 
488           // Buffer is big enough to satisfy the request. Allocate it to the
489           // caller. We must decide whether the buffer is large enough to split
490           // into the part given to the caller and a free buffer that remains
491           // on the free list, or whether the entire buffer should be removed
492           // from the free list and given to the caller in its entirety. We
493           // only split the buffer if enough room remains for a header plus the
494           // minimum quantum of allocation.
495           if ((b->bh.bb.bsize - (bufsize)size) >
496               (bufsize)(SizeQ + (sizeof(bhead_t)))) {
497             bhead_t *ba, *bn;
498 
499             ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
500             bn = BH(((char *)ba) + size);
501 
502             KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
503 
504             /* Subtract size from length of free block. */
505             b->bh.bb.bsize -= (bufsize)size;
506 
507             /* Link allocated buffer to the previous free buffer. */
508             ba->bb.prevfree = b->bh.bb.bsize;
509 
510             /* Plug negative size into user buffer. */
511             ba->bb.bsize = -size;
512 
513             /* Mark this buffer as owned by this thread. */
514             TCW_PTR(ba->bb.bthr,
515                     th); // not an allocated address (do not mark it)
516             /* Mark buffer after this one not preceded by free block. */
517             bn->bb.prevfree = 0;
518 
519             // unlink buffer from old freelist, and reinsert into new freelist
520             __kmp_bget_remove_from_freelist(b);
521             __kmp_bget_insert_into_freelist(thr, b);
522 #if BufStats
523             thr->totalloc += (size_t)size;
524             thr->numget++; /* Increment number of bget() calls */
525 #endif
526             buf = (void *)((((char *)ba) + sizeof(bhead_t)));
527             KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
528             return buf;
529           } else {
530             bhead_t *ba;
531 
532             ba = BH(((char *)b) + b->bh.bb.bsize);
533 
534             KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
535 
536             /* The buffer isn't big enough to split.  Give  the  whole
537                shebang to the caller and remove it from the free list. */
538 
539             __kmp_bget_remove_from_freelist(b);
540 #if BufStats
541             thr->totalloc += (size_t)b->bh.bb.bsize;
542             thr->numget++; /* Increment number of bget() calls */
543 #endif
544             /* Negate size to mark buffer allocated. */
545             b->bh.bb.bsize = -(b->bh.bb.bsize);
546 
547             /* Mark this buffer as owned by this thread. */
548             TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
549             /* Zero the back pointer in the next buffer in memory
550                to indicate that this buffer is allocated. */
551             ba->bb.prevfree = 0;
552 
553             /* Give user buffer starting at queue links. */
554             buf = (void *)&(b->ql);
555             KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
556             return buf;
557           }
558         }
559 
560         /* Link to next buffer */
561         b = (use_blink ? b->ql.blink : b->ql.flink);
562       }
563     }
564 
565     /* We failed to find a buffer. If there's a compact function defined,
566        notify it of the size requested. If it returns TRUE, try the allocation
567        again. */
568 
569     if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
570       break;
571     }
572   }
573 
574   /* No buffer available with requested size free. */
575 
576   /* Don't give up yet -- look in the reserve supply. */
577   if (thr->acqfcn != 0) {
578     if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
579       /* Request is too large to fit in a single expansion block.
580          Try to satisfy it by a direct buffer acquisition. */
581       bdhead_t *bdh;
582 
583       size += sizeof(bdhead_t) - sizeof(bhead_t);
584 
585       KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));
586 
587       /* richryan */
588       bdh = BDH((*thr->acqfcn)((bufsize)size));
589       if (bdh != NULL) {
590 
591         // Mark the buffer special by setting size field of its header to zero.
592         bdh->bh.bb.bsize = 0;
593 
594         /* Mark this buffer as owned by this thread. */
595         TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
596         // because direct buffer never goes to free list
597         bdh->bh.bb.prevfree = 0;
598         bdh->tsize = size;
599 #if BufStats
600         thr->totalloc += (size_t)size;
601         thr->numget++; /* Increment number of bget() calls */
602         thr->numdget++; /* Direct bget() call count */
603 #endif
604         buf = (void *)(bdh + 1);
605         KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
606         return buf;
607       }
608 
609     } else {
610 
611       /*  Try to obtain a new expansion block */
612       void *newpool;
613 
614       KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));
615 
616       /* richryan */
617       newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
618       KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
619       if (newpool != NULL) {
620         bpool(th, newpool, thr->exp_incr);
621         buf = bget(
622             th, requested_size); /* This can't, I say, can't get into a loop. */
623         return buf;
624       }
625     }
626   }
627 
628   /*  Still no buffer available */
629 
630   return NULL;
631 }
632 
633 /*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
634                the  entire  contents  of  the buffer to zero, not just the
635                region requested by the caller. */
636 
637 static void *bgetz(kmp_info_t *th, bufsize size) {
638   char *buf = (char *)bget(th, size);
639 
640   if (buf != NULL) {
641     bhead_t *b;
642     bufsize rsize;
643 
644     b = BH(buf - sizeof(bhead_t));
645     rsize = -(b->bb.bsize);
646     if (rsize == 0) {
647       bdhead_t *bd;
648 
649       bd = BDH(buf - sizeof(bdhead_t));
650       rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
651     } else {
652       rsize -= sizeof(bhead_t);
653     }
654 
655     KMP_DEBUG_ASSERT(rsize >= size);
656 
657     (void)memset(buf, 0, (bufsize)rsize);
658   }
659   return ((void *)buf);
660 }
661 
662 /*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
663                simply in terms of brel()  and  bget().   It  could  be
664                enhanced to allow the buffer to grow into adjacent free
665                blocks and to avoid moving data unnecessarily.  */
666 
667 static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
668   void *nbuf;
669   bufsize osize; /* Old size of buffer */
670   bhead_t *b;
671 
672   nbuf = bget(th, size);
673   if (nbuf == NULL) { /* Acquire new buffer */
674     return NULL;
675   }
676   if (buf == NULL) {
677     return nbuf;
678   }
679   b = BH(((char *)buf) - sizeof(bhead_t));
680   osize = -b->bb.bsize;
681   if (osize == 0) {
682     /*  Buffer acquired directly through acqfcn. */
683     bdhead_t *bd;
684 
685     bd = BDH(((char *)buf) - sizeof(bdhead_t));
686     osize = bd->tsize - (bufsize)sizeof(bdhead_t);
687   } else {
688     osize -= sizeof(bhead_t);
689   }
690 
691   KMP_DEBUG_ASSERT(osize > 0);
692 
693   (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
694                    (size_t)((size < osize) ? size : osize));
695   brel(th, buf);
696 
697   return nbuf;
698 }
699 
700 /*  BREL  --  Release a buffer.  */
701 static void brel(kmp_info_t *th, void *buf) {
702   thr_data_t *thr = get_thr_data(th);
703   bfhead_t *b, *bn;
704   kmp_info_t *bth;
705 
706   KMP_DEBUG_ASSERT(buf != NULL);
707   KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
708 
709   b = BFH(((char *)buf) - sizeof(bhead_t));
710 
711   if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
712     bdhead_t *bdh;
713 
714     bdh = BDH(((char *)buf) - sizeof(bdhead_t));
715     KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
716 #if BufStats
717     thr->totalloc -= (size_t)bdh->tsize;
718     thr->numdrel++; /* Number of direct releases */
719     thr->numrel++; /* Increment number of brel() calls */
720 #endif /* BufStats */
721 #ifdef FreeWipe
722     (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
723 #endif /* FreeWipe */
724 
725     KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));
726 
727     KMP_DEBUG_ASSERT(thr->relfcn != 0);
728     (*thr->relfcn)((void *)bdh); /* Release it directly. */
729     return;
730   }
731 
732   bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
733                        ~1); // clear possible mark before comparison
734   if (bth != th) {
735     /* Add this buffer to be released by the owning thread later */
736     __kmp_bget_enqueue(bth, buf
737 #ifdef USE_QUEUING_LOCK_FOR_BGET
738                        ,
739                        __kmp_gtid_from_thread(th)
740 #endif
741     );
742     return;
743   }
744 
745   /* Buffer size must be negative, indicating that the buffer is allocated. */
746   if (b->bh.bb.bsize >= 0) {
747     bn = NULL;
748   }
749   KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
750 
751   /*  Back pointer in next buffer must be zero, indicating the same thing: */
752 
753   KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);
754 
755 #if BufStats
756   thr->numrel++; /* Increment number of brel() calls */
757   thr->totalloc += (size_t)b->bh.bb.bsize;
758 #endif
759 
760   /* If the back link is nonzero, the previous buffer is free.  */
761 
762   if (b->bh.bb.prevfree != 0) {
763     /* The previous buffer is free. Consolidate this buffer with it by adding
764        the length of this buffer to the previous free buffer. Note that we
765        subtract the size in the buffer being released, since it's negative to
766        indicate that the buffer is allocated. */
767     bufsize size = b->bh.bb.bsize;
768 
769     /* Make the previous buffer the one we're working on. */
770     KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
771                      b->bh.bb.prevfree);
772     b = BFH(((char *)b) - b->bh.bb.prevfree);
773     b->bh.bb.bsize -= size;
774 
775     /* unlink the buffer from the old freelist */
776     __kmp_bget_remove_from_freelist(b);
777   } else {
778     /* The previous buffer isn't allocated. Mark this buffer size as positive
779        (i.e. free) and fall through to place the buffer on the free list as an
780        isolated free block. */
781     b->bh.bb.bsize = -b->bh.bb.bsize;
782   }
783 
784   /* insert buffer back onto a new freelist */
785   __kmp_bget_insert_into_freelist(thr, b);
786 
787   /* Now we look at the next buffer in memory, located by advancing from
788      the  start  of  this  buffer  by its size, to see if that buffer is
789      free.  If it is, we combine  this  buffer  with  the  next  one  in
790      memory, dechaining the second buffer from the free list. */
791   bn = BFH(((char *)b) + b->bh.bb.bsize);
792   if (bn->bh.bb.bsize > 0) {
793 
794     /* The buffer is free.  Remove it from the free list and add
795        its size to that of our buffer. */
796     KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
797                      bn->bh.bb.bsize);
798 
799     __kmp_bget_remove_from_freelist(bn);
800 
801     b->bh.bb.bsize += bn->bh.bb.bsize;
802 
803     /* unlink the buffer from the old freelist, and reinsert it into the new
804      * freelist */
805     __kmp_bget_remove_from_freelist(b);
806     __kmp_bget_insert_into_freelist(thr, b);
807 
808     /* Finally,  advance  to   the  buffer  that   follows  the  newly
809        consolidated free block.  We must set its  backpointer  to  the
810        head  of  the  consolidated free block.  We know the next block
811        must be an allocated block because the process of recombination
812        guarantees  that  two  free  blocks will never be contiguous in
813        memory.  */
814     bn = BFH(((char *)b) + b->bh.bb.bsize);
815   }
816 #ifdef FreeWipe
817   (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
818                (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
819 #endif
820   KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
821 
822   /* The next buffer is allocated.  Set the backpointer in it  to  point
823      to this buffer; the previous free buffer in memory. */
824 
825   bn->bh.bb.prevfree = b->bh.bb.bsize;
826 
827   /*  If  a  block-release function is defined, and this free buffer
828       constitutes the entire block, release it.  Note that  pool_len
829       is  defined  in  such a way that the test will fail unless all
830       pool blocks are the same size.  */
831   if (thr->relfcn != 0 &&
832       b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
833 #if BufStats
834     if (thr->numpblk !=
835         1) { /* Do not release the last buffer until finalization time */
836 #endif
837 
838       KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
839       KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
840       KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
841                        b->bh.bb.bsize);
842 
843       /*  Unlink the buffer from the free list  */
844       __kmp_bget_remove_from_freelist(b);
845 
846       KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
847 
848       (*thr->relfcn)(b);
849 #if BufStats
850       thr->numprel++; /* Nr of expansion block releases */
851       thr->numpblk--; /* Total number of blocks */
852       KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
853 
854       // avoid leaving stale last_pool pointer around if it is being dealloced
855       if (thr->last_pool == b)
856         thr->last_pool = 0;
857     } else {
858       thr->last_pool = b;
859     }
860 #endif /* BufStats */
861   }
862 }
863 
864 /*  BECTL  --  Establish automatic pool expansion control  */
865 static void bectl(kmp_info_t *th, bget_compact_t compact,
866                   bget_acquire_t acquire, bget_release_t release,
867                   bufsize pool_incr) {
868   thr_data_t *thr = get_thr_data(th);
869 
870   thr->compfcn = compact;
871   thr->acqfcn = acquire;
872   thr->relfcn = release;
873   thr->exp_incr = pool_incr;
874 }
875 
876 /*  BPOOL  --  Add a region of memory to the buffer pool.  */
877 static void bpool(kmp_info_t *th, void *buf, bufsize len) {
878   /*    int bin = 0; */
879   thr_data_t *thr = get_thr_data(th);
880   bfhead_t *b = BFH(buf);
881   bhead_t *bn;
882 
883   __kmp_bget_dequeue(th); /* Release any queued buffers */
884 
885 #ifdef SizeQuant
886   len &= ~((bufsize)(SizeQuant - 1));
887 #endif
888   if (thr->pool_len == 0) {
889     thr->pool_len = len;
890   } else if (len != thr->pool_len) {
891     thr->pool_len = -1;
892   }
893 #if BufStats
894   thr->numpget++; /* Number of block acquisitions */
895   thr->numpblk++; /* Number of blocks total */
896   KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
897 #endif /* BufStats */
898 
899   /* Since the block is initially occupied by a single free  buffer,
900      it  had  better  not  be  (much) larger than the largest buffer
901      whose size we can store in bhead.bb.bsize. */
902   KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));
903 
904   /* Clear  the  backpointer at  the start of the block to indicate that
905      there  is  no  free  block  prior  to  this   one.    That   blocks
906      recombination when the first block in memory is released. */
907   b->bh.bb.prevfree = 0;
908 
909   /* Create a dummy allocated buffer at the end of the pool.  This dummy
910      buffer is seen when a buffer at the end of the pool is released and
911      blocks  recombination  of  the last buffer with the dummy buffer at
912      the end.  The length in the dummy buffer  is  set  to  the  largest
913      negative  number  to  denote  the  end  of  the pool for diagnostic
914      routines (this specific value is  not  counted  on  by  the  actual
915      allocation and release functions). */
916   len -= sizeof(bhead_t);
917   b->bh.bb.bsize = (bufsize)len;
918   /* Set the owner of this buffer */
919   TCW_PTR(b->bh.bb.bthr,
920           (kmp_info_t *)((kmp_uintptr_t)th |
921                          1)); // mark the buffer as allocated address
922 
923   /* Chain the new block to the free list. */
924   __kmp_bget_insert_into_freelist(thr, b);
925 
926 #ifdef FreeWipe
927   (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
928                (size_t)(len - sizeof(bfhead_t)));
929 #endif
930   bn = BH(((char *)b) + len);
931   bn->bb.prevfree = (bufsize)len;
932   /* Definition of ESent assumes two's complement! */
933   KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));
934 
935   bn->bb.bsize = ESent;
936 }
937 
938 /*  BFREED  --  Dump the free lists for this thread. */
939 static void bfreed(kmp_info_t *th) {
940   int bin = 0, count = 0;
941   int gtid = __kmp_gtid_from_thread(th);
942   thr_data_t *thr = get_thr_data(th);
943 
944 #if BufStats
945   __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
946                        " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
947                        " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
948                        " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
949                        " drel=%" KMP_INT64_SPEC "\n",
950                        gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
951                        (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
952                        (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
953                        (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
954 #endif
955 
956   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
957     bfhead_t *b;
958 
959     for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
960          b = b->ql.flink) {
961       bufsize bs = b->bh.bb.bsize;
962 
963       KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
964       KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
965       KMP_DEBUG_ASSERT(bs > 0);
966 
967       count += 1;
968 
969       __kmp_printf_no_lock(
970           "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
971           (long)bs);
972 #ifdef FreeWipe
973       {
974         char *lerr = ((char *)b) + sizeof(bfhead_t);
975         if ((bs > sizeof(bfhead_t)) &&
976             ((*lerr != 0x55) ||
977              (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
978               0))) {
979           __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
980                                "free block have been overstored.)\n",
981                                gtid);
982         }
983       }
984 #endif
985     }
986   }
987 
988   if (count == 0)
989     __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
990 }
991 
992 void __kmp_initialize_bget(kmp_info_t *th) {
993   KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
994 
995   set_thr_data(th);
996 
997   bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
998         (bufsize)__kmp_malloc_pool_incr);
999 }
1000 
1001 void __kmp_finalize_bget(kmp_info_t *th) {
1002   thr_data_t *thr;
1003   bfhead_t *b;
1004 
1005   KMP_DEBUG_ASSERT(th != 0);
1006 
1007 #if BufStats
1008   thr = (thr_data_t *)th->th.th_local.bget_data;
1009   KMP_DEBUG_ASSERT(thr != NULL);
1010   b = thr->last_pool;
1011 
1012   /*  If a block-release function is defined, and this free buffer constitutes
1013       the entire block, release it. Note that pool_len is defined in such a way
1014       that the test will fail unless all pool blocks are the same size.  */
1015 
1016   // Deallocate the last pool if one exists because we no longer do it in brel()
1017   if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
1018       b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
1019     KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
1020     KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
1021     KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
1022                      b->bh.bb.bsize);
1023 
1024     /*  Unlink the buffer from the free list  */
1025     __kmp_bget_remove_from_freelist(b);
1026 
1027     KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
1028 
1029     (*thr->relfcn)(b);
1030     thr->numprel++; /* Nr of expansion block releases */
1031     thr->numpblk--; /* Total number of blocks */
1032     KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
1033   }
1034 #endif /* BufStats */
1035 
1036   /* Deallocate bget_data */
1037   if (th->th.th_local.bget_data != NULL) {
1038     __kmp_free(th->th.th_local.bget_data);
1039     th->th.th_local.bget_data = NULL;
1040   }
1041 }
1042 
1043 void kmpc_set_poolsize(size_t size) {
1044   bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
1045         (bget_release_t)free, (bufsize)size);
1046 }
1047 
1048 size_t kmpc_get_poolsize(void) {
1049   thr_data_t *p;
1050 
1051   p = get_thr_data(__kmp_get_thread());
1052 
1053   return p->exp_incr;
1054 }
1055 
1056 void kmpc_set_poolmode(int mode) {
1057   thr_data_t *p;
1058 
1059   if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
1060       mode == bget_mode_best) {
1061     p = get_thr_data(__kmp_get_thread());
1062     p->mode = (bget_mode_t)mode;
1063   }
1064 }
1065 
1066 int kmpc_get_poolmode(void) {
1067   thr_data_t *p;
1068 
1069   p = get_thr_data(__kmp_get_thread());
1070 
1071   return p->mode;
1072 }
1073 
1074 void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
1075   kmp_info_t *th = __kmp_get_thread();
1076   bufsize a, b;
1077 
1078   __kmp_bget_dequeue(th); /* Release any queued buffers */
1079 
1080   bcheck(th, &a, &b);
1081 
1082   *maxmem = a;
1083   *allmem = b;
1084 }
1085 
1086 void kmpc_poolprint(void) {
1087   kmp_info_t *th = __kmp_get_thread();
1088 
1089   __kmp_bget_dequeue(th); /* Release any queued buffers */
1090 
1091   bfreed(th);
1092 }
1093 
1094 #endif // #if KMP_USE_BGET
1095 
1096 void *kmpc_malloc(size_t size) {
1097   void *ptr;
1098   ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1099   if (ptr != NULL) {
1100     // save allocated pointer just before one returned to user
1101     *(void **)ptr = ptr;
1102     ptr = (void **)ptr + 1;
1103   }
1104   return ptr;
1105 }
1106 
1107 #define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
1108 
1109 void *kmpc_aligned_malloc(size_t size, size_t alignment) {
1110   void *ptr;
1111   void *ptr_allocated;
1112   KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
1113   if (!IS_POWER_OF_TWO(alignment)) {
1114     // AC: do we need to issue a warning here?
1115     errno = EINVAL;
1116     return NULL;
1117   }
1118   size = size + sizeof(void *) + alignment;
1119   ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
1120   if (ptr_allocated != NULL) {
1121     // save allocated pointer just before one returned to user
1122     ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
1123                    ~(alignment - 1));
1124     *((void **)ptr - 1) = ptr_allocated;
1125   } else {
1126     ptr = NULL;
1127   }
1128   return ptr;
1129 }
1130 
1131 void *kmpc_calloc(size_t nelem, size_t elsize) {
1132   void *ptr;
1133   ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
1134   if (ptr != NULL) {
1135     // save allocated pointer just before one returned to user
1136     *(void **)ptr = ptr;
1137     ptr = (void **)ptr + 1;
1138   }
1139   return ptr;
1140 }
1141 
1142 void *kmpc_realloc(void *ptr, size_t size) {
1143   void *result = NULL;
1144   if (ptr == NULL) {
1145     // If pointer is NULL, realloc behaves like malloc.
1146     result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1147     // save allocated pointer just before one returned to user
1148     if (result != NULL) {
1149       *(void **)result = result;
1150       result = (void **)result + 1;
1151     }
1152   } else if (size == 0) {
1153     // If size is 0, realloc behaves like free.
1154     // The thread must be registered by the call to kmpc_malloc() or
1155     // kmpc_calloc() before.
1156     // So it should be safe to call __kmp_get_thread(), not
1157     // __kmp_entry_thread().
1158     KMP_ASSERT(*((void **)ptr - 1));
1159     brel(__kmp_get_thread(), *((void **)ptr - 1));
1160   } else {
1161     result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
1162                    (bufsize)(size + sizeof(ptr)));
1163     if (result != NULL) {
1164       *(void **)result = result;
1165       result = (void **)result + 1;
1166     }
1167   }
1168   return result;
1169 }
1170 
1171 // NOTE: the library must have already been initialized by a previous allocate
1172 void kmpc_free(void *ptr) {
1173   if (!__kmp_init_serial) {
1174     return;
1175   }
1176   if (ptr != NULL) {
1177     kmp_info_t *th = __kmp_get_thread();
1178     __kmp_bget_dequeue(th); /* Release any queued buffers */
1179     // extract allocated pointer and free it
1180     KMP_ASSERT(*((void **)ptr - 1));
1181     brel(th, *((void **)ptr - 1));
1182   }
1183 }
1184 
1185 void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
1186   void *ptr;
1187   KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
1188                 (int)size KMP_SRC_LOC_PARM));
1189   ptr = bget(th, (bufsize)size);
1190   KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
1191   return ptr;
1192 }
1193 
1194 void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
1195                            size_t elsize KMP_SRC_LOC_DECL) {
1196   void *ptr;
1197   KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
1198                 (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
1199   ptr = bgetz(th, (bufsize)(nelem * elsize));
1200   KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
1201   return ptr;
1202 }
1203 
1204 void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
1205                             size_t size KMP_SRC_LOC_DECL) {
1206   KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
1207                 ptr, (int)size KMP_SRC_LOC_PARM));
1208   ptr = bgetr(th, ptr, (bufsize)size);
1209   KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
1210   return ptr;
1211 }
1212 
1213 void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
1214   KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
1215                 ptr KMP_SRC_LOC_PARM));
1216   if (ptr != NULL) {
1217     __kmp_bget_dequeue(th); /* Release any queued buffers */
1218     brel(th, ptr);
1219   }
1220   KE_TRACE(30, ("<- __kmp_thread_free()\n"));
1221 }
1222 
/* OMP 5.0 Memory Management support */
// Name of the memkind shared library and the handle obtained for it; both are
// set by __kmp_init_memkind().
static const char *kmp_mk_lib_name;
static void *h_memkind;
/* memkind experimental API, resolved at run time by __kmp_init_memkind(): */
// memkind_malloc
static void *(*kmp_mk_alloc)(void *k, size_t sz);
// memkind_free
static void (*kmp_mk_free)(void *kind, void *ptr);
// memkind_check_available
static int (*kmp_mk_check)(void *kind);
// kinds we are going to use; each is the address of the corresponding
// MEMKIND_* symbol, or NULL when the kind is missing or unavailable
static void **mk_default;
static void **mk_interleave;
static void **mk_hbw;
static void **mk_hbw_interleave;
static void **mk_hbw_preferred;
static void **mk_hugetlb;
static void **mk_hbw_hugetlb;
static void **mk_hbw_preferred_hugetlb;
static void **mk_dax_kmem;
static void **mk_dax_kmem_all;
static void **mk_dax_kmem_preferred;
// Target memory entry points (llvm_omp_target_* symbols), resolved by
// __kmp_init_target_mem(); NULL when a symbol could not be found.
static void *(*kmp_target_alloc_host)(size_t size, int device);
static void *(*kmp_target_alloc_shared)(size_t size, int device);
static void *(*kmp_target_alloc_device)(size_t size, int device);
static void *(*kmp_target_lock_mem)(void *ptr, size_t size, int device);
static void *(*kmp_target_unlock_mem)(void *ptr, int device);
static void *(*kmp_target_free_host)(void *ptr, int device);
static void *(*kmp_target_free_shared)(void *ptr, int device);
static void *(*kmp_target_free_device)(void *ptr, int device);
// True when all six target alloc/free entry points above were resolved.
static bool __kmp_target_mem_available;
// True when MS is one of the three llvm_omp_target_* memory spaces.
// The argument is parenthesized so that expression arguments expand safely;
// note that it is evaluated up to three times.
#define KMP_IS_TARGET_MEM_SPACE(MS)                                            \
  ((MS) == llvm_omp_target_host_mem_space ||                                   \
   (MS) == llvm_omp_target_shared_mem_space ||                                 \
   (MS) == llvm_omp_target_device_mem_space)
// True when MA is one of the three llvm_omp_target_* predefined allocators.
// Same expansion caveats as KMP_IS_TARGET_MEM_SPACE.
#define KMP_IS_TARGET_MEM_ALLOC(MA)                                            \
  ((MA) == llvm_omp_target_host_mem_alloc ||                                   \
   (MA) == llvm_omp_target_shared_mem_alloc ||                                 \
   (MA) == llvm_omp_target_device_mem_alloc)
1262 
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
// Validate one memkind kind slot: if the symbol was found but
// kmp_mk_check() returns non-zero for it (kind not available or error),
// reset the slot to NULL.
static inline void chk_kind(void ***pkind) {
  KMP_DEBUG_ASSERT(pkind);
  if (*pkind == NULL)
    return; // symbol not found, nothing to verify
  if (kmp_mk_check(**pkind) != 0)
    *pkind = NULL;
}
#endif
1271 
1272 void __kmp_init_memkind() {
1273 // as of 2018-07-31 memkind does not support Windows*, exclude it for now
1274 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
1275   // use of statically linked memkind is problematic, as it depends on libnuma
1276   kmp_mk_lib_name = "libmemkind.so";
1277   h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY);
1278   if (h_memkind) {
1279     kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available");
1280     kmp_mk_alloc =
1281         (void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc");
1282     kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free");
1283     mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT");
1284     if (kmp_mk_check && kmp_mk_alloc && kmp_mk_free && mk_default &&
1285         !kmp_mk_check(*mk_default)) {
1286       __kmp_memkind_available = 1;
1287       mk_interleave = (void **)dlsym(h_memkind, "MEMKIND_INTERLEAVE");
1288       chk_kind(&mk_interleave);
1289       mk_hbw = (void **)dlsym(h_memkind, "MEMKIND_HBW");
1290       chk_kind(&mk_hbw);
1291       mk_hbw_interleave = (void **)dlsym(h_memkind, "MEMKIND_HBW_INTERLEAVE");
1292       chk_kind(&mk_hbw_interleave);
1293       mk_hbw_preferred = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED");
1294       chk_kind(&mk_hbw_preferred);
1295       mk_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HUGETLB");
1296       chk_kind(&mk_hugetlb);
1297       mk_hbw_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_HUGETLB");
1298       chk_kind(&mk_hbw_hugetlb);
1299       mk_hbw_preferred_hugetlb =
1300           (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB");
1301       chk_kind(&mk_hbw_preferred_hugetlb);
1302       mk_dax_kmem = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM");
1303       chk_kind(&mk_dax_kmem);
1304       mk_dax_kmem_all = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_ALL");
1305       chk_kind(&mk_dax_kmem_all);
1306       mk_dax_kmem_preferred =
1307           (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_PREFERRED");
1308       chk_kind(&mk_dax_kmem_preferred);
1309       KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n"));
1310       return; // success
1311     }
1312     dlclose(h_memkind); // failure
1313   }
1314 #else // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB)
1315   kmp_mk_lib_name = "";
1316 #endif // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB)
1317   h_memkind = NULL;
1318   kmp_mk_check = NULL;
1319   kmp_mk_alloc = NULL;
1320   kmp_mk_free = NULL;
1321   mk_default = NULL;
1322   mk_interleave = NULL;
1323   mk_hbw = NULL;
1324   mk_hbw_interleave = NULL;
1325   mk_hbw_preferred = NULL;
1326   mk_hugetlb = NULL;
1327   mk_hbw_hugetlb = NULL;
1328   mk_hbw_preferred_hugetlb = NULL;
1329   mk_dax_kmem = NULL;
1330   mk_dax_kmem_all = NULL;
1331   mk_dax_kmem_preferred = NULL;
1332 }
1333 
// Close the memkind library handle (if one is open) and reset all memkind
// function and kind pointers to NULL. Does nothing on configurations where
// the guarded section is compiled out.
void __kmp_fini_memkind() {
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
  if (__kmp_memkind_available) {
    KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n"));
  }
  if (h_memkind) {
    dlclose(h_memkind);
    h_memkind = NULL;
  }
  kmp_mk_check = NULL;
  kmp_mk_alloc = NULL;
  kmp_mk_free = NULL;
  mk_default = mk_interleave = NULL;
  mk_hbw = mk_hbw_interleave = mk_hbw_preferred = NULL;
  mk_hugetlb = mk_hbw_hugetlb = mk_hbw_preferred_hugetlb = NULL;
  mk_dax_kmem = mk_dax_kmem_all = mk_dax_kmem_preferred = NULL;
#endif
}
1358 
1359 void __kmp_init_target_mem() {
1360   *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");
1361   *(void **)(&kmp_target_alloc_shared) =
1362       KMP_DLSYM("llvm_omp_target_alloc_shared");
1363   *(void **)(&kmp_target_alloc_device) =
1364       KMP_DLSYM("llvm_omp_target_alloc_device");
1365   *(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host");
1366   *(void **)(&kmp_target_free_shared) =
1367       KMP_DLSYM("llvm_omp_target_free_shared");
1368   *(void **)(&kmp_target_free_device) =
1369       KMP_DLSYM("llvm_omp_target_free_device");
1370   __kmp_target_mem_available =
1371       kmp_target_alloc_host && kmp_target_alloc_shared &&
1372       kmp_target_alloc_device && kmp_target_free_host &&
1373       kmp_target_free_shared && kmp_target_free_device;
1374   // lock/pin and unlock/unpin target calls
1375   *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
1376   *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
1377 }
1378 
1379 omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
1380                                              int ntraits,
1381                                              omp_alloctrait_t traits[]) {
1382   // OpenMP 5.0 only allows predefined memspaces
1383   KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space ||
1384                    ms == omp_large_cap_mem_space || ms == omp_const_mem_space ||
1385                    ms == omp_high_bw_mem_space || KMP_IS_TARGET_MEM_SPACE(ms));
1386   kmp_allocator_t *al;
1387   int i;
1388   al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
1389   al->memspace = ms; // not used currently
1390   for (i = 0; i < ntraits; ++i) {
1391     switch (traits[i].key) {
1392     case omp_atk_sync_hint:
1393     case omp_atk_access:
1394       break;
1395     case omp_atk_pinned:
1396       al->pinned = true;
1397       break;
1398     case omp_atk_alignment:
1399       __kmp_type_convert(traits[i].value, &(al->alignment));
1400       KMP_ASSERT(IS_POWER_OF_TWO(al->alignment));
1401       break;
1402     case omp_atk_pool_size:
1403       al->pool_size = traits[i].value;
1404       break;
1405     case omp_atk_fallback:
1406       al->fb = (omp_alloctrait_value_t)traits[i].value;
1407       KMP_DEBUG_ASSERT(
1408           al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb ||
1409           al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb);
1410       break;
1411     case omp_atk_fb_data:
1412       al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);
1413       break;
1414     case omp_atk_partition:
1415       al->memkind = RCAST(void **, traits[i].value);
1416       break;
1417     default:
1418       KMP_ASSERT2(0, "Unexpected allocator trait");
1419     }
1420   }
1421   if (al->fb == 0) {
1422     // set default allocator
1423     al->fb = omp_atv_default_mem_fb;
1424     al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1425   } else if (al->fb == omp_atv_allocator_fb) {
1426     KMP_ASSERT(al->fb_data != NULL);
1427   } else if (al->fb == omp_atv_default_mem_fb) {
1428     al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1429   }
1430   if (__kmp_memkind_available) {
1431     // Let's use memkind library if available
1432     if (ms == omp_high_bw_mem_space) {
1433       if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) {
1434         al->memkind = mk_hbw_interleave;
1435       } else if (mk_hbw_preferred) {
1436         // AC: do not try to use MEMKIND_HBW for now, because memkind library
1437         // cannot reliably detect exhaustion of HBW memory.
1438         // It could be possible using hbw_verify_memory_region() but memkind
1439         // manual says: "Using this function in production code may result in
1440         // serious performance penalty".
1441         al->memkind = mk_hbw_preferred;
1442       } else {
1443         // HBW is requested but not available --> return NULL allocator
1444         __kmp_free(al);
1445         return omp_null_allocator;
1446       }
1447     } else if (ms == omp_large_cap_mem_space) {
1448       if (mk_dax_kmem_all) {
1449         // All pmem nodes are visited
1450         al->memkind = mk_dax_kmem_all;
1451       } else if (mk_dax_kmem) {
1452         // Only closest pmem node is visited
1453         al->memkind = mk_dax_kmem;
1454       } else {
1455         __kmp_free(al);
1456         return omp_null_allocator;
1457       }
1458     } else {
1459       if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) {
1460         al->memkind = mk_interleave;
1461       } else {
1462         al->memkind = mk_default;
1463       }
1464     }
1465   } else if (KMP_IS_TARGET_MEM_SPACE(ms) && !__kmp_target_mem_available) {
1466     __kmp_free(al);
1467     return omp_null_allocator;
1468   } else {
1469     if (ms == omp_high_bw_mem_space) {
1470       // cannot detect HBW memory presence without memkind library
1471       __kmp_free(al);
1472       return omp_null_allocator;
1473     }
1474   }
1475   return (omp_allocator_handle_t)al;
1476 }
1477 
1478 void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) {
1479   if (allocator > kmp_max_mem_alloc)
1480     __kmp_free(allocator);
1481 }
1482 
1483 void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) {
1484   if (allocator == omp_null_allocator)
1485     allocator = omp_default_mem_alloc;
1486   __kmp_threads[gtid]->th.th_def_allocator = allocator;
1487 }
1488 
1489 omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) {
1490   return __kmp_threads[gtid]->th.th_def_allocator;
1491 }
1492 
1493 typedef struct kmp_mem_desc { // Memory block descriptor
1494   void *ptr_alloc; // Pointer returned by allocator
1495   size_t size_a; // Size of allocated memory block (initial+descriptor+align)
1496   size_t size_orig; // Original size requested
1497   void *ptr_align; // Pointer to aligned memory, returned
1498   kmp_allocator_t *allocator; // allocator
1499 } kmp_mem_desc_t;
1500 static int alignment = sizeof(void *); // align to pointer size by default
1501 
1502 // external interfaces are wrappers over internal implementation
1503 void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
1504   KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
1505   void *ptr = __kmp_alloc(gtid, 0, size, allocator);
1506   KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid));
1507   return ptr;
1508 }
1509 
1510 void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size,
1511                            omp_allocator_handle_t allocator) {
1512   KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn,
1513                 (int)size, allocator));
1514   void *ptr = __kmp_alloc(gtid, algn, size, allocator);
1515   KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid));
1516   return ptr;
1517 }
1518 
1519 void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
1520                     omp_allocator_handle_t allocator) {
1521   KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb,
1522                 (int)size, allocator));
1523   void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator);
1524   KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid));
1525   return ptr;
1526 }
1527 
1528 void *__kmpc_realloc(int gtid, void *ptr, size_t size,
1529                      omp_allocator_handle_t allocator,
1530                      omp_allocator_handle_t free_allocator) {
1531   KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size,
1532                 allocator, free_allocator));
1533   void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator);
1534   KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid));
1535   return nptr;
1536 }
1537 
1538 void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
1539   KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
1540   ___kmpc_free(gtid, ptr, allocator);
1541   KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator));
1542   return;
1543 }
1544 
1545 // internal implementation, called from inside the library
1546 void *__kmp_alloc(int gtid, size_t algn, size_t size,
1547                   omp_allocator_handle_t allocator) {
1548   void *ptr = NULL;
1549   kmp_allocator_t *al;
1550   KMP_DEBUG_ASSERT(__kmp_init_serial);
1551   if (size == 0)
1552     return NULL;
1553   if (allocator == omp_null_allocator)
1554     allocator = __kmp_threads[gtid]->th.th_def_allocator;
1555   kmp_int32 default_device =
1556       __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1557 
1558   al = RCAST(kmp_allocator_t *, allocator);
1559 
1560   int sz_desc = sizeof(kmp_mem_desc_t);
1561   kmp_mem_desc_t desc;
1562   kmp_uintptr_t addr; // address returned by allocator
1563   kmp_uintptr_t addr_align; // address to return to caller
1564   kmp_uintptr_t addr_descr; // address of memory block descriptor
1565   size_t align = alignment; // default alignment
1566   if (allocator > kmp_max_mem_alloc && al->alignment > align)
1567     align = al->alignment; // alignment required by allocator trait
1568   if (align < algn)
1569     align = algn; // max of allocator trait, parameter and sizeof(void*)
1570   desc.size_orig = size;
1571   desc.size_a = size + sz_desc + align;
1572   bool is_pinned = false;
1573   if (allocator > kmp_max_mem_alloc)
1574     is_pinned = al->pinned;
1575 
1576   // Use default allocator if libmemkind is not available
1577   int use_default_allocator = (__kmp_memkind_available) ? false : true;
1578 
1579   if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
1580     // Use size input directly as the memory may not be accessible on host.
1581     // Use default device for now.
1582     if (__kmp_target_mem_available) {
1583       kmp_int32 device =
1584           __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1585       if (allocator == llvm_omp_target_host_mem_alloc)
1586         ptr = kmp_target_alloc_host(size, device);
1587       else if (allocator == llvm_omp_target_shared_mem_alloc)
1588         ptr = kmp_target_alloc_shared(size, device);
1589       else // allocator == llvm_omp_target_device_mem_alloc
1590         ptr = kmp_target_alloc_device(size, device);
1591       return ptr;
1592     } else {
1593       KMP_INFORM(TargetMemNotAvailable);
1594     }
1595   }
1596 
1597   if (allocator >= kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)) {
1598     if (__kmp_target_mem_available) {
1599       kmp_int32 device =
1600           __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1601       if (al->memspace == llvm_omp_target_host_mem_space)
1602         ptr = kmp_target_alloc_host(size, device);
1603       else if (al->memspace == llvm_omp_target_shared_mem_space)
1604         ptr = kmp_target_alloc_shared(size, device);
1605       else // al->memspace == llvm_omp_target_device_mem_space
1606         ptr = kmp_target_alloc_device(size, device);
1607       return ptr;
1608     } else {
1609       KMP_INFORM(TargetMemNotAvailable);
1610     }
1611   }
1612 
1613   if (__kmp_memkind_available) {
1614     if (allocator < kmp_max_mem_alloc) {
1615       // pre-defined allocator
1616       if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) {
1617         ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a);
1618       } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
1619         ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a);
1620       } else {
1621         ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1622       }
1623     } else if (al->pool_size > 0) {
1624       // custom allocator with pool size requested
1625       kmp_uint64 used =
1626           KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
1627       if (used + desc.size_a > al->pool_size) {
1628         // not enough space, need to go fallback path
1629         KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1630         if (al->fb == omp_atv_default_mem_fb) {
1631           al = (kmp_allocator_t *)omp_default_mem_alloc;
1632           ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1633         } else if (al->fb == omp_atv_abort_fb) {
1634           KMP_ASSERT(0); // abort fallback requested
1635         } else if (al->fb == omp_atv_allocator_fb) {
1636           KMP_ASSERT(al != al->fb_data);
1637           al = al->fb_data;
1638           ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
1639           if (is_pinned && kmp_target_lock_mem)
1640             kmp_target_lock_mem(ptr, size, default_device);
1641           return ptr;
1642         } // else ptr == NULL;
1643       } else {
1644         // pool has enough space
1645         ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
1646         if (ptr == NULL) {
1647           if (al->fb == omp_atv_default_mem_fb) {
1648             al = (kmp_allocator_t *)omp_default_mem_alloc;
1649             ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1650           } else if (al->fb == omp_atv_abort_fb) {
1651             KMP_ASSERT(0); // abort fallback requested
1652           } else if (al->fb == omp_atv_allocator_fb) {
1653             KMP_ASSERT(al != al->fb_data);
1654             al = al->fb_data;
1655             ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
1656             if (is_pinned && kmp_target_lock_mem)
1657               kmp_target_lock_mem(ptr, size, default_device);
1658             return ptr;
1659           }
1660         }
1661       }
1662     } else {
1663       // custom allocator, pool size not requested
1664       ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
1665       if (ptr == NULL) {
1666         if (al->fb == omp_atv_default_mem_fb) {
1667           al = (kmp_allocator_t *)omp_default_mem_alloc;
1668           ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1669         } else if (al->fb == omp_atv_abort_fb) {
1670           KMP_ASSERT(0); // abort fallback requested
1671         } else if (al->fb == omp_atv_allocator_fb) {
1672           KMP_ASSERT(al != al->fb_data);
1673           al = al->fb_data;
1674           ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
1675           if (is_pinned && kmp_target_lock_mem)
1676             kmp_target_lock_mem(ptr, size, default_device);
1677           return ptr;
1678         }
1679       }
1680     }
1681   } else if (allocator < kmp_max_mem_alloc) {
1682     // pre-defined allocator
1683     if (allocator == omp_high_bw_mem_alloc) {
1684       KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");
1685     } else if (allocator == omp_large_cap_mem_alloc) {
1686       KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc");
1687     } else if (allocator == omp_const_mem_alloc) {
1688       KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc");
1689     } else if (allocator == omp_low_lat_mem_alloc) {
1690       KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc");
1691     } else if (allocator == omp_cgroup_mem_alloc) {
1692       KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc");
1693     } else if (allocator == omp_pteam_mem_alloc) {
1694       KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc");
1695     } else if (allocator == omp_thread_mem_alloc) {
1696       KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc");
1697     } else { // default allocator requested
1698       use_default_allocator = true;
1699     }
1700     if (use_default_allocator) {
1701       ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1702       use_default_allocator = false;
1703     }
1704   } else if (al->pool_size > 0) {
1705     // custom allocator with pool size requested
1706     kmp_uint64 used =
1707         KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
1708     if (used + desc.size_a > al->pool_size) {
1709       // not enough space, need to go fallback path
1710       KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1711       if (al->fb == omp_atv_default_mem_fb) {
1712         al = (kmp_allocator_t *)omp_default_mem_alloc;
1713         ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1714       } else if (al->fb == omp_atv_abort_fb) {
1715         KMP_ASSERT(0); // abort fallback requested
1716       } else if (al->fb == omp_atv_allocator_fb) {
1717         KMP_ASSERT(al != al->fb_data);
1718         al = al->fb_data;
1719         ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
1720         if (is_pinned && kmp_target_lock_mem)
1721           kmp_target_lock_mem(ptr, size, default_device);
1722         return ptr;
1723       } // else ptr == NULL;
1724     } else {
1725       // pool has enough space
1726       ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1727       if (ptr == NULL && al->fb == omp_atv_abort_fb) {
1728         KMP_ASSERT(0); // abort fallback requested
1729       } // no sense to look for another fallback because of same internal alloc
1730     }
1731   } else {
1732     // custom allocator, pool size not requested
1733     ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1734     if (ptr == NULL && al->fb == omp_atv_abort_fb) {
1735       KMP_ASSERT(0); // abort fallback requested
1736     } // no sense to look for another fallback because of same internal alloc
1737   }
1738   KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a));
1739   if (ptr == NULL)
1740     return NULL;
1741 
1742   if (is_pinned && kmp_target_lock_mem)
1743     kmp_target_lock_mem(ptr, desc.size_a, default_device);
1744 
1745   addr = (kmp_uintptr_t)ptr;
1746   addr_align = (addr + sz_desc + align - 1) & ~(align - 1);
1747   addr_descr = addr_align - sz_desc;
1748 
1749   desc.ptr_alloc = ptr;
1750   desc.ptr_align = (void *)addr_align;
1751   desc.allocator = al;
1752   *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents
1753   KMP_MB();
1754 
1755   return desc.ptr_align;
1756 }
1757 
1758 void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size,
1759                    omp_allocator_handle_t allocator) {
1760   void *ptr = NULL;
1761   kmp_allocator_t *al;
1762   KMP_DEBUG_ASSERT(__kmp_init_serial);
1763 
1764   if (allocator == omp_null_allocator)
1765     allocator = __kmp_threads[gtid]->th.th_def_allocator;
1766 
1767   al = RCAST(kmp_allocator_t *, allocator);
1768 
1769   if (nmemb == 0 || size == 0)
1770     return ptr;
1771 
1772   if ((SIZE_MAX - sizeof(kmp_mem_desc_t)) / size < nmemb) {
1773     if (al->fb == omp_atv_abort_fb) {
1774       KMP_ASSERT(0);
1775     }
1776     return ptr;
1777   }
1778 
1779   ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator);
1780 
1781   if (ptr) {
1782     memset(ptr, 0x00, nmemb * size);
1783   }
1784   return ptr;
1785 }
1786 
1787 void *__kmp_realloc(int gtid, void *ptr, size_t size,
1788                     omp_allocator_handle_t allocator,
1789                     omp_allocator_handle_t free_allocator) {
1790   void *nptr = NULL;
1791   KMP_DEBUG_ASSERT(__kmp_init_serial);
1792 
1793   if (size == 0) {
1794     if (ptr != NULL)
1795       ___kmpc_free(gtid, ptr, free_allocator);
1796     return nptr;
1797   }
1798 
1799   nptr = __kmp_alloc(gtid, 0, size, allocator);
1800 
1801   if (nptr != NULL && ptr != NULL) {
1802     kmp_mem_desc_t desc;
1803     kmp_uintptr_t addr_align; // address to return to caller
1804     kmp_uintptr_t addr_descr; // address of memory block descriptor
1805 
1806     addr_align = (kmp_uintptr_t)ptr;
1807     addr_descr = addr_align - sizeof(kmp_mem_desc_t);
1808     desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
1809 
1810     KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
1811     KMP_DEBUG_ASSERT(desc.size_orig > 0);
1812     KMP_DEBUG_ASSERT(desc.size_orig < desc.size_a);
1813     KMP_MEMCPY((char *)nptr, (char *)ptr,
1814                (size_t)((size < desc.size_orig) ? size : desc.size_orig));
1815   }
1816 
1817   if (nptr != NULL) {
1818     ___kmpc_free(gtid, ptr, free_allocator);
1819   }
1820 
1821   return nptr;
1822 }
1823 
1824 void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
1825   if (ptr == NULL)
1826     return;
1827 
1828   kmp_allocator_t *al;
1829   omp_allocator_handle_t oal;
1830   al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
1831   kmp_mem_desc_t desc;
1832   kmp_uintptr_t addr_align; // address to return to caller
1833   kmp_uintptr_t addr_descr; // address of memory block descriptor
1834   if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
1835                                      (allocator > kmp_max_mem_alloc &&
1836                                       KMP_IS_TARGET_MEM_SPACE(al->memspace)))) {
1837     kmp_int32 device =
1838         __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1839     if (allocator == llvm_omp_target_host_mem_alloc) {
1840       kmp_target_free_host(ptr, device);
1841     } else if (allocator == llvm_omp_target_shared_mem_alloc) {
1842       kmp_target_free_shared(ptr, device);
1843     } else if (allocator == llvm_omp_target_device_mem_alloc) {
1844       kmp_target_free_device(ptr, device);
1845     }
1846     return;
1847   }
1848 
1849   addr_align = (kmp_uintptr_t)ptr;
1850   addr_descr = addr_align - sizeof(kmp_mem_desc_t);
1851   desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
1852 
1853   KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
1854   if (allocator) {
1855     KMP_DEBUG_ASSERT(desc.allocator == al || desc.allocator == al->fb_data);
1856   }
1857   al = desc.allocator;
1858   oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
1859   KMP_DEBUG_ASSERT(al);
1860 
1861   if (allocator > kmp_max_mem_alloc && kmp_target_unlock_mem && al->pinned) {
1862     kmp_int32 device =
1863         __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
1864     kmp_target_unlock_mem(desc.ptr_alloc, device);
1865   }
1866 
1867   if (__kmp_memkind_available) {
1868     if (oal < kmp_max_mem_alloc) {
1869       // pre-defined allocator
1870       if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
1871         kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
1872       } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
1873         kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
1874       } else {
1875         kmp_mk_free(*mk_default, desc.ptr_alloc);
1876       }
1877     } else {
1878       if (al->pool_size > 0) { // custom allocator with pool size requested
1879         kmp_uint64 used =
1880             KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1881         (void)used; // to suppress compiler warning
1882         KMP_DEBUG_ASSERT(used >= desc.size_a);
1883       }
1884       kmp_mk_free(*al->memkind, desc.ptr_alloc);
1885     }
1886   } else {
1887     if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
1888       kmp_uint64 used =
1889           KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1890       (void)used; // to suppress compiler warning
1891       KMP_DEBUG_ASSERT(used >= desc.size_a);
1892     }
1893     __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
1894   }
1895 }
1896 
1897 /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
1898    memory leaks, but it may be useful for debugging memory corruptions, used
1899    freed pointers, etc. */
1900 /* #define LEAK_MEMORY */
// Descriptor written immediately below every aligned block handed out by
// ___kmp_allocate_align(). The fast-memory routines in this file store the
// same record in front of their blocks but reuse two fields: ptr_aligned
// holds the allocating thread and size_allocated holds a queue length.
typedef struct kmp_mem_descr {
  // Raw pointer obtained from the underlying allocator; this is the value
  // that must be handed back when the block is released.
  void *ptr_allocated;
  // Total number of bytes requested from the underlying allocator.
  size_t size_allocated;
  // Aligned pointer given to client code.
  void *ptr_aligned;
  // Number of usable bytes starting at ptr_aligned.
  size_t size_aligned;
} kmp_mem_descr_t;
1908 
1909 /* Allocate memory on requested boundary, fill allocated memory with 0x00.
1910    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1911    error. Must use __kmp_free when freeing memory allocated by this routine! */
1912 static void *___kmp_allocate_align(size_t size,
1913                                    size_t alignment KMP_SRC_LOC_DECL) {
1914   /* __kmp_allocate() allocates (by call to malloc()) bigger memory block than
1915      requested to return properly aligned pointer. Original pointer returned
1916      by malloc() and size of allocated block is saved in descriptor just
1917      before the aligned pointer. This information used by __kmp_free() -- it
1918      has to pass to free() original pointer, not aligned one.
1919 
1920           +---------+------------+-----------------------------------+---------+
1921           | padding | descriptor |           aligned block           | padding |
1922           +---------+------------+-----------------------------------+---------+
1923           ^                      ^
1924           |                      |
1925           |                      +- Aligned pointer returned to caller
1926           +- Pointer returned by malloc()
1927 
1928       Aligned block is filled with zeros, paddings are filled with 0xEF. */
1929 
1930   kmp_mem_descr_t descr;
1931   kmp_uintptr_t addr_allocated; // Address returned by malloc().
1932   kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
1933   kmp_uintptr_t addr_descr; // Address of memory block descriptor.
1934 
1935   KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
1936                 (int)size, (int)alignment KMP_SRC_LOC_PARM));
1937 
1938   KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too
1939   KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
1940   // Make sure kmp_uintptr_t is enough to store addresses.
1941 
1942   descr.size_aligned = size;
1943   descr.size_allocated =
1944       descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;
1945 
1946 #if KMP_DEBUG
1947   descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
1948 #else
1949   descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
1950 #endif
1951   KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
1952                 descr.ptr_allocated));
1953   if (descr.ptr_allocated == NULL) {
1954     KMP_FATAL(OutOfHeapMemory);
1955   }
1956 
1957   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
1958   addr_aligned =
1959       (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
1960   addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
1961 
1962   descr.ptr_aligned = (void *)addr_aligned;
1963 
1964   KE_TRACE(26, ("   ___kmp_allocate_align: "
1965                 "ptr_allocated=%p, size_allocated=%d, "
1966                 "ptr_aligned=%p, size_aligned=%d\n",
1967                 descr.ptr_allocated, (int)descr.size_allocated,
1968                 descr.ptr_aligned, (int)descr.size_aligned));
1969 
1970   KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
1971   KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
1972   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
1973                    addr_allocated + descr.size_allocated);
1974   KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
1975 #ifdef KMP_DEBUG
1976   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
1977 // Fill allocated memory block with 0xEF.
1978 #endif
1979   memset(descr.ptr_aligned, 0x00, descr.size_aligned);
1980   // Fill the aligned memory block (which is intended for using by caller) with
1981   // 0x00. Do not
1982   // put this filling under KMP_DEBUG condition! Many callers expect zeroed
1983   // memory. (Padding
1984   // bytes remain filled with 0xEF in debugging library.)
1985   *((kmp_mem_descr_t *)addr_descr) = descr;
1986 
1987   KMP_MB();
1988 
1989   KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
1990   return descr.ptr_aligned;
1991 } // func ___kmp_allocate_align
1992 
1993 /* Allocate memory on cache line boundary, fill allocated memory with 0x00.
1994    Do not call this func directly! Use __kmp_allocate macro instead.
1995    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1996    error. Must use __kmp_free when freeing memory allocated by this routine! */
1997 void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
1998   void *ptr;
1999   KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
2000                 (int)size KMP_SRC_LOC_PARM));
2001   ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
2002   KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
2003   return ptr;
2004 } // func ___kmp_allocate
2005 
2006 /* Allocate memory on page boundary, fill allocated memory with 0x00.
2007    Does not call this func directly! Use __kmp_page_allocate macro instead.
2008    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
2009    error. Must use __kmp_free when freeing memory allocated by this routine! */
2010 void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
2011   int page_size = 8 * 1024;
2012   void *ptr;
2013 
2014   KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
2015                 (int)size KMP_SRC_LOC_PARM));
2016   ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
2017   KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
2018   return ptr;
2019 } // ___kmp_page_allocate
2020 
2021 /* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
2022    In debug mode, fill the memory block with 0xEF before call to free(). */
2023 void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
2024   kmp_mem_descr_t descr;
2025 #if KMP_DEBUG
2026   kmp_uintptr_t addr_allocated; // Address returned by malloc().
2027   kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
2028 #endif
2029   KE_TRACE(25,
2030            ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
2031   KMP_ASSERT(ptr != NULL);
2032 
2033   descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));
2034 
2035   KE_TRACE(26, ("   __kmp_free:     "
2036                 "ptr_allocated=%p, size_allocated=%d, "
2037                 "ptr_aligned=%p, size_aligned=%d\n",
2038                 descr.ptr_allocated, (int)descr.size_allocated,
2039                 descr.ptr_aligned, (int)descr.size_aligned));
2040 #if KMP_DEBUG
2041   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
2042   addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
2043   KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
2044   KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
2045   KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
2046   KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
2047   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
2048                    addr_allocated + descr.size_allocated);
2049   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
2050 // Fill memory block with 0xEF, it helps catch using freed memory.
2051 #endif
2052 
2053 #ifndef LEAK_MEMORY
2054   KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
2055 #ifdef KMP_DEBUG
2056   _free_src_loc(descr.ptr_allocated, _file_, _line_);
2057 #else
2058   free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
2059 #endif
2060 #endif
2061   KMP_MB();
2062   KE_TRACE(25, ("<- __kmp_free() returns\n"));
2063 } // func ___kmp_free
2064 
2065 #if USE_FAST_MEMORY == 3
2066 // Allocate fast memory by first scanning the thread's free lists
2067 // If a chunk the right size exists, grab it off the free list.
2068 // Otherwise allocate normally using kmp_thread_malloc.
2069 
2070 // AC: How to choose the limit? Just get 16 for now...
2071 #define KMP_FREE_LIST_LIMIT 16
2072 
2073 // Always use 128 bytes for determining buckets for caching memory blocks
2074 #define DCACHE_LINE 128
2075 
2076 void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
2077   void *ptr;
2078   size_t num_lines, idx;
2079   int index;
2080   void *alloc_ptr;
2081   size_t alloc_size;
2082   kmp_mem_descr_t *descr;
2083 
2084   KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
2085                 __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
2086 
2087   num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
2088   idx = num_lines - 1;
2089   KMP_DEBUG_ASSERT(idx >= 0);
2090   if (idx < 2) {
2091     index = 0; // idx is [ 0, 1 ], use first free list
2092     num_lines = 2; // 1, 2 cache lines or less than cache line
2093   } else if ((idx >>= 2) == 0) {
2094     index = 1; // idx is [ 2, 3 ], use second free list
2095     num_lines = 4; // 3, 4 cache lines
2096   } else if ((idx >>= 2) == 0) {
2097     index = 2; // idx is [ 4, 15 ], use third free list
2098     num_lines = 16; // 5, 6, ..., 16 cache lines
2099   } else if ((idx >>= 2) == 0) {
2100     index = 3; // idx is [ 16, 63 ], use fourth free list
2101     num_lines = 64; // 17, 18, ..., 64 cache lines
2102   } else {
2103     goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
2104   }
2105 
2106   ptr = this_thr->th.th_free_lists[index].th_free_list_self;
2107   if (ptr != NULL) {
2108     // pop the head of no-sync free list
2109     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
2110     KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
2111                                                       sizeof(kmp_mem_descr_t)))
2112                                      ->ptr_aligned);
2113     goto end;
2114   }
2115   ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
2116   if (ptr != NULL) {
2117     // no-sync free list is empty, use sync free list (filled in by other
2118     // threads only)
2119     // pop the head of the sync free list, push NULL instead
2120     while (!KMP_COMPARE_AND_STORE_PTR(
2121         &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
2122       KMP_CPU_PAUSE();
2123       ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
2124     }
2125     // push the rest of chain into no-sync free list (can be NULL if there was
2126     // the only block)
2127     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
2128     KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
2129                                                       sizeof(kmp_mem_descr_t)))
2130                                      ->ptr_aligned);
2131     goto end;
2132   }
2133 
2134 alloc_call:
2135   // haven't found block in the free lists, thus allocate it
2136   size = num_lines * DCACHE_LINE;
2137 
2138   alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
2139   KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
2140                 "alloc_size %d\n",
2141                 __kmp_gtid_from_thread(this_thr), alloc_size));
2142   alloc_ptr = bget(this_thr, (bufsize)alloc_size);
2143 
2144   // align ptr to DCACHE_LINE
2145   ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
2146                   DCACHE_LINE) &
2147                  ~(DCACHE_LINE - 1));
2148   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
2149 
2150   descr->ptr_allocated = alloc_ptr; // remember allocated pointer
2151   // we don't need size_allocated
2152   descr->ptr_aligned = (void *)this_thr; // remember allocating thread
2153   // (it is already saved in bget buffer,
2154   // but we may want to use another allocator in future)
2155   descr->size_aligned = size;
2156 
2157 end:
2158   KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
2159                 __kmp_gtid_from_thread(this_thr), ptr));
2160   return ptr;
2161 } // func __kmp_fast_allocate
2162 
2163 // Free fast memory and place it on the thread's free list if it is of
2164 // the correct size.
2165 void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
2166   kmp_mem_descr_t *descr;
2167   kmp_info_t *alloc_thr;
2168   size_t size;
2169   size_t idx;
2170   int index;
2171 
2172   KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
2173                 __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
2174   KMP_ASSERT(ptr != NULL);
2175 
2176   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
2177 
2178   KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
2179                 (int)descr->size_aligned));
2180 
2181   size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
2182 
2183   idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
2184   if (idx == size) {
2185     index = 0; // 2 cache lines
2186   } else if ((idx <<= 1) == size) {
2187     index = 1; // 4 cache lines
2188   } else if ((idx <<= 2) == size) {
2189     index = 2; // 16 cache lines
2190   } else if ((idx <<= 2) == size) {
2191     index = 3; // 64 cache lines
2192   } else {
2193     KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
2194     goto free_call; // 65 or more cache lines ( > 8KB )
2195   }
2196 
2197   alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
2198   if (alloc_thr == this_thr) {
2199     // push block to self no-sync free list, linking previous head (LIFO)
2200     *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
2201     this_thr->th.th_free_lists[index].th_free_list_self = ptr;
2202   } else {
2203     void *head = this_thr->th.th_free_lists[index].th_free_list_other;
2204     if (head == NULL) {
2205       // Create new free list
2206       this_thr->th.th_free_lists[index].th_free_list_other = ptr;
2207       *((void **)ptr) = NULL; // mark the tail of the list
2208       descr->size_allocated = (size_t)1; // head of the list keeps its length
2209     } else {
2210       // need to check existed "other" list's owner thread and size of queue
2211       kmp_mem_descr_t *dsc =
2212           (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
2213       // allocating thread, same for all queue nodes
2214       kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
2215       size_t q_sz =
2216           dsc->size_allocated + 1; // new size in case we add current task
2217       if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
2218         // we can add current task to "other" list, no sync needed
2219         *((void **)ptr) = head;
2220         descr->size_allocated = q_sz;
2221         this_thr->th.th_free_lists[index].th_free_list_other = ptr;
2222       } else {
2223         // either queue blocks owner is changing or size limit exceeded
2224         // return old queue to allocating thread (q_th) synchronously,
2225         // and start new list for alloc_thr's tasks
2226         void *old_ptr;
2227         void *tail = head;
2228         void *next = *((void **)head);
2229         while (next != NULL) {
2230           KMP_DEBUG_ASSERT(
2231               // queue size should decrease by 1 each step through the list
2232               ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
2233                       ->size_allocated +
2234                   1 ==
2235               ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
2236                   ->size_allocated);
2237           tail = next; // remember tail node
2238           next = *((void **)next);
2239         }
2240         KMP_DEBUG_ASSERT(q_th != NULL);
2241         // push block to owner's sync free list
2242         old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
2243         /* the next pointer must be set before setting free_list to ptr to avoid
2244            exposing a broken list to other threads, even for an instant. */
2245         *((void **)tail) = old_ptr;
2246 
2247         while (!KMP_COMPARE_AND_STORE_PTR(
2248             &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
2249           KMP_CPU_PAUSE();
2250           old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
2251           *((void **)tail) = old_ptr;
2252         }
2253 
2254         // start new list of not-selt tasks
2255         this_thr->th.th_free_lists[index].th_free_list_other = ptr;
2256         *((void **)ptr) = NULL;
2257         descr->size_allocated = (size_t)1; // head of queue keeps its length
2258       }
2259     }
2260   }
2261   goto end;
2262 
2263 free_call:
2264   KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
2265                 __kmp_gtid_from_thread(this_thr), size));
2266   __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
2267   brel(this_thr, descr->ptr_allocated);
2268 
2269 end:
2270   KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));
2271 
2272 } // func __kmp_fast_free
2273 
2274 // Initialize the thread free lists related to fast memory
2275 // Only do this when a thread is initially created.
2276 void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
2277   KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));
2278 
2279   memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
2280 }
2281 
2282 // Free the memory in the thread free lists related to fast memory
2283 // Only do this when a thread is being reaped (destroyed).
2284 void __kmp_free_fast_memory(kmp_info_t *th) {
2285   // Suppose we use BGET underlying allocator, walk through its structures...
2286   int bin;
2287   thr_data_t *thr = get_thr_data(th);
2288   void **lst = NULL;
2289 
2290   KE_TRACE(
2291       5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));
2292 
2293   __kmp_bget_dequeue(th); // Release any queued buffers
2294 
2295   // Dig through free lists and extract all allocated blocks
2296   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
2297     bfhead_t *b = thr->freelist[bin].ql.flink;
2298     while (b != &thr->freelist[bin]) {
2299       if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is allocated address
2300         *((void **)b) =
2301             lst; // link the list (override bthr, but keep flink yet)
2302         lst = (void **)b; // push b into lst
2303       }
2304       b = b->ql.flink; // get next buffer
2305     }
2306   }
2307   while (lst != NULL) {
2308     void *next = *lst;
2309     KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
2310                   lst, next, th, __kmp_gtid_from_thread(th)));
2311     (*thr->relfcn)(lst);
2312 #if BufStats
2313     // count blocks to prevent problems in __kmp_finalize_bget()
2314     thr->numprel++; /* Nr of expansion block releases */
2315     thr->numpblk--; /* Total number of blocks */
2316 #endif
2317     lst = (void **)next;
2318   }
2319 
2320   KE_TRACE(
2321       5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
2322 }
2323 
2324 #endif // USE_FAST_MEMORY
2325