xref: /illumos-gate/usr/src/lib/libmtmalloc/common/mtmalloc.c (revision 8f2954fa920e8474d4a939a960aff61167bc444a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <mtmalloc.h>
30 #include "mtmalloc_impl.h"
31 #include <unistd.h>
32 #include <synch.h>
33 #include <thread.h>
34 #include <pthread.h>
35 #include <stdio.h>
36 #include <limits.h>
37 #include <errno.h>
38 #include <string.h>
39 #include <strings.h>
40 #include <sys/param.h>
41 #include <sys/sysmacros.h>
42 
43 /*
44  * To turn on the asserts just compile -DDEBUG
45  */
46 
47 #ifndef	DEBUG
48 #define	NDEBUG
49 #endif
50 
51 #include <assert.h>
52 
53 /*
54  * The MT hot malloc implementation contained herein is designed to be
55  * plug-compatible with the libc version of malloc. It is not intended
56  * to replace that implementation until we decide that it is ok to break
57  * customer apps (Solaris 3.0).
58  *
59  * For requests up to 2^^16, the allocator initializes itself into NCPUS
60  * worth of chains of caches. When a memory request is made, the calling thread
61  * is vectored into one of NCPUS worth of caches.  The LWP id gives us a cheap,
62  * contention-reducing index to use, eventually, this should be replaced with
63  * the actual CPU sequence number, when an interface to get it is available.
64  *
65  * Once the thread is vectored into one of the list of caches the real
66  * allocation of the memory begins. The size is determined to figure out which
67  * bucket the allocation should be satisfied from. The management of free
68  * buckets is done via a bitmask. A free bucket is represented by a 1. The
69  * first free bit represents the first free bucket. The position of the bit,
70  * represents the position of the bucket in the arena.
71  *
72  * When the memory from the arena is handed out, the address of the cache
73  * control structure is written in the word preceeding the returned memory.
74  * This cache control address is used during free() to mark the buffer free
75  * in the cache control structure.
76  *
77  * When all available memory in a cache has been depleted, a new chunk of memory
78  * is allocated via sbrk(). The new cache is allocated from this chunk of memory
79  * and initialized in the function create_cache(). New caches are installed at
80  * the front of a singly linked list of the same size memory pools. This helps
81  * to ensure that there will tend to be available memory in the beginning of the
82  * list.
83  *
84  * Long linked lists hurt performance. To decrease this effect, there is a
85  * tunable, requestsize, that bumps up the sbrk allocation size and thus
86  * increases the number of available blocks within an arena.  We also keep
87  * a "hint" for each cache list, which is the last cache in the list allocated
88  * from.  This lowers the cost of searching if there are a lot of fully
89  * allocated blocks at the front of the list.
90  *
91  * For requests greater than 2^^16 (oversize allocations), there are two pieces
92  * of overhead. There is the OVERHEAD used to hold the cache addr
93  * (&oversize_list), plus an oversize_t structure to further describe the block.
94  *
95  * The oversize list is kept as defragmented as possible by coalescing
96  * freed oversized allocations with adjacent neighbors.
97  *
98  * Addresses handed out are stored in a hash table, and are aligned on
99  * MTMALLOC_MIN_ALIGN-byte boundaries at both ends. Request sizes are rounded-up
100  * where necessary in order to achieve this. This eases the implementation of
101  * MTDEBUGPATTERN and MTINITPATTERN, particularly where coalescing occurs.
102  *
103  * A memalign allocation takes memalign header overhead.  There's two
104  * types of memalign headers distinguished by MTMALLOC_MEMALIGN_MAGIC
105  * and MTMALLOC_MEMALIGN_MIN_MAGIC.  When the size of memory taken to
106  * get to the aligned address from malloc'ed address is the minimum size
107  * OVERHEAD, we create a header taking only one OVERHEAD space with magic
108  * number MTMALLOC_MEMALIGN_MIN_MAGIC, and we know by subtracting OVERHEAD
109  * from memaligned address, we can get to the malloc'ed address. Otherwise,
110  * we create a memalign header taking two OVERHEAD space, one stores
111  * MTMALLOC_MEMALIGN_MAGIC magic number, the other one points back to the
112  * malloc'ed address.
113  */
114 
115 #if defined(__i386) || defined(__amd64)
116 #include <arpa/inet.h>	/* for htonl() */
117 #endif
118 
119 static void * morecore(size_t);
120 static void create_cache(cache_t *, size_t bufsize, uint_t hunks);
121 static void * malloc_internal(size_t, percpu_t *);
122 static void * oversize(size_t);
123 static oversize_t *find_oversize(size_t);
124 static void add_oversize(oversize_t *);
125 static void copy_pattern(uint32_t, void *, size_t);
126 static void * verify_pattern(uint32_t, void *, size_t);
127 static void reinit_cpu_list(void);
128 static void reinit_cache(cache_t *);
129 static void free_oversize(oversize_t *);
130 static oversize_t *oversize_header_alloc(uintptr_t, size_t);
131 
132 /*
133  * oversize hash table stuff
134  */
135 #define	NUM_BUCKETS	67	/* must be prime */
136 #define	HASH_OVERSIZE(caddr)	((uintptr_t)(caddr) % NUM_BUCKETS)
137 oversize_t *ovsz_hashtab[NUM_BUCKETS];
138 
139 #define	ALIGN(x, a)	((((uintptr_t)(x) + ((uintptr_t)(a) - 1)) \
140 			& ~((uintptr_t)(a) - 1)))
141 
142 /* need this to deal with little endianess of x86 */
143 #if defined(__i386) || defined(__amd64)
144 #define	FLIP_EM(x)	htonl((x))
145 #else
146 #define	FLIP_EM(x)	(x)
147 #endif
148 
149 #define	INSERT_ONLY			0
150 #define	COALESCE_LEFT			0x00000001
151 #define	COALESCE_RIGHT			0x00000002
152 #define	COALESCE_WITH_BOTH_SIDES	(COALESCE_LEFT | COALESCE_RIGHT)
153 
154 #define	OVERHEAD	8	/* size needed to write cache addr */
155 #define	HUNKSIZE	8192	/* just a multiplier */
156 
157 #define	MAX_CACHED_SHIFT	16	/* 64K is the max cached size */
158 #define	MAX_CACHED		(1 << MAX_CACHED_SHIFT)
159 #define	MIN_CACHED_SHIFT	4	/* smaller requests rounded up */
160 #define	MTMALLOC_MIN_ALIGN	8	/* min guaranteed alignment */
161 
162 /* maximum size before overflow */
163 #define	MAX_MTMALLOC	(SIZE_MAX - (SIZE_MAX % MTMALLOC_MIN_ALIGN) \
164 			- OVSZ_HEADER_SIZE)
165 
166 #define	NUM_CACHES	(MAX_CACHED_SHIFT - MIN_CACHED_SHIFT + 1)
167 #define	CACHELIST_SIZE	ALIGN(NUM_CACHES * sizeof (cache_head_t), \
168     CACHE_COHERENCY_UNIT)
169 
170 #define	MINSIZE		9	/* for requestsize, tunable */
171 #define	MAXSIZE		256	/* arbitrary, big enough, for requestsize */
172 
173 #define	FREEPATTERN	0xdeadbeef /* debug fill pattern for free buf */
174 #define	INITPATTERN	0xbaddcafe /* debug fill pattern for new buf */
175 
176 #define	misaligned(p)	((unsigned)(p) & (sizeof (int) - 1))
177 #define	IS_OVERSIZE(x, y)	(((x) < (y)) && (((x) > MAX_CACHED)? 1 : 0))
178 
179 static long requestsize = MINSIZE; /* 9 pages per cache; tunable; 9 is min */
180 
181 static uint_t cpu_mask;
182 static curcpu_func curcpu;
183 
184 static int32_t debugopt;
185 static int32_t reinit;
186 
187 static percpu_t *cpu_list;
188 static oversize_t oversize_list;
189 static mutex_t oversize_lock = DEFAULTMUTEX;
190 
191 static int ncpus = 0;
192 
193 #define	MTMALLOC_OVERSIZE_MAGIC		((uintptr_t)&oversize_list)
194 #define	MTMALLOC_MEMALIGN_MAGIC		((uintptr_t)&oversize_list + 1)
195 #define	MTMALLOC_MEMALIGN_MIN_MAGIC	((uintptr_t)&oversize_list + 2)
196 
197 /*
198  * We require allocations handed out to be aligned on MTMALLOC_MIN_ALIGN-byte
199  * boundaries. We round up sizeof (oversize_t) (when necessary) to ensure that
200  * this is achieved.
201  */
202 #define	OVSZ_SIZE		(ALIGN(sizeof (oversize_t), MTMALLOC_MIN_ALIGN))
203 #define	OVSZ_HEADER_SIZE	(OVSZ_SIZE + OVERHEAD)
204 
205 /*
206  * memalign header takes 2 OVERHEAD space.  One for memalign magic, and the
207  * other one points back to the start address of originally allocated space.
208  */
209 #define	MEMALIGN_HEADER_SIZE	2 * OVERHEAD
210 #define	MEMALIGN_HEADER_ALLOC(x, shift, malloc_addr)\
211 	if (shift == OVERHEAD)\
212 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
213 			MTMALLOC_MEMALIGN_MIN_MAGIC; \
214 	else {\
215 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
216 			MTMALLOC_MEMALIGN_MAGIC; \
217 		*((uintptr_t *)((caddr_t)x - 2 * OVERHEAD)) = \
218 			(uintptr_t)malloc_addr; \
219 	}
220 
221 /*
222  * Add big to the oversize hash table at the head of the relevant bucket.
223  */
224 static void
225 insert_hash(oversize_t *big)
226 {
227 	caddr_t ret = big->addr;
228 	int bucket = HASH_OVERSIZE(ret);
229 
230 	assert(MUTEX_HELD(&oversize_lock));
231 	big->hash_next = ovsz_hashtab[bucket];
232 	ovsz_hashtab[bucket] = big;
233 }
234 
235 void *
236 malloc(size_t bytes)
237 {
238 	percpu_t *list_rotor;
239 	uint_t	list_index;
240 
241 	if (bytes > MAX_CACHED)
242 		return (oversize(bytes));
243 
244 	list_index = (curcpu() & cpu_mask);
245 
246 	list_rotor = &cpu_list[list_index];
247 
248 	return (malloc_internal(bytes, list_rotor));
249 }
250 
251 void *
252 realloc(void * ptr, size_t bytes)
253 {
254 	void *new, *data_ptr;
255 	cache_t *cacheptr;
256 	caddr_t mem;
257 	size_t shift = 0;
258 
259 	if (ptr == NULL)
260 		return (malloc(bytes));
261 
262 	if (bytes == 0) {
263 		free(ptr);
264 		return (NULL);
265 	}
266 
267 	data_ptr = ptr;
268 	mem = (caddr_t)ptr - OVERHEAD;
269 
270 	new = malloc(bytes);
271 
272 	if (new == NULL)
273 		return (NULL);
274 
275 	/*
276 	 * If new == ptr, ptr has previously been freed. Passing a freed pointer
277 	 * to realloc() is not allowed - unless the caller specifically states
278 	 * otherwise, in which case we must avoid freeing ptr (ie new) before we
279 	 * return new. There is (obviously) no requirement to memcpy() ptr to
280 	 * new before we return.
281 	 */
282 	if (new == ptr) {
283 		if (!(debugopt & MTDOUBLEFREE))
284 			abort();
285 		return (new);
286 	}
287 
288 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
289 		mem -= OVERHEAD;
290 		ptr = (void *)*(uintptr_t *)mem;
291 		mem = (caddr_t)ptr - OVERHEAD;
292 		shift = (size_t)((uintptr_t)data_ptr - (uintptr_t)ptr);
293 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
294 		ptr = (void *) mem;
295 		mem -= OVERHEAD;
296 		shift = OVERHEAD;
297 	}
298 
299 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
300 		oversize_t *old;
301 
302 		old = (oversize_t *)(mem - OVSZ_SIZE);
303 		(void) memcpy(new, data_ptr, MIN(bytes, old->size - shift));
304 		free(ptr);
305 		return (new);
306 	}
307 
308 	cacheptr = (cache_t *)*(uintptr_t *)mem;
309 
310 	(void) memcpy(new, data_ptr,
311 		MIN(cacheptr->mt_size - OVERHEAD - shift, bytes));
312 	free(ptr);
313 
314 	return (new);
315 }
316 
317 void *
318 calloc(size_t nelem, size_t bytes)
319 {
320 	void * ptr;
321 	size_t size = nelem * bytes;
322 
323 	ptr = malloc(size);
324 	if (ptr == NULL)
325 		return (NULL);
326 	(void) memset(ptr, 0, size);
327 
328 	return (ptr);
329 }
330 
331 void
332 free(void * ptr)
333 {
334 	cache_t *cacheptr;
335 	caddr_t mem;
336 	int32_t i;
337 	caddr_t freeblocks;
338 	uintptr_t offset;
339 	uchar_t mask;
340 	int32_t which_bit, num_bytes;
341 
342 	if (ptr == NULL)
343 		return;
344 
345 	mem = (caddr_t)ptr - OVERHEAD;
346 
347 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
348 		mem -= OVERHEAD;
349 		ptr = (void *)*(uintptr_t *)mem;
350 		mem = (caddr_t)ptr - OVERHEAD;
351 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
352 		ptr = (void *) mem;
353 		mem -= OVERHEAD;
354 	}
355 
356 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
357 		oversize_t *big, **opp;
358 		int bucket;
359 
360 		big = (oversize_t *)(mem - OVSZ_SIZE);
361 		(void) mutex_lock(&oversize_lock);
362 
363 		bucket = HASH_OVERSIZE(big->addr);
364 		for (opp = &ovsz_hashtab[bucket]; *opp != NULL;
365 		    opp = &(*opp)->hash_next)
366 			if (*opp == big)
367 				break;
368 
369 		if (*opp == NULL) {
370 			if (!(debugopt & MTDOUBLEFREE))
371 				abort();
372 			(void) mutex_unlock(&oversize_lock);
373 			return;
374 		}
375 
376 		*opp = big->hash_next;	/* remove big from the hash table */
377 		big->hash_next = NULL;
378 
379 		if (debugopt & MTDEBUGPATTERN)
380 			copy_pattern(FREEPATTERN, ptr, big->size);
381 		add_oversize(big);
382 		(void) mutex_unlock(&oversize_lock);
383 		return;
384 	}
385 
386 	cacheptr = (cache_t *)*(uintptr_t *)mem;
387 	freeblocks = cacheptr->mt_freelist;
388 
389 	/*
390 	 * This is the distance measured in bits into the arena.
391 	 * The value of offset is in bytes but there is a 1-1 correlation
392 	 * between distance into the arena and distance into the
393 	 * freelist bitmask.
394 	 */
395 	offset = mem - cacheptr->mt_arena;
396 
397 	/*
398 	 * i is total number of bits to offset into freelist bitmask.
399 	 */
400 
401 	i = offset / cacheptr->mt_size;
402 
403 	num_bytes = i >> 3;
404 
405 	/*
406 	 * which_bit is the bit offset into the byte in the freelist.
407 	 * if our freelist bitmask looks like 0xf3 and we are freeing
408 	 * block 5 (ie: the 6th block) our mask will be 0xf7 after
409 	 * the free. Things go left to right that's why the mask is 0x80
410 	 * and not 0x01.
411 	 */
412 	which_bit = i - (num_bytes << 3);
413 
414 	mask = 0x80 >> which_bit;
415 
416 	freeblocks += num_bytes;
417 
418 	if (debugopt & MTDEBUGPATTERN)
419 		copy_pattern(FREEPATTERN, ptr, cacheptr->mt_size - OVERHEAD);
420 
421 	(void) mutex_lock(&cacheptr->mt_cache_lock);
422 
423 	if (*freeblocks & mask) {
424 		if (!(debugopt & MTDOUBLEFREE))
425 			abort();
426 	} else {
427 		*freeblocks |= mask;
428 		cacheptr->mt_nfree++;
429 	}
430 
431 	(void) mutex_unlock(&cacheptr->mt_cache_lock);
432 }
433 
434 void *
435 memalign(size_t alignment, size_t size)
436 {
437 	size_t alloc_size;
438 	uintptr_t offset;
439 	void *alloc_buf;
440 	void *ret_buf;
441 
442 	if (size == 0 || alignment == 0 ||
443 		misaligned(alignment) ||
444 		(alignment & (alignment - 1)) != 0) {
445 		errno = EINVAL;
446 		return (NULL);
447 	}
448 
449 	/* <= MTMALLOC_MIN_ALIGN, malloc can provide directly */
450 	if (alignment <= MTMALLOC_MIN_ALIGN)
451 		return (malloc(size));
452 
453 	alloc_size = size + alignment - MTMALLOC_MIN_ALIGN;
454 
455 	if (alloc_size < size) { /* overflow */
456 		errno = ENOMEM;
457 		return (NULL);
458 	}
459 
460 	alloc_buf = malloc(alloc_size);
461 
462 	if (alloc_buf == NULL)
463 		/* malloc sets errno */
464 		return (NULL);
465 
466 	/*
467 	 * If alloc_size > MAX_CACHED, malloc() will have returned a multiple of
468 	 * MTMALLOC_MIN_ALIGN, having rounded-up alloc_size if necessary. Since
469 	 * we will use alloc_size to return the excess fragments to the free
470 	 * list, we also round-up alloc_size if necessary.
471 	 */
472 	if ((alloc_size > MAX_CACHED) &&
473 	    (alloc_size & (MTMALLOC_MIN_ALIGN - 1)))
474 		alloc_size = ALIGN(alloc_size, MTMALLOC_MIN_ALIGN);
475 
476 	if ((offset = (uintptr_t)alloc_buf & (alignment - 1)) == 0) {
477 		/* aligned correctly */
478 
479 		size_t frag_size = alloc_size -
480 			(size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
481 
482 		/*
483 		 * If the leftover piece of the memory > MAX_CACHED,
484 		 * split off the piece and return it back to the freelist.
485 		 */
486 		if (IS_OVERSIZE(frag_size, alloc_size)) {
487 			oversize_t *orig, *tail;
488 			uintptr_t taddr;
489 			size_t data_size;
490 			taddr = ALIGN((uintptr_t)alloc_buf + size,
491 					MTMALLOC_MIN_ALIGN);
492 			data_size = taddr - (uintptr_t)alloc_buf;
493 			orig = (oversize_t *)((uintptr_t)alloc_buf -
494 					OVSZ_HEADER_SIZE);
495 			frag_size = orig->size - data_size -
496 					OVSZ_HEADER_SIZE;
497 			orig->size = data_size;
498 			tail = oversize_header_alloc(taddr, frag_size);
499 			free_oversize(tail);
500 		}
501 		ret_buf = alloc_buf;
502 	} else {
503 		uchar_t	oversize_bits = 0;
504 		size_t	head_sz, data_sz, tail_sz;
505 		uintptr_t ret_addr, taddr, shift, tshift;
506 		oversize_t *orig, *tail, *big;
507 		size_t tsize;
508 
509 		/* needs to be aligned */
510 		shift = alignment - offset;
511 
512 		assert(shift >= MTMALLOC_MIN_ALIGN);
513 
514 		ret_addr = ((uintptr_t)alloc_buf + shift);
515 		ret_buf = (void *)ret_addr;
516 
517 		if (alloc_size <= MAX_CACHED) {
518 			MEMALIGN_HEADER_ALLOC(ret_addr, shift, alloc_buf);
519 			return (ret_buf);
520 		}
521 
522 		/*
523 		 * Only check for the fragments when the memory is allocted
524 		 * from oversize_list.  Split off a fragment and return it
525 		 * to the oversize freelist when it's > MAX_CACHED.
526 		 */
527 
528 		head_sz = shift - MAX(MEMALIGN_HEADER_SIZE, OVSZ_HEADER_SIZE);
529 
530 		tail_sz = alloc_size -
531 			(shift + size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
532 
533 		oversize_bits |= IS_OVERSIZE(head_sz, alloc_size) |
534 				IS_OVERSIZE(size, alloc_size) << DATA_SHIFT |
535 				IS_OVERSIZE(tail_sz, alloc_size) << TAIL_SHIFT;
536 
537 		switch (oversize_bits) {
538 			case NONE_OVERSIZE:
539 			case DATA_OVERSIZE:
540 				MEMALIGN_HEADER_ALLOC(ret_addr, shift,
541 					alloc_buf);
542 				break;
543 			case HEAD_OVERSIZE:
544 				/*
545 				 * If we can extend data > MAX_CACHED and have
546 				 * head still > MAX_CACHED, we split head-end
547 				 * as the case of head-end and data oversized,
548 				 * otherwise just create memalign header.
549 				 */
550 				tsize = (shift + size) - (MAX_CACHED + 8 +
551 					MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
552 
553 				if (!IS_OVERSIZE(tsize, alloc_size)) {
554 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
555 						alloc_buf);
556 					break;
557 				} else {
558 					tsize += OVSZ_HEADER_SIZE;
559 					taddr = ALIGN((uintptr_t)alloc_buf +
560 						tsize, MTMALLOC_MIN_ALIGN);
561 					tshift = ret_addr - taddr;
562 					MEMALIGN_HEADER_ALLOC(ret_addr, tshift,
563 						taddr);
564 					ret_addr = taddr;
565 					shift = ret_addr - (uintptr_t)alloc_buf;
566 				}
567 				/* FALLTHROUGH */
568 			case HEAD_AND_DATA_OVERSIZE:
569 				/*
570 				 * Split off the head fragment and
571 				 * return it back to oversize freelist.
572 				 * Create oversize header for the piece
573 				 * of (data + tail fragment).
574 				 */
575 				orig = (oversize_t *)((uintptr_t)alloc_buf -
576 						OVSZ_HEADER_SIZE);
577 				big = oversize_header_alloc(ret_addr -
578 						OVSZ_HEADER_SIZE,
579 						(orig->size - shift));
580 				(void) mutex_lock(&oversize_lock);
581 				insert_hash(big);
582 				(void) mutex_unlock(&oversize_lock);
583 				orig->size = shift - OVSZ_HEADER_SIZE;
584 
585 				/* free up the head fragment */
586 				free_oversize(orig);
587 				break;
588 			case TAIL_OVERSIZE:
589 				/*
590 				 * If we can extend data > MAX_CACHED and have
591 				 * tail-end still > MAX_CACHED, we split tail
592 				 * end, otherwise just create memalign header.
593 				 */
594 				orig = (oversize_t *)((uintptr_t)alloc_buf -
595 						OVSZ_HEADER_SIZE);
596 				tsize =  orig->size - (MAX_CACHED + 8 +
597 					shift + OVSZ_HEADER_SIZE +
598 					MTMALLOC_MIN_ALIGN);
599 				if (!IS_OVERSIZE(tsize, alloc_size)) {
600 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
601 						alloc_buf);
602 					break;
603 				} else {
604 					size = MAX_CACHED + 8;
605 				}
606 				/* FALLTHROUGH */
607 			case DATA_AND_TAIL_OVERSIZE:
608 				/*
609 				 * Split off the tail fragment and
610 				 * return it back to oversize freelist.
611 				 * Create memalign header and adjust
612 				 * the size for the piece of
613 				 * (head fragment + data).
614 				 */
615 				taddr = ALIGN(ret_addr + size,
616 						MTMALLOC_MIN_ALIGN);
617 				data_sz = (size_t)(taddr -
618 						(uintptr_t)alloc_buf);
619 				orig = (oversize_t *)((uintptr_t)alloc_buf -
620 						OVSZ_HEADER_SIZE);
621 				tsize = orig->size - data_sz;
622 				orig->size = data_sz;
623 				MEMALIGN_HEADER_ALLOC(ret_buf, shift,
624 					alloc_buf);
625 				tsize -= OVSZ_HEADER_SIZE;
626 				tail = oversize_header_alloc(taddr,  tsize);
627 				free_oversize(tail);
628 				break;
629 			case HEAD_AND_TAIL_OVERSIZE:
630 				/*
631 				 * Split off the head fragment.
632 				 * We try to free up tail-end when we can
633 				 * extend data size to (MAX_CACHED + 8)
634 				 * and remain tail-end oversized.
635 				 * The bottom line is all split pieces
636 				 * should be oversize in size.
637 				 */
638 				orig = (oversize_t *)((uintptr_t)alloc_buf -
639 					OVSZ_HEADER_SIZE);
640 				tsize =  orig->size - (MAX_CACHED + 8 +
641 					OVSZ_HEADER_SIZE + shift +
642 					MTMALLOC_MIN_ALIGN);
643 
644 				if (!IS_OVERSIZE(tsize, alloc_size)) {
645 					/*
646 					 * If the chunk is not big enough
647 					 * to make both data and tail oversize
648 					 * we just keep them as one piece.
649 					 */
650 					big = oversize_header_alloc(ret_addr -
651 						OVSZ_HEADER_SIZE,
652 						orig->size - shift);
653 					(void) mutex_lock(&oversize_lock);
654 					insert_hash(big);
655 					(void) mutex_unlock(&oversize_lock);
656 					orig->size = shift -
657 						OVSZ_HEADER_SIZE;
658 					free_oversize(orig);
659 					break;
660 				} else {
661 					/*
662 					 * extend data size > MAX_CACHED
663 					 * and handle it as head, data, tail
664 					 * are all oversized.
665 					 */
666 					size = MAX_CACHED + 8;
667 				}
668 				/* FALLTHROUGH */
669 			case ALL_OVERSIZE:
670 				/*
671 				 * split off the head and tail fragments,
672 				 * return them back to the oversize freelist.
673 				 * Alloc oversize header for data seg.
674 				 */
675 				orig = (oversize_t *)((uintptr_t)alloc_buf -
676 					OVSZ_HEADER_SIZE);
677 				tsize = orig->size;
678 				orig->size = shift - OVSZ_HEADER_SIZE;
679 				free_oversize(orig);
680 
681 				taddr = ALIGN(ret_addr + size,
682 					MTMALLOC_MIN_ALIGN);
683 				data_sz = taddr - ret_addr;
684 				assert(tsize > (shift + data_sz +
685 					OVSZ_HEADER_SIZE));
686 				tail_sz = tsize -
687 					(shift + data_sz + OVSZ_HEADER_SIZE);
688 
689 				/* create oversize header for data seg */
690 				big = oversize_header_alloc(ret_addr -
691 					OVSZ_HEADER_SIZE, data_sz);
692 				(void) mutex_lock(&oversize_lock);
693 				insert_hash(big);
694 				(void) mutex_unlock(&oversize_lock);
695 
696 				/* create oversize header for tail fragment */
697 				tail = oversize_header_alloc(taddr, tail_sz);
698 				free_oversize(tail);
699 				break;
700 			default:
701 				/* should not reach here */
702 				assert(0);
703 		}
704 	}
705 	return (ret_buf);
706 }
707 
708 
709 void *
710 valloc(size_t size)
711 {
712 	static unsigned pagesize;
713 
714 	if (size == 0)
715 		return (NULL);
716 
717 	if (!pagesize)
718 		pagesize = sysconf(_SC_PAGESIZE);
719 
720 	return (memalign(pagesize, size));
721 }
722 
723 void
724 mallocctl(int cmd, long value)
725 {
726 	switch (cmd) {
727 
728 	case MTDEBUGPATTERN:
729 		/*
730 		 * Reinitialize free blocks in case malloc() is called prior
731 		 * to mallocctl().
732 		 */
733 		if (value && !(debugopt & cmd)) {
734 			reinit++;
735 			debugopt |= cmd;
736 			reinit_cpu_list();
737 		}
738 		/*FALLTHRU*/
739 	case MTDOUBLEFREE:
740 	case MTINITBUFFER:
741 		if (value)
742 			debugopt |= cmd;
743 		else
744 			debugopt &= ~cmd;
745 		break;
746 	case MTCHUNKSIZE:
747 		if (value >= MINSIZE && value <= MAXSIZE)
748 			requestsize = value;
749 		break;
750 	default:
751 		break;
752 	}
753 }
754 
755 /*
756  * Initialization function, called from the init section of the library.
757  * No locking is required here because we are single-threaded during
758  * library initialization.
759  */
760 static void
761 setup_caches(void)
762 {
763 	uintptr_t oldbrk;
764 	uintptr_t newbrk;
765 
766 	size_t cache_space_needed;
767 	size_t padding;
768 
769 	curcpu_func new_curcpu;
770 	uint_t new_cpu_mask;
771 	percpu_t *new_cpu_list;
772 
773 	uint_t i, j;
774 	uintptr_t list_addr;
775 
776 	/*
777 	 * Get a decent "current cpu identifier", to be used to reduce
778 	 * contention.  Eventually, this should be replaced by an interface
779 	 * to get the actual CPU sequence number in libthread/liblwp.
780 	 */
781 	new_curcpu = (curcpu_func)thr_self;
782 	if ((ncpus = 2 * sysconf(_SC_NPROCESSORS_CONF)) <= 0)
783 		ncpus = 4; /* decent default value */
784 
785 	/* round ncpus up to a power of 2 */
786 	while (ncpus & (ncpus - 1))
787 		ncpus++;
788 
789 	new_cpu_mask = ncpus - 1;	/* create the cpu mask */
790 
791 	/*
792 	 * We now do some magic with the brk.  What we want to get in the
793 	 * end is a bunch of well-aligned stuff in a big initial allocation.
794 	 * Along the way, we do sanity checks to make sure no one else has
795 	 * touched the brk (which shouldn't happen, but it's always good to
796 	 * check)
797 	 *
798 	 * First, make sure sbrk is sane, and store the current brk in oldbrk.
799 	 */
800 	oldbrk = (uintptr_t)sbrk(0);
801 	if ((void *)oldbrk == (void *)-1)
802 		abort();	/* sbrk is broken -- we're doomed. */
803 
804 	/*
805 	 * Now, align the brk to a multiple of CACHE_COHERENCY_UNIT, so that
806 	 * the percpu structures and cache lists will be properly aligned.
807 	 *
808 	 *   2.  All hunks will be page-aligned, assuming HUNKSIZE >= PAGESIZE,
809 	 *	so they can be paged out individually.
810 	 */
811 	newbrk = ALIGN(oldbrk, CACHE_COHERENCY_UNIT);
812 	if (newbrk != oldbrk && (uintptr_t)sbrk(newbrk - oldbrk) != oldbrk)
813 		abort();	/* sbrk is broken -- we're doomed. */
814 
815 	/*
816 	 * For each cpu, there is one percpu_t and a list of caches
817 	 */
818 	cache_space_needed = ncpus * (sizeof (percpu_t) + CACHELIST_SIZE);
819 
820 	new_cpu_list = (percpu_t *)sbrk(cache_space_needed);
821 
822 	if (new_cpu_list == (percpu_t *)-1 ||
823 	    (uintptr_t)new_cpu_list != newbrk)
824 		abort();	/* sbrk is broken -- we're doomed. */
825 
826 	/*
827 	 * Finally, align the brk to HUNKSIZE so that all hunks are
828 	 * page-aligned, to avoid edge-effects.
829 	 */
830 
831 	newbrk = (uintptr_t)new_cpu_list + cache_space_needed;
832 
833 	padding = ALIGN(newbrk, HUNKSIZE) - newbrk;
834 
835 	if (padding > 0 && (uintptr_t)sbrk(padding) != newbrk)
836 		abort();	/* sbrk is broken -- we're doomed. */
837 
838 	list_addr = ((uintptr_t)new_cpu_list + (sizeof (percpu_t) * ncpus));
839 
840 	/* initialize the percpu list */
841 	for (i = 0; i < ncpus; i++) {
842 		new_cpu_list[i].mt_caches = (cache_head_t *)list_addr;
843 		for (j = 0; j < NUM_CACHES; j++) {
844 			new_cpu_list[i].mt_caches[j].mt_cache = NULL;
845 			new_cpu_list[i].mt_caches[j].mt_hint = NULL;
846 		}
847 
848 		(void) mutex_init(&new_cpu_list[i].mt_parent_lock,
849 		    USYNC_THREAD, NULL);
850 
851 		/* get the correct cache list alignment */
852 		list_addr += CACHELIST_SIZE;
853 	}
854 
855 	/*
856 	 * Initialize oversize listhead
857 	 */
858 	oversize_list.next_bysize = &oversize_list;
859 	oversize_list.prev_bysize = &oversize_list;
860 	oversize_list.next_byaddr = &oversize_list;
861 	oversize_list.prev_byaddr = &oversize_list;
862 	oversize_list.addr = NULL;
863 	oversize_list.size = 0;		/* sentinal */
864 
865 	/*
866 	 * Now install the global variables.
867 	 */
868 	curcpu = new_curcpu;
869 	cpu_mask = new_cpu_mask;
870 	cpu_list = new_cpu_list;
871 }
872 
873 static void
874 create_cache(cache_t *cp, size_t size, uint_t chunksize)
875 {
876 	long nblocks;
877 
878 	(void) mutex_init(&cp->mt_cache_lock, USYNC_THREAD, NULL);
879 	cp->mt_size = size;
880 	cp->mt_freelist = ((caddr_t)cp + sizeof (cache_t));
881 	cp->mt_span = chunksize * HUNKSIZE - sizeof (cache_t);
882 	cp->mt_hunks = chunksize;
883 	/*
884 	 * rough calculation. We will need to adjust later.
885 	 */
886 	nblocks = cp->mt_span / cp->mt_size;
887 	nblocks >>= 3;
888 	if (nblocks == 0) { /* less than 8 free blocks in this pool */
889 		int32_t numblocks = 0;
890 		long i = cp->mt_span;
891 		size_t sub = cp->mt_size;
892 		uchar_t mask = 0;
893 
894 		while (i > sub) {
895 			numblocks++;
896 			i -= sub;
897 		}
898 		nblocks = numblocks;
899 		cp->mt_arena = (caddr_t)ALIGN(cp->mt_freelist + 8, 8);
900 		cp->mt_nfree = numblocks;
901 		while (numblocks--) {
902 			mask |= 0x80 >> numblocks;
903 		}
904 		*(cp->mt_freelist) = mask;
905 	} else {
906 		cp->mt_arena = (caddr_t)ALIGN((caddr_t)cp->mt_freelist +
907 			nblocks, 32);
908 		/* recompute nblocks */
909 		nblocks = (uintptr_t)((caddr_t)cp->mt_freelist +
910 			cp->mt_span - cp->mt_arena) / cp->mt_size;
911 		cp->mt_nfree = ((nblocks >> 3) << 3);
912 		/* Set everything to free */
913 		(void) memset(cp->mt_freelist, 0xff, nblocks >> 3);
914 	}
915 
916 	if (debugopt & MTDEBUGPATTERN)
917 		copy_pattern(FREEPATTERN, cp->mt_arena, cp->mt_size * nblocks);
918 
919 	cp->mt_next = NULL;
920 }
921 
922 static void
923 reinit_cpu_list(void)
924 {
925 	oversize_t *wp = oversize_list.next_bysize;
926 	percpu_t *cpuptr;
927 	cache_t *thiscache;
928 	cache_head_t *cachehead;
929 
930 	/* Reinitialize free oversize blocks. */
931 	(void) mutex_lock(&oversize_lock);
932 	if (debugopt & MTDEBUGPATTERN)
933 		for (; wp != &oversize_list; wp = wp->next_bysize)
934 			copy_pattern(FREEPATTERN, wp->addr, wp->size);
935 	(void) mutex_unlock(&oversize_lock);
936 
937 	/* Reinitialize free blocks. */
938 	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
939 		(void) mutex_lock(&cpuptr->mt_parent_lock);
940 		for (cachehead = &cpuptr->mt_caches[0]; cachehead <
941 			&cpuptr->mt_caches[NUM_CACHES]; cachehead++) {
942 			for (thiscache = cachehead->mt_cache; thiscache != NULL;
943 				thiscache = thiscache->mt_next) {
944 				(void) mutex_lock(&thiscache->mt_cache_lock);
945 				if (thiscache->mt_nfree == 0) {
946 					(void) mutex_unlock(
947 					    &thiscache->mt_cache_lock);
948 					continue;
949 				}
950 				if (thiscache != NULL)
951 					reinit_cache(thiscache);
952 				(void) mutex_unlock(&thiscache->mt_cache_lock);
953 			}
954 		}
955 		(void) mutex_unlock(&cpuptr->mt_parent_lock);
956 	}
957 	reinit = 0;
958 }
959 
960 static void
961 reinit_cache(cache_t *thiscache)
962 {
963 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
964 	int32_t i, n;
965 	caddr_t ret;
966 
967 	freeblocks = (uint32_t *)thiscache->mt_freelist;
968 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
969 		if (*freeblocks & 0xffffffff) {
970 		    for (i = 0; i < 32; i++) {
971 			if (FLIP_EM(*freeblocks) & (0x80000000 >> i)) {
972 				n = (uintptr_t)(((freeblocks -
973 				    (uint32_t *)thiscache->mt_freelist) << 5)
974 				    + i) * thiscache->mt_size;
975 				ret = thiscache->mt_arena + n;
976 				ret += OVERHEAD;
977 				copy_pattern(FREEPATTERN, ret,
978 				    thiscache->mt_size);
979 			}
980 		    }
981 		}
982 		freeblocks++;
983 	}
984 }
985 
986 static void *
987 malloc_internal(size_t size, percpu_t *cpuptr)
988 {
989 	cache_head_t *cachehead;
990 	cache_t *thiscache, *hintcache;
991 	int32_t i, n, logsz, bucket;
992 	uint32_t index;
993 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
994 	caddr_t ret;
995 
996 	logsz = MIN_CACHED_SHIFT;
997 
998 	while (size > (1 << logsz))
999 		logsz++;
1000 
1001 	bucket = logsz - MIN_CACHED_SHIFT;
1002 
1003 	(void) mutex_lock(&cpuptr->mt_parent_lock);
1004 
1005 	/*
1006 	 * Find a cache of the appropriate size with free buffers.
1007 	 *
1008 	 * We don't need to lock each cache as we check their mt_nfree count,
1009 	 * since:
1010 	 *	1.  We are only looking for caches with mt_nfree > 0.  If a
1011 	 *	   free happens during our search, it will increment mt_nfree,
1012 	 *	   which will not effect the test.
1013 	 *	2.  Allocations can decrement mt_nfree, but they can't happen
1014 	 *	   as long as we hold mt_parent_lock.
1015 	 */
1016 
1017 	cachehead = &cpuptr->mt_caches[bucket];
1018 
1019 	/* Search through the list, starting at the mt_hint */
1020 	thiscache = cachehead->mt_hint;
1021 
1022 	while (thiscache != NULL && thiscache->mt_nfree == 0)
1023 		thiscache = thiscache->mt_next;
1024 
1025 	if (thiscache == NULL) {
1026 		/* wrap around -- search up to the hint */
1027 		thiscache = cachehead->mt_cache;
1028 		hintcache = cachehead->mt_hint;
1029 
1030 		while (thiscache != NULL && thiscache != hintcache &&
1031 		    thiscache->mt_nfree == 0)
1032 			thiscache = thiscache->mt_next;
1033 
1034 		if (thiscache == hintcache)
1035 			thiscache = NULL;
1036 	}
1037 
1038 
1039 	if (thiscache == NULL) { /* there are no free caches */
1040 		int32_t thisrequest = requestsize;
1041 		int32_t buffer_size = (1 << logsz) + OVERHEAD;
1042 
1043 		thiscache = (cache_t *)morecore(thisrequest * HUNKSIZE);
1044 
1045 		if (thiscache == (cache_t *)-1) {
1046 		    (void) mutex_unlock(&cpuptr->mt_parent_lock);
1047 		    errno = EAGAIN;
1048 		    return (NULL);
1049 		}
1050 		create_cache(thiscache, buffer_size, thisrequest);
1051 
1052 		/* link in the new block at the beginning of the list */
1053 		thiscache->mt_next = cachehead->mt_cache;
1054 		cachehead->mt_cache = thiscache;
1055 	}
1056 
1057 	/* update the hint to the cache we found or created */
1058 	cachehead->mt_hint = thiscache;
1059 
1060 	/* thiscache now points to a cache with available space */
1061 	(void) mutex_lock(&thiscache->mt_cache_lock);
1062 
1063 	freeblocks = (uint32_t *)thiscache->mt_freelist;
1064 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
1065 		if (*freeblocks & 0xffffffff)
1066 			break;
1067 		freeblocks++;
1068 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1069 		    *freeblocks & 0xffffffff)
1070 			break;
1071 		freeblocks++;
1072 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1073 		    *freeblocks & 0xffffffff)
1074 			break;
1075 		freeblocks++;
1076 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1077 		    *freeblocks & 0xffffffff)
1078 			break;
1079 		freeblocks++;
1080 	}
1081 
1082 	/*
1083 	 * the offset from mt_freelist to freeblocks is the offset into
1084 	 * the arena. Be sure to include the offset into freeblocks
1085 	 * of the bitmask. n is the offset.
1086 	 */
1087 	for (i = 0; i < 32; ) {
1088 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1089 			break;
1090 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1091 			break;
1092 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1093 			break;
1094 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1095 			break;
1096 	}
1097 	index = 0x80000000 >> --i;
1098 
1099 
1100 	*freeblocks &= FLIP_EM(~index);
1101 
1102 	thiscache->mt_nfree--;
1103 
1104 	(void) mutex_unlock(&thiscache->mt_cache_lock);
1105 	(void) mutex_unlock(&cpuptr->mt_parent_lock);
1106 
1107 	n = (uintptr_t)(((freeblocks - (uint32_t *)thiscache->mt_freelist) << 5)
1108 		+ i) * thiscache->mt_size;
1109 	/*
1110 	 * Now you have the offset in n, you've changed the free mask
1111 	 * in the freelist. Nothing left to do but find the block
1112 	 * in the arena and put the value of thiscache in the word
1113 	 * ahead of the handed out address and return the memory
1114 	 * back to the user.
1115 	 */
1116 	ret = thiscache->mt_arena + n;
1117 
1118 	/* Store the cache addr for this buf. Makes free go fast. */
1119 	*(uintptr_t *)ret = (uintptr_t)thiscache;
1120 
1121 	/*
1122 	 * This assert makes sure we don't hand out memory that is not
1123 	 * owned by this cache.
1124 	 */
1125 	assert(ret + thiscache->mt_size <= thiscache->mt_freelist +
1126 		thiscache->mt_span);
1127 
1128 	ret += OVERHEAD;
1129 
1130 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1131 
1132 	if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1133 		if (verify_pattern(FREEPATTERN, ret, size))
1134 			abort();	/* reference after free */
1135 
1136 	if (debugopt & MTINITBUFFER)
1137 		copy_pattern(INITPATTERN, ret, size);
1138 	return ((void *)ret);
1139 }
1140 
1141 static void *
1142 morecore(size_t bytes)
1143 {
1144 	void * ret;
1145 
1146 	if (bytes > LONG_MAX) {
1147 		intptr_t wad;
1148 		/*
1149 		 * The request size is too big. We need to do this in
1150 		 * chunks. Sbrk only takes an int for an arg.
1151 		 */
1152 		if (bytes == ULONG_MAX)
1153 			return ((void *)-1);
1154 
1155 		ret = sbrk(0);
1156 		wad = LONG_MAX;
1157 		while (wad > 0) {
1158 			if (sbrk(wad) == (void *)-1) {
1159 				if (ret != sbrk(0))
1160 					(void) sbrk(-LONG_MAX);
1161 				return ((void *)-1);
1162 			}
1163 			bytes -= LONG_MAX;
1164 			wad = bytes;
1165 		}
1166 	} else
1167 		ret = sbrk(bytes);
1168 
1169 	return (ret);
1170 }
1171 
1172 
1173 static void *
1174 oversize(size_t size)
1175 {
1176 	caddr_t ret;
1177 	oversize_t *big;
1178 
1179 	/* make sure we will not overflow */
1180 	if (size > MAX_MTMALLOC) {
1181 		errno = ENOMEM;
1182 		return (NULL);
1183 	}
1184 
1185 	/*
1186 	 * Since we ensure every address we hand back is
1187 	 * MTMALLOC_MIN_ALIGN-byte aligned, ALIGNing size ensures that the
1188 	 * memory handed out is MTMALLOC_MIN_ALIGN-byte aligned at both ends.
1189 	 * This eases the implementation of MTDEBUGPATTERN and MTINITPATTERN,
1190 	 * particularly where coalescing occurs.
1191 	 */
1192 	size = ALIGN(size, MTMALLOC_MIN_ALIGN);
1193 
1194 	/*
1195 	 * The idea with the global lock is that we are sure to
1196 	 * block in the kernel anyway since given an oversize alloc
1197 	 * we are sure to have to call morecore();
1198 	 */
1199 	(void) mutex_lock(&oversize_lock);
1200 
1201 	if ((big = find_oversize(size)) != NULL) {
1202 		if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1203 			if (verify_pattern(FREEPATTERN, big->addr, size))
1204 				abort();	/* reference after free */
1205 	} else {
1206 		/* Get more 8-byte aligned memory from heap */
1207 		ret = morecore(size + OVSZ_HEADER_SIZE);
1208 		if (ret == (caddr_t)-1) {
1209 			(void) mutex_unlock(&oversize_lock);
1210 			errno = ENOMEM;
1211 			return (NULL);
1212 		}
1213 		big = oversize_header_alloc((uintptr_t)ret, size);
1214 	}
1215 	ret = big->addr;
1216 
1217 	insert_hash(big);
1218 
1219 	if (debugopt & MTINITBUFFER)
1220 		copy_pattern(INITPATTERN, ret, size);
1221 
1222 	(void) mutex_unlock(&oversize_lock);
1223 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1224 	return ((void *)ret);
1225 }
1226 
1227 static void
1228 insert_oversize(oversize_t *op, oversize_t *nx)
1229 {
1230 	oversize_t *sp;
1231 
1232 	/* locate correct insertion point in size-ordered list */
1233 	for (sp = oversize_list.next_bysize;
1234 	    sp != &oversize_list && (op->size > sp->size);
1235 	    sp = sp->next_bysize)
1236 		;
1237 
1238 	/* link into size-ordered list */
1239 	op->next_bysize = sp;
1240 	op->prev_bysize = sp->prev_bysize;
1241 	op->prev_bysize->next_bysize = op;
1242 	op->next_bysize->prev_bysize = op;
1243 
1244 	/*
1245 	 * link item into address-ordered list
1246 	 * (caller provides insertion point as an optimization)
1247 	 */
1248 	op->next_byaddr = nx;
1249 	op->prev_byaddr = nx->prev_byaddr;
1250 	op->prev_byaddr->next_byaddr = op;
1251 	op->next_byaddr->prev_byaddr = op;
1252 
1253 }
1254 
1255 static void
1256 unlink_oversize(oversize_t *lp)
1257 {
1258 	/* unlink from address list */
1259 	lp->prev_byaddr->next_byaddr = lp->next_byaddr;
1260 	lp->next_byaddr->prev_byaddr = lp->prev_byaddr;
1261 
1262 	/* unlink from size list */
1263 	lp->prev_bysize->next_bysize = lp->next_bysize;
1264 	lp->next_bysize->prev_bysize = lp->prev_bysize;
1265 }
1266 
1267 static void
1268 position_oversize_by_size(oversize_t *op)
1269 {
1270 	oversize_t *sp;
1271 
1272 	if (op->size > op->next_bysize->size ||
1273 	    op->size < op->prev_bysize->size) {
1274 
1275 		/* unlink from size list */
1276 		op->prev_bysize->next_bysize = op->next_bysize;
1277 		op->next_bysize->prev_bysize = op->prev_bysize;
1278 
1279 		/* locate correct insertion point in size-ordered list */
1280 		for (sp = oversize_list.next_bysize;
1281 		    sp != &oversize_list && (op->size > sp->size);
1282 		    sp = sp->next_bysize)
1283 			;
1284 
1285 		/* link into size-ordered list */
1286 		op->next_bysize = sp;
1287 		op->prev_bysize = sp->prev_bysize;
1288 		op->prev_bysize->next_bysize = op;
1289 		op->next_bysize->prev_bysize = op;
1290 	}
1291 }
1292 
1293 static void
1294 add_oversize(oversize_t *lp)
1295 {
1296 	int merge_flags = INSERT_ONLY;
1297 	oversize_t *nx;  	/* ptr to item right of insertion point */
1298 	oversize_t *pv;  	/* ptr to item left of insertion point */
1299 	uint_t size_lp, size_pv, size_nx;
1300 	uintptr_t endp_lp, endp_pv, endp_nx;
1301 
1302 	/*
1303 	 * Locate insertion point in address-ordered list
1304 	 */
1305 
1306 	for (nx = oversize_list.next_byaddr;
1307 	    nx != &oversize_list && (lp->addr > nx->addr);
1308 	    nx = nx->next_byaddr)
1309 		;
1310 
1311 	/*
1312 	 * Determine how to add chunk to oversize freelist
1313 	 */
1314 
1315 	size_lp = OVSZ_HEADER_SIZE + lp->size;
1316 	endp_lp = ALIGN((uintptr_t)lp + size_lp, MTMALLOC_MIN_ALIGN);
1317 	size_lp = endp_lp - (uintptr_t)lp;
1318 
1319 	pv = nx->prev_byaddr;
1320 
1321 	if (pv->size) {
1322 
1323 		size_pv = OVSZ_HEADER_SIZE + pv->size;
1324 		endp_pv = ALIGN((uintptr_t)pv + size_pv,
1325 		    MTMALLOC_MIN_ALIGN);
1326 		size_pv = endp_pv - (uintptr_t)pv;
1327 
1328 		/* Check for adjacency with left chunk */
1329 		if ((uintptr_t)lp == endp_pv)
1330 			merge_flags |= COALESCE_LEFT;
1331 	}
1332 
1333 	if (nx->size) {
1334 
1335 	    /* Check for adjacency with right chunk */
1336 	    if ((uintptr_t)nx == endp_lp) {
1337 		size_nx = OVSZ_HEADER_SIZE + nx->size;
1338 		endp_nx = ALIGN((uintptr_t)nx + size_nx,
1339 		    MTMALLOC_MIN_ALIGN);
1340 		size_nx = endp_nx - (uintptr_t)nx;
1341 		merge_flags |= COALESCE_RIGHT;
1342 	    }
1343 	}
1344 
1345 	/*
1346 	 * If MTDEBUGPATTERN==1, lp->addr will have been overwritten with
1347 	 * FREEPATTERN for lp->size bytes. If we can merge, the oversize
1348 	 * header(s) that will also become part of the memory available for
1349 	 * reallocation (ie lp and/or nx) must also be overwritten with
1350 	 * FREEPATTERN or we will SIGABRT when this memory is next reallocated.
1351 	 */
1352 	switch (merge_flags) {
1353 
1354 	case INSERT_ONLY:		/* Coalescing not possible */
1355 		insert_oversize(lp, nx);
1356 		break;
1357 	case COALESCE_LEFT:
1358 		pv->size += size_lp;
1359 		position_oversize_by_size(pv);
1360 		if (debugopt & MTDEBUGPATTERN)
1361 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1362 		break;
1363 	case COALESCE_RIGHT:
1364 		unlink_oversize(nx);
1365 		lp->size += size_nx;
1366 		insert_oversize(lp, pv->next_byaddr);
1367 		if (debugopt & MTDEBUGPATTERN)
1368 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1369 		break;
1370 	case COALESCE_WITH_BOTH_SIDES:	/* Merge (with right) to the left */
1371 		pv->size += size_lp + size_nx;
1372 		unlink_oversize(nx);
1373 		position_oversize_by_size(pv);
1374 		if (debugopt & MTDEBUGPATTERN) {
1375 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1376 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1377 		}
1378 		break;
1379 	}
1380 }
1381 
1382 /*
1383  * Find memory on our list that is at least size big. If we find a block that is
1384  * big enough, we break it up and return the associated oversize_t struct back
1385  * to the calling client. Any leftover piece of that block is returned to the
1386  * freelist.
1387  */
1388 static oversize_t *
1389 find_oversize(size_t size)
1390 {
1391 	oversize_t *wp = oversize_list.next_bysize;
1392 	while (wp != &oversize_list && size > wp->size)
1393 		wp = wp->next_bysize;
1394 
1395 	if (wp == &oversize_list) /* empty list or nothing big enough */
1396 		return (NULL);
1397 	/* breaking up a chunk of memory */
1398 	if ((long)((wp->size - (size + OVSZ_HEADER_SIZE + MTMALLOC_MIN_ALIGN)))
1399 	    > MAX_CACHED) {
1400 		caddr_t off;
1401 		oversize_t *np;
1402 		size_t osize;
1403 		off = (caddr_t)ALIGN(wp->addr + size,
1404 		    MTMALLOC_MIN_ALIGN);
1405 		osize = wp->size;
1406 		wp->size = (size_t)(off - wp->addr);
1407 		np = oversize_header_alloc((uintptr_t)off,
1408 		    osize - (wp->size + OVSZ_HEADER_SIZE));
1409 		if ((long)np->size < 0)
1410 			abort();
1411 		unlink_oversize(wp);
1412 		add_oversize(np);
1413 	} else {
1414 		unlink_oversize(wp);
1415 	}
1416 	return (wp);
1417 }
1418 
1419 static void
1420 copy_pattern(uint32_t pattern, void *buf_arg, size_t size)
1421 {
1422 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1423 	uint32_t *buf = buf_arg;
1424 
1425 	while (buf < bufend - 3) {
1426 		buf[3] = buf[2] = buf[1] = buf[0] = pattern;
1427 		buf += 4;
1428 	}
1429 	while (buf < bufend)
1430 		*buf++ = pattern;
1431 }
1432 
1433 static void *
1434 verify_pattern(uint32_t pattern, void *buf_arg, size_t size)
1435 {
1436 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1437 	uint32_t *buf;
1438 
1439 	for (buf = buf_arg; buf < bufend; buf++)
1440 		if (*buf != pattern)
1441 			return (buf);
1442 	return (NULL);
1443 }
1444 
1445 static void
1446 free_oversize(oversize_t *ovp)
1447 {
1448 	assert(((uintptr_t)ovp->addr & 7) == 0); /* are we 8 byte aligned */
1449 	assert(ovp->size > MAX_CACHED);
1450 
1451 	ovp->next_bysize = ovp->prev_bysize = NULL;
1452 	ovp->next_byaddr = ovp->prev_byaddr = NULL;
1453 	(void) mutex_lock(&oversize_lock);
1454 	add_oversize(ovp);
1455 	(void) mutex_unlock(&oversize_lock);
1456 }
1457 
1458 static oversize_t *
1459 oversize_header_alloc(uintptr_t mem, size_t size)
1460 {
1461 	oversize_t *ovsz_hdr;
1462 
1463 	assert(size > MAX_CACHED);
1464 
1465 	ovsz_hdr = (oversize_t *)mem;
1466 	ovsz_hdr->prev_bysize = NULL;
1467 	ovsz_hdr->next_bysize = NULL;
1468 	ovsz_hdr->prev_byaddr = NULL;
1469 	ovsz_hdr->next_byaddr = NULL;
1470 	ovsz_hdr->hash_next = NULL;
1471 	ovsz_hdr->size = size;
1472 	mem += OVSZ_SIZE;
1473 	*(uintptr_t *)mem = MTMALLOC_OVERSIZE_MAGIC;
1474 	mem += OVERHEAD;
1475 	assert(((uintptr_t)mem & 7) == 0); /* are we 8 byte aligned */
1476 	ovsz_hdr->addr = (caddr_t)mem;
1477 	return (ovsz_hdr);
1478 }
1479 
1480 static void
1481 malloc_prepare()
1482 {
1483 	percpu_t *cpuptr;
1484 	cache_head_t *cachehead;
1485 	cache_t *thiscache;
1486 
1487 	(void) mutex_lock(&oversize_lock);
1488 	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
1489 		(void) mutex_lock(&cpuptr->mt_parent_lock);
1490 		for (cachehead = &cpuptr->mt_caches[0];
1491 		    cachehead < &cpuptr->mt_caches[NUM_CACHES];
1492 		    cachehead++) {
1493 			for (thiscache = cachehead->mt_cache;
1494 			    thiscache != NULL;
1495 			    thiscache = thiscache->mt_next) {
1496 				(void) mutex_lock(
1497 				    &thiscache->mt_cache_lock);
1498 			}
1499 		}
1500 	}
1501 }
1502 
1503 static void
1504 malloc_release()
1505 {
1506 	percpu_t *cpuptr;
1507 	cache_head_t *cachehead;
1508 	cache_t *thiscache;
1509 
1510 	for (cpuptr = &cpu_list[ncpus - 1]; cpuptr >= &cpu_list[0]; cpuptr--) {
1511 		for (cachehead = &cpuptr->mt_caches[NUM_CACHES - 1];
1512 		    cachehead >= &cpuptr->mt_caches[0];
1513 		    cachehead--) {
1514 			for (thiscache = cachehead->mt_cache;
1515 			    thiscache != NULL;
1516 			    thiscache = thiscache->mt_next) {
1517 				(void) mutex_unlock(
1518 				    &thiscache->mt_cache_lock);
1519 			}
1520 		}
1521 		(void) mutex_unlock(&cpuptr->mt_parent_lock);
1522 	}
1523 	(void) mutex_unlock(&oversize_lock);
1524 }
1525 
1526 #pragma init(malloc_init)
1527 static void
1528 malloc_init(void)
1529 {
1530 	/*
1531 	 * This works in the init section for this library
1532 	 * because setup_caches() doesn't call anything in libc
1533 	 * that calls malloc().  If it did, disaster would ensue.
1534 	 *
1535 	 * For this to work properly, this library must be the first
1536 	 * one to have its init section called (after libc) by the
1537 	 * dynamic linker.  If some other library's init section
1538 	 * ran first and called malloc(), disaster would ensue.
1539 	 * Because this is an interposer library for malloc(), the
1540 	 * dynamic linker arranges for its init section to run first.
1541 	 */
1542 	(void) setup_caches();
1543 
1544 	(void) pthread_atfork(malloc_prepare, malloc_release, malloc_release);
1545 }
1546