xref: /illumos-gate/usr/src/lib/libmtmalloc/common/mtmalloc.c (revision 67a4bb8f9ad4c49e9aa9e21e2114a7c093c3a73a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <mtmalloc.h>
28 #include "mtmalloc_impl.h"
29 #include <unistd.h>
30 #include <synch.h>
31 #include <thread.h>
32 #include <pthread.h>
33 #include <stdio.h>
34 #include <limits.h>
35 #include <errno.h>
36 #include <string.h>
37 #include <strings.h>
38 #include <sys/param.h>
39 #include <sys/sysmacros.h>
40 
41 /*
42  * To turn on the asserts just compile -DDEBUG
43  */
44 
45 #ifndef	DEBUG
46 #define	NDEBUG
47 #endif
48 
49 #include <assert.h>
50 
51 /*
52  * The MT hot malloc implementation contained herein is designed to be
53  * plug-compatible with the libc version of malloc. It is not intended
54  * to replace that implementation until we decide that it is ok to break
55  * customer apps (Solaris 3.0).
56  *
57  * For requests up to 2^^16, the allocator initializes itself into NCPUS
58  * worth of chains of caches. When a memory request is made, the calling thread
59  * is vectored into one of NCPUS worth of caches.  The LWP id gives us a cheap,
60  * contention-reducing index to use, eventually, this should be replaced with
61  * the actual CPU sequence number, when an interface to get it is available.
62  *
63  * Once the thread is vectored into one of the list of caches the real
64  * allocation of the memory begins. The size is determined to figure out which
65  * bucket the allocation should be satisfied from. The management of free
66  * buckets is done via a bitmask. A free bucket is represented by a 1. The
67  * first free bit represents the first free bucket. The position of the bit,
68  * represents the position of the bucket in the arena.
69  *
70  * When the memory from the arena is handed out, the address of the cache
71  * control structure is written in the word preceeding the returned memory.
72  * This cache control address is used during free() to mark the buffer free
73  * in the cache control structure.
74  *
75  * When all available memory in a cache has been depleted, a new chunk of memory
76  * is allocated via sbrk(). The new cache is allocated from this chunk of memory
77  * and initialized in the function create_cache(). New caches are installed at
78  * the front of a singly linked list of the same size memory pools. This helps
79  * to ensure that there will tend to be available memory in the beginning of the
80  * list.
81  *
82  * Long linked lists hurt performance. To decrease this effect, there is a
83  * tunable, requestsize, that bumps up the sbrk allocation size and thus
84  * increases the number of available blocks within an arena.  We also keep
85  * a "hint" for each cache list, which is the last cache in the list allocated
86  * from.  This lowers the cost of searching if there are a lot of fully
87  * allocated blocks at the front of the list.
88  *
89  * For requests greater than 2^^16 (oversize allocations), there are two pieces
90  * of overhead. There is the OVERHEAD used to hold the cache addr
91  * (&oversize_list), plus an oversize_t structure to further describe the block.
92  *
93  * The oversize list is kept as defragmented as possible by coalescing
94  * freed oversized allocations with adjacent neighbors.
95  *
96  * Addresses handed out are stored in a hash table, and are aligned on
97  * MTMALLOC_MIN_ALIGN-byte boundaries at both ends. Request sizes are rounded-up
98  * where necessary in order to achieve this. This eases the implementation of
99  * MTDEBUGPATTERN and MTINITPATTERN, particularly where coalescing occurs.
100  *
101  * A memalign allocation takes memalign header overhead.  There's two
102  * types of memalign headers distinguished by MTMALLOC_MEMALIGN_MAGIC
103  * and MTMALLOC_MEMALIGN_MIN_MAGIC.  When the size of memory taken to
104  * get to the aligned address from malloc'ed address is the minimum size
105  * OVERHEAD, we create a header taking only one OVERHEAD space with magic
106  * number MTMALLOC_MEMALIGN_MIN_MAGIC, and we know by subtracting OVERHEAD
107  * from memaligned address, we can get to the malloc'ed address. Otherwise,
108  * we create a memalign header taking two OVERHEAD space, one stores
109  * MTMALLOC_MEMALIGN_MAGIC magic number, the other one points back to the
110  * malloc'ed address.
111  */
112 
113 #if defined(__i386) || defined(__amd64)
114 #include <arpa/inet.h>	/* for htonl() */
115 #endif
116 
117 static void * morecore(size_t);
118 static void create_cache(cache_t *, size_t bufsize, uint_t hunks);
119 static void * malloc_internal(size_t, percpu_t *);
120 static void * oversize(size_t);
121 static oversize_t *find_oversize(size_t);
122 static void add_oversize(oversize_t *);
123 static void copy_pattern(uint32_t, void *, size_t);
124 static void * verify_pattern(uint32_t, void *, size_t);
125 static void reinit_cpu_list(void);
126 static void reinit_cache(cache_t *);
127 static void free_oversize(oversize_t *);
128 static oversize_t *oversize_header_alloc(uintptr_t, size_t);
129 
130 /*
131  * oversize hash table stuff
132  */
133 #define	NUM_BUCKETS	67	/* must be prime */
134 #define	HASH_OVERSIZE(caddr)	((uintptr_t)(caddr) % NUM_BUCKETS)
135 oversize_t *ovsz_hashtab[NUM_BUCKETS];
136 
137 #define	ALIGN(x, a)	((((uintptr_t)(x) + ((uintptr_t)(a) - 1)) \
138 			& ~((uintptr_t)(a) - 1)))
139 
140 /* need this to deal with little endianess of x86 */
141 #if defined(__i386) || defined(__amd64)
142 #define	FLIP_EM(x)	htonl((x))
143 #else
144 #define	FLIP_EM(x)	(x)
145 #endif
146 
147 #define	INSERT_ONLY			0
148 #define	COALESCE_LEFT			0x00000001
149 #define	COALESCE_RIGHT			0x00000002
150 #define	COALESCE_WITH_BOTH_SIDES	(COALESCE_LEFT | COALESCE_RIGHT)
151 
152 #define	OVERHEAD	8	/* size needed to write cache addr */
153 #define	HUNKSIZE	8192	/* just a multiplier */
154 
155 #define	MAX_CACHED_SHIFT	16	/* 64K is the max cached size */
156 #define	MAX_CACHED		(1 << MAX_CACHED_SHIFT)
157 #define	MIN_CACHED_SHIFT	4	/* smaller requests rounded up */
158 #define	MTMALLOC_MIN_ALIGN	8	/* min guaranteed alignment */
159 
160 /* maximum size before overflow */
161 #define	MAX_MTMALLOC	(SIZE_MAX - (SIZE_MAX % MTMALLOC_MIN_ALIGN) \
162 			- OVSZ_HEADER_SIZE)
163 
164 #define	NUM_CACHES	(MAX_CACHED_SHIFT - MIN_CACHED_SHIFT + 1)
165 #define	CACHELIST_SIZE	ALIGN(NUM_CACHES * sizeof (cache_head_t), \
166     CACHE_COHERENCY_UNIT)
167 
168 #define	MINSIZE		9	/* for requestsize, tunable */
169 #define	MAXSIZE		256	/* arbitrary, big enough, for requestsize */
170 
171 #define	FREEPATTERN	0xdeadbeef /* debug fill pattern for free buf */
172 #define	INITPATTERN	0xbaddcafe /* debug fill pattern for new buf */
173 
174 #define	misaligned(p)	((unsigned)(p) & (sizeof (int) - 1))
175 #define	IS_OVERSIZE(x, y)	(((x) < (y)) && (((x) > MAX_CACHED)? 1 : 0))
176 
177 static long requestsize = MINSIZE; /* 9 pages per cache; tunable; 9 is min */
178 
179 static uint_t cpu_mask;
180 static curcpu_func curcpu;
181 
182 static int32_t debugopt;
183 static int32_t reinit;
184 
185 static percpu_t *cpu_list;
186 static oversize_t oversize_list;
187 static mutex_t oversize_lock = DEFAULTMUTEX;
188 
189 static int ncpus = 0;
190 
191 #define	MTMALLOC_OVERSIZE_MAGIC		((uintptr_t)&oversize_list)
192 #define	MTMALLOC_MEMALIGN_MAGIC		((uintptr_t)&oversize_list + 1)
193 #define	MTMALLOC_MEMALIGN_MIN_MAGIC	((uintptr_t)&oversize_list + 2)
194 
195 /*
196  * We require allocations handed out to be aligned on MTMALLOC_MIN_ALIGN-byte
197  * boundaries. We round up sizeof (oversize_t) (when necessary) to ensure that
198  * this is achieved.
199  */
200 #define	OVSZ_SIZE		(ALIGN(sizeof (oversize_t), MTMALLOC_MIN_ALIGN))
201 #define	OVSZ_HEADER_SIZE	(OVSZ_SIZE + OVERHEAD)
202 
203 /*
204  * memalign header takes 2 OVERHEAD space.  One for memalign magic, and the
205  * other one points back to the start address of originally allocated space.
206  */
207 #define	MEMALIGN_HEADER_SIZE	2 * OVERHEAD
208 #define	MEMALIGN_HEADER_ALLOC(x, shift, malloc_addr)\
209 	if (shift == OVERHEAD)\
210 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
211 			MTMALLOC_MEMALIGN_MIN_MAGIC; \
212 	else {\
213 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
214 			MTMALLOC_MEMALIGN_MAGIC; \
215 		*((uintptr_t *)((caddr_t)x - 2 * OVERHEAD)) = \
216 			(uintptr_t)malloc_addr; \
217 	}
218 
219 /*
220  * Add big to the oversize hash table at the head of the relevant bucket.
221  */
222 static void
223 insert_hash(oversize_t *big)
224 {
225 	caddr_t ret = big->addr;
226 	int bucket = HASH_OVERSIZE(ret);
227 
228 	assert(MUTEX_HELD(&oversize_lock));
229 	big->hash_next = ovsz_hashtab[bucket];
230 	ovsz_hashtab[bucket] = big;
231 }
232 
233 void *
234 malloc(size_t bytes)
235 {
236 	percpu_t *list_rotor;
237 	uint_t	list_index;
238 
239 	if (bytes > MAX_CACHED)
240 		return (oversize(bytes));
241 
242 	list_index = (curcpu() & cpu_mask);
243 
244 	list_rotor = &cpu_list[list_index];
245 
246 	return (malloc_internal(bytes, list_rotor));
247 }
248 
249 void *
250 realloc(void * ptr, size_t bytes)
251 {
252 	void *new, *data_ptr;
253 	cache_t *cacheptr;
254 	caddr_t mem;
255 	size_t shift = 0;
256 
257 	if (ptr == NULL)
258 		return (malloc(bytes));
259 
260 	if (bytes == 0) {
261 		free(ptr);
262 		return (NULL);
263 	}
264 
265 	data_ptr = ptr;
266 	mem = (caddr_t)ptr - OVERHEAD;
267 
268 	/*
269 	 * Optimization possibility :
270 	 *	p = malloc(64);
271 	 *	q = realloc(p, 64);
272 	 * q can be same as p.
273 	 * Apply this optimization for the normal
274 	 * sized caches for now.
275 	 */
276 	if (*(uintptr_t *)mem < MTMALLOC_OVERSIZE_MAGIC ||
277 	    *(uintptr_t *)mem > MTMALLOC_MEMALIGN_MIN_MAGIC) {
278 		cacheptr = (cache_t *)*(uintptr_t *)mem;
279 		if (bytes <= (cacheptr->mt_size - OVERHEAD))
280 			return (ptr);
281 	}
282 
283 	new = malloc(bytes);
284 
285 	if (new == NULL)
286 		return (NULL);
287 
288 	/*
289 	 * If new == ptr, ptr has previously been freed. Passing a freed pointer
290 	 * to realloc() is not allowed - unless the caller specifically states
291 	 * otherwise, in which case we must avoid freeing ptr (ie new) before we
292 	 * return new. There is (obviously) no requirement to memcpy() ptr to
293 	 * new before we return.
294 	 */
295 	if (new == ptr) {
296 		if (!(debugopt & MTDOUBLEFREE))
297 			abort();
298 		return (new);
299 	}
300 
301 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
302 		mem -= OVERHEAD;
303 		ptr = (void *)*(uintptr_t *)mem;
304 		mem = (caddr_t)ptr - OVERHEAD;
305 		shift = (size_t)((uintptr_t)data_ptr - (uintptr_t)ptr);
306 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
307 		ptr = (void *) mem;
308 		mem -= OVERHEAD;
309 		shift = OVERHEAD;
310 	}
311 
312 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
313 		oversize_t *old;
314 
315 		old = (oversize_t *)(mem - OVSZ_SIZE);
316 		(void) memcpy(new, data_ptr, MIN(bytes, old->size - shift));
317 		free(ptr);
318 		return (new);
319 	}
320 
321 	cacheptr = (cache_t *)*(uintptr_t *)mem;
322 
323 	(void) memcpy(new, data_ptr,
324 	    MIN(cacheptr->mt_size - OVERHEAD - shift, bytes));
325 	free(ptr);
326 
327 	return (new);
328 }
329 
330 void *
331 calloc(size_t nelem, size_t bytes)
332 {
333 	void * ptr;
334 	size_t size;
335 
336 	if (nelem == 0 || bytes == 0) {
337 		size = 0;
338 	} else {
339 		size = nelem * bytes;
340 
341 		/* check for overflow */
342 		if ((size / nelem) != bytes) {
343 			errno = ENOMEM;
344 			return (NULL);
345 		}
346 	}
347 
348 	ptr = malloc(size);
349 	if (ptr == NULL)
350 		return (NULL);
351 	(void) memset(ptr, 0, size);
352 
353 	return (ptr);
354 }
355 
356 void
357 free(void * ptr)
358 {
359 	cache_t *cacheptr;
360 	caddr_t mem;
361 	int32_t i;
362 	caddr_t freeblocks;
363 	uintptr_t offset;
364 	uchar_t mask;
365 	int32_t which_bit, num_bytes;
366 
367 	if (ptr == NULL)
368 		return;
369 
370 	mem = (caddr_t)ptr - OVERHEAD;
371 
372 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
373 		mem -= OVERHEAD;
374 		ptr = (void *)*(uintptr_t *)mem;
375 		mem = (caddr_t)ptr - OVERHEAD;
376 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
377 		ptr = (void *) mem;
378 		mem -= OVERHEAD;
379 	}
380 
381 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
382 		oversize_t *big, **opp;
383 		int bucket;
384 
385 		big = (oversize_t *)(mem - OVSZ_SIZE);
386 		(void) mutex_lock(&oversize_lock);
387 
388 		bucket = HASH_OVERSIZE(big->addr);
389 		for (opp = &ovsz_hashtab[bucket]; *opp != NULL;
390 		    opp = &(*opp)->hash_next)
391 			if (*opp == big)
392 				break;
393 
394 		if (*opp == NULL) {
395 			if (!(debugopt & MTDOUBLEFREE))
396 				abort();
397 			(void) mutex_unlock(&oversize_lock);
398 			return;
399 		}
400 
401 		*opp = big->hash_next;	/* remove big from the hash table */
402 		big->hash_next = NULL;
403 
404 		if (debugopt & MTDEBUGPATTERN)
405 			copy_pattern(FREEPATTERN, ptr, big->size);
406 		add_oversize(big);
407 		(void) mutex_unlock(&oversize_lock);
408 		return;
409 	}
410 
411 	cacheptr = (cache_t *)*(uintptr_t *)mem;
412 	freeblocks = cacheptr->mt_freelist;
413 
414 	/*
415 	 * This is the distance measured in bits into the arena.
416 	 * The value of offset is in bytes but there is a 1-1 correlation
417 	 * between distance into the arena and distance into the
418 	 * freelist bitmask.
419 	 */
420 	offset = mem - cacheptr->mt_arena;
421 
422 	/*
423 	 * i is total number of bits to offset into freelist bitmask.
424 	 */
425 
426 	i = offset / cacheptr->mt_size;
427 
428 	num_bytes = i >> 3;
429 
430 	/*
431 	 * which_bit is the bit offset into the byte in the freelist.
432 	 * if our freelist bitmask looks like 0xf3 and we are freeing
433 	 * block 5 (ie: the 6th block) our mask will be 0xf7 after
434 	 * the free. Things go left to right that's why the mask is 0x80
435 	 * and not 0x01.
436 	 */
437 	which_bit = i - (num_bytes << 3);
438 
439 	mask = 0x80 >> which_bit;
440 
441 	freeblocks += num_bytes;
442 
443 	if (debugopt & MTDEBUGPATTERN)
444 		copy_pattern(FREEPATTERN, ptr, cacheptr->mt_size - OVERHEAD);
445 
446 	(void) mutex_lock(&cacheptr->mt_cache_lock);
447 
448 	if (*freeblocks & mask) {
449 		if (!(debugopt & MTDOUBLEFREE))
450 			abort();
451 	} else {
452 		*freeblocks |= mask;
453 		cacheptr->mt_nfree++;
454 	}
455 
456 	(void) mutex_unlock(&cacheptr->mt_cache_lock);
457 }
458 
459 void *
460 memalign(size_t alignment, size_t size)
461 {
462 	size_t alloc_size;
463 	uintptr_t offset;
464 	void *alloc_buf;
465 	void *ret_buf;
466 
467 	if (size == 0 || alignment == 0 || misaligned(alignment) ||
468 	    (alignment & (alignment - 1)) != 0) {
469 		errno = EINVAL;
470 		return (NULL);
471 	}
472 
473 	/* <= MTMALLOC_MIN_ALIGN, malloc can provide directly */
474 	if (alignment <= MTMALLOC_MIN_ALIGN)
475 		return (malloc(size));
476 
477 	alloc_size = size + alignment - MTMALLOC_MIN_ALIGN;
478 
479 	if (alloc_size < size) { /* overflow */
480 		errno = ENOMEM;
481 		return (NULL);
482 	}
483 
484 	alloc_buf = malloc(alloc_size);
485 
486 	if (alloc_buf == NULL)
487 		/* malloc sets errno */
488 		return (NULL);
489 
490 	/*
491 	 * If alloc_size > MAX_CACHED, malloc() will have returned a multiple of
492 	 * MTMALLOC_MIN_ALIGN, having rounded-up alloc_size if necessary. Since
493 	 * we will use alloc_size to return the excess fragments to the free
494 	 * list, we also round-up alloc_size if necessary.
495 	 */
496 	if ((alloc_size > MAX_CACHED) &&
497 	    (alloc_size & (MTMALLOC_MIN_ALIGN - 1)))
498 		alloc_size = ALIGN(alloc_size, MTMALLOC_MIN_ALIGN);
499 
500 	if ((offset = (uintptr_t)alloc_buf & (alignment - 1)) == 0) {
501 		/* aligned correctly */
502 
503 		size_t frag_size = alloc_size -
504 		    (size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
505 
506 		/*
507 		 * If the leftover piece of the memory > MAX_CACHED,
508 		 * split off the piece and return it back to the freelist.
509 		 */
510 		if (IS_OVERSIZE(frag_size, alloc_size)) {
511 			oversize_t *orig, *tail;
512 			uintptr_t taddr;
513 			size_t data_size;
514 			taddr = ALIGN((uintptr_t)alloc_buf + size,
515 			    MTMALLOC_MIN_ALIGN);
516 			data_size = taddr - (uintptr_t)alloc_buf;
517 			orig = (oversize_t *)((uintptr_t)alloc_buf -
518 			    OVSZ_HEADER_SIZE);
519 			frag_size = orig->size - data_size -
520 			    OVSZ_HEADER_SIZE;
521 			orig->size = data_size;
522 			tail = oversize_header_alloc(taddr, frag_size);
523 			free_oversize(tail);
524 		}
525 		ret_buf = alloc_buf;
526 	} else {
527 		uchar_t	oversize_bits = 0;
528 		size_t	head_sz, data_sz, tail_sz;
529 		uintptr_t ret_addr, taddr, shift, tshift;
530 		oversize_t *orig, *tail, *big;
531 		size_t tsize;
532 
533 		/* needs to be aligned */
534 		shift = alignment - offset;
535 
536 		assert(shift >= MTMALLOC_MIN_ALIGN);
537 
538 		ret_addr = ((uintptr_t)alloc_buf + shift);
539 		ret_buf = (void *)ret_addr;
540 
541 		if (alloc_size <= MAX_CACHED) {
542 			MEMALIGN_HEADER_ALLOC(ret_addr, shift, alloc_buf);
543 			return (ret_buf);
544 		}
545 
546 		/*
547 		 * Only check for the fragments when the memory is allocted
548 		 * from oversize_list.  Split off a fragment and return it
549 		 * to the oversize freelist when it's > MAX_CACHED.
550 		 */
551 
552 		head_sz = shift - MAX(MEMALIGN_HEADER_SIZE, OVSZ_HEADER_SIZE);
553 
554 		tail_sz = alloc_size -
555 		    (shift + size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
556 
557 		oversize_bits |= IS_OVERSIZE(head_sz, alloc_size) |
558 		    IS_OVERSIZE(size, alloc_size) << DATA_SHIFT |
559 		    IS_OVERSIZE(tail_sz, alloc_size) << TAIL_SHIFT;
560 
561 		switch (oversize_bits) {
562 			case NONE_OVERSIZE:
563 			case DATA_OVERSIZE:
564 				MEMALIGN_HEADER_ALLOC(ret_addr, shift,
565 				    alloc_buf);
566 				break;
567 			case HEAD_OVERSIZE:
568 				/*
569 				 * If we can extend data > MAX_CACHED and have
570 				 * head still > MAX_CACHED, we split head-end
571 				 * as the case of head-end and data oversized,
572 				 * otherwise just create memalign header.
573 				 */
574 				tsize = (shift + size) - (MAX_CACHED + 8 +
575 				    MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
576 
577 				if (!IS_OVERSIZE(tsize, alloc_size)) {
578 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
579 					    alloc_buf);
580 					break;
581 				} else {
582 					tsize += OVSZ_HEADER_SIZE;
583 					taddr = ALIGN((uintptr_t)alloc_buf +
584 					    tsize, MTMALLOC_MIN_ALIGN);
585 					tshift = ret_addr - taddr;
586 					MEMALIGN_HEADER_ALLOC(ret_addr, tshift,
587 					    taddr);
588 					ret_addr = taddr;
589 					shift = ret_addr - (uintptr_t)alloc_buf;
590 				}
591 				/* FALLTHROUGH */
592 			case HEAD_AND_DATA_OVERSIZE:
593 				/*
594 				 * Split off the head fragment and
595 				 * return it back to oversize freelist.
596 				 * Create oversize header for the piece
597 				 * of (data + tail fragment).
598 				 */
599 				orig = (oversize_t *)((uintptr_t)alloc_buf -
600 				    OVSZ_HEADER_SIZE);
601 				big = oversize_header_alloc(ret_addr -
602 				    OVSZ_HEADER_SIZE, (orig->size - shift));
603 				(void) mutex_lock(&oversize_lock);
604 				insert_hash(big);
605 				(void) mutex_unlock(&oversize_lock);
606 				orig->size = shift - OVSZ_HEADER_SIZE;
607 
608 				/* free up the head fragment */
609 				free_oversize(orig);
610 				break;
611 			case TAIL_OVERSIZE:
612 				/*
613 				 * If we can extend data > MAX_CACHED and have
614 				 * tail-end still > MAX_CACHED, we split tail
615 				 * end, otherwise just create memalign header.
616 				 */
617 				orig = (oversize_t *)((uintptr_t)alloc_buf -
618 				    OVSZ_HEADER_SIZE);
619 				tsize =  orig->size - (MAX_CACHED + 8 +
620 				    shift + OVSZ_HEADER_SIZE +
621 				    MTMALLOC_MIN_ALIGN);
622 				if (!IS_OVERSIZE(tsize, alloc_size)) {
623 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
624 					    alloc_buf);
625 					break;
626 				} else {
627 					size = MAX_CACHED + 8;
628 				}
629 				/* FALLTHROUGH */
630 			case DATA_AND_TAIL_OVERSIZE:
631 				/*
632 				 * Split off the tail fragment and
633 				 * return it back to oversize freelist.
634 				 * Create memalign header and adjust
635 				 * the size for the piece of
636 				 * (head fragment + data).
637 				 */
638 				taddr = ALIGN(ret_addr + size,
639 				    MTMALLOC_MIN_ALIGN);
640 				data_sz = (size_t)(taddr -
641 				    (uintptr_t)alloc_buf);
642 				orig = (oversize_t *)((uintptr_t)alloc_buf -
643 				    OVSZ_HEADER_SIZE);
644 				tsize = orig->size - data_sz;
645 				orig->size = data_sz;
646 				MEMALIGN_HEADER_ALLOC(ret_buf, shift,
647 				    alloc_buf);
648 				tsize -= OVSZ_HEADER_SIZE;
649 				tail = oversize_header_alloc(taddr,  tsize);
650 				free_oversize(tail);
651 				break;
652 			case HEAD_AND_TAIL_OVERSIZE:
653 				/*
654 				 * Split off the head fragment.
655 				 * We try to free up tail-end when we can
656 				 * extend data size to (MAX_CACHED + 8)
657 				 * and remain tail-end oversized.
658 				 * The bottom line is all split pieces
659 				 * should be oversize in size.
660 				 */
661 				orig = (oversize_t *)((uintptr_t)alloc_buf -
662 				    OVSZ_HEADER_SIZE);
663 				tsize =  orig->size - (MAX_CACHED + 8 +
664 				    OVSZ_HEADER_SIZE + shift +
665 				    MTMALLOC_MIN_ALIGN);
666 
667 				if (!IS_OVERSIZE(tsize, alloc_size)) {
668 					/*
669 					 * If the chunk is not big enough
670 					 * to make both data and tail oversize
671 					 * we just keep them as one piece.
672 					 */
673 					big = oversize_header_alloc(ret_addr -
674 					    OVSZ_HEADER_SIZE,
675 					    orig->size - shift);
676 					(void) mutex_lock(&oversize_lock);
677 					insert_hash(big);
678 					(void) mutex_unlock(&oversize_lock);
679 					orig->size = shift - OVSZ_HEADER_SIZE;
680 					free_oversize(orig);
681 					break;
682 				} else {
683 					/*
684 					 * extend data size > MAX_CACHED
685 					 * and handle it as head, data, tail
686 					 * are all oversized.
687 					 */
688 					size = MAX_CACHED + 8;
689 				}
690 				/* FALLTHROUGH */
691 			case ALL_OVERSIZE:
692 				/*
693 				 * split off the head and tail fragments,
694 				 * return them back to the oversize freelist.
695 				 * Alloc oversize header for data seg.
696 				 */
697 				orig = (oversize_t *)((uintptr_t)alloc_buf -
698 				    OVSZ_HEADER_SIZE);
699 				tsize = orig->size;
700 				orig->size = shift - OVSZ_HEADER_SIZE;
701 				free_oversize(orig);
702 
703 				taddr = ALIGN(ret_addr + size,
704 				    MTMALLOC_MIN_ALIGN);
705 				data_sz = taddr - ret_addr;
706 				assert(tsize > (shift + data_sz +
707 				    OVSZ_HEADER_SIZE));
708 				tail_sz = tsize -
709 				    (shift + data_sz + OVSZ_HEADER_SIZE);
710 
711 				/* create oversize header for data seg */
712 				big = oversize_header_alloc(ret_addr -
713 				    OVSZ_HEADER_SIZE, data_sz);
714 				(void) mutex_lock(&oversize_lock);
715 				insert_hash(big);
716 				(void) mutex_unlock(&oversize_lock);
717 
718 				/* create oversize header for tail fragment */
719 				tail = oversize_header_alloc(taddr, tail_sz);
720 				free_oversize(tail);
721 				break;
722 			default:
723 				/* should not reach here */
724 				assert(0);
725 		}
726 	}
727 	return (ret_buf);
728 }
729 
730 
731 void *
732 valloc(size_t size)
733 {
734 	static unsigned pagesize;
735 
736 	if (size == 0)
737 		return (NULL);
738 
739 	if (!pagesize)
740 		pagesize = sysconf(_SC_PAGESIZE);
741 
742 	return (memalign(pagesize, size));
743 }
744 
745 void
746 mallocctl(int cmd, long value)
747 {
748 	switch (cmd) {
749 
750 	case MTDEBUGPATTERN:
751 		/*
752 		 * Reinitialize free blocks in case malloc() is called prior
753 		 * to mallocctl().
754 		 */
755 		if (value && !(debugopt & cmd)) {
756 			reinit++;
757 			debugopt |= cmd;
758 			reinit_cpu_list();
759 		}
760 		/*FALLTHRU*/
761 	case MTDOUBLEFREE:
762 	case MTINITBUFFER:
763 		if (value)
764 			debugopt |= cmd;
765 		else
766 			debugopt &= ~cmd;
767 		break;
768 	case MTCHUNKSIZE:
769 		if (value >= MINSIZE && value <= MAXSIZE)
770 			requestsize = value;
771 		break;
772 	default:
773 		break;
774 	}
775 }
776 
777 /*
778  * Initialization function, called from the init section of the library.
779  * No locking is required here because we are single-threaded during
780  * library initialization.
781  */
782 static void
783 setup_caches(void)
784 {
785 	uintptr_t oldbrk;
786 	uintptr_t newbrk;
787 
788 	size_t cache_space_needed;
789 	size_t padding;
790 
791 	curcpu_func new_curcpu;
792 	uint_t new_cpu_mask;
793 	percpu_t *new_cpu_list;
794 
795 	uint_t i, j;
796 	uintptr_t list_addr;
797 
798 	/*
799 	 * Get a decent "current cpu identifier", to be used to reduce
800 	 * contention.  Eventually, this should be replaced by an interface
801 	 * to get the actual CPU sequence number in libthread/liblwp.
802 	 */
803 	new_curcpu = (curcpu_func)thr_self;
804 	if ((ncpus = 2 * sysconf(_SC_NPROCESSORS_CONF)) <= 0)
805 		ncpus = 4; /* decent default value */
806 
807 	/* round ncpus up to a power of 2 */
808 	while (ncpus & (ncpus - 1))
809 		ncpus++;
810 
811 	new_cpu_mask = ncpus - 1;	/* create the cpu mask */
812 
813 	/*
814 	 * We now do some magic with the brk.  What we want to get in the
815 	 * end is a bunch of well-aligned stuff in a big initial allocation.
816 	 * Along the way, we do sanity checks to make sure no one else has
817 	 * touched the brk (which shouldn't happen, but it's always good to
818 	 * check)
819 	 *
820 	 * First, make sure sbrk is sane, and store the current brk in oldbrk.
821 	 */
822 	oldbrk = (uintptr_t)sbrk(0);
823 	if ((void *)oldbrk == (void *)-1)
824 		abort();	/* sbrk is broken -- we're doomed. */
825 
826 	/*
827 	 * Now, align the brk to a multiple of CACHE_COHERENCY_UNIT, so that
828 	 * the percpu structures and cache lists will be properly aligned.
829 	 *
830 	 *   2.  All hunks will be page-aligned, assuming HUNKSIZE >= PAGESIZE,
831 	 *	so they can be paged out individually.
832 	 */
833 	newbrk = ALIGN(oldbrk, CACHE_COHERENCY_UNIT);
834 	if (newbrk != oldbrk && (uintptr_t)sbrk(newbrk - oldbrk) != oldbrk)
835 		abort();	/* sbrk is broken -- we're doomed. */
836 
837 	/*
838 	 * For each cpu, there is one percpu_t and a list of caches
839 	 */
840 	cache_space_needed = ncpus * (sizeof (percpu_t) + CACHELIST_SIZE);
841 
842 	new_cpu_list = (percpu_t *)sbrk(cache_space_needed);
843 
844 	if (new_cpu_list == (percpu_t *)-1 ||
845 	    (uintptr_t)new_cpu_list != newbrk)
846 		abort();	/* sbrk is broken -- we're doomed. */
847 
848 	/*
849 	 * Finally, align the brk to HUNKSIZE so that all hunks are
850 	 * page-aligned, to avoid edge-effects.
851 	 */
852 
853 	newbrk = (uintptr_t)new_cpu_list + cache_space_needed;
854 
855 	padding = ALIGN(newbrk, HUNKSIZE) - newbrk;
856 
857 	if (padding > 0 && (uintptr_t)sbrk(padding) != newbrk)
858 		abort();	/* sbrk is broken -- we're doomed. */
859 
860 	list_addr = ((uintptr_t)new_cpu_list + (sizeof (percpu_t) * ncpus));
861 
862 	/* initialize the percpu list */
863 	for (i = 0; i < ncpus; i++) {
864 		new_cpu_list[i].mt_caches = (cache_head_t *)list_addr;
865 		for (j = 0; j < NUM_CACHES; j++) {
866 			new_cpu_list[i].mt_caches[j].mt_cache = NULL;
867 			new_cpu_list[i].mt_caches[j].mt_hint = NULL;
868 		}
869 
870 		(void) mutex_init(&new_cpu_list[i].mt_parent_lock,
871 		    USYNC_THREAD, NULL);
872 
873 		/* get the correct cache list alignment */
874 		list_addr += CACHELIST_SIZE;
875 	}
876 
877 	/*
878 	 * Initialize oversize listhead
879 	 */
880 	oversize_list.next_bysize = &oversize_list;
881 	oversize_list.prev_bysize = &oversize_list;
882 	oversize_list.next_byaddr = &oversize_list;
883 	oversize_list.prev_byaddr = &oversize_list;
884 	oversize_list.addr = NULL;
885 	oversize_list.size = 0;		/* sentinal */
886 
887 	/*
888 	 * Now install the global variables.
889 	 */
890 	curcpu = new_curcpu;
891 	cpu_mask = new_cpu_mask;
892 	cpu_list = new_cpu_list;
893 }
894 
895 static void
896 create_cache(cache_t *cp, size_t size, uint_t chunksize)
897 {
898 	long nblocks;
899 
900 	(void) mutex_init(&cp->mt_cache_lock, USYNC_THREAD, NULL);
901 	cp->mt_size = size;
902 	cp->mt_freelist = ((caddr_t)cp + sizeof (cache_t));
903 	cp->mt_span = chunksize * HUNKSIZE - sizeof (cache_t);
904 	cp->mt_hunks = chunksize;
905 	/*
906 	 * rough calculation. We will need to adjust later.
907 	 */
908 	nblocks = cp->mt_span / cp->mt_size;
909 	nblocks >>= 3;
910 	if (nblocks == 0) { /* less than 8 free blocks in this pool */
911 		int32_t numblocks = 0;
912 		long i = cp->mt_span;
913 		size_t sub = cp->mt_size;
914 		uchar_t mask = 0;
915 
916 		while (i > sub) {
917 			numblocks++;
918 			i -= sub;
919 		}
920 		nblocks = numblocks;
921 		cp->mt_arena = (caddr_t)ALIGN(cp->mt_freelist + 8, 8);
922 		cp->mt_nfree = numblocks;
923 		while (numblocks--) {
924 			mask |= 0x80 >> numblocks;
925 		}
926 		*(cp->mt_freelist) = mask;
927 	} else {
928 		cp->mt_arena = (caddr_t)ALIGN((caddr_t)cp->mt_freelist +
929 		    nblocks, 32);
930 		/* recompute nblocks */
931 		nblocks = (uintptr_t)((caddr_t)cp->mt_freelist +
932 		    cp->mt_span - cp->mt_arena) / cp->mt_size;
933 		cp->mt_nfree = ((nblocks >> 3) << 3);
934 		/* Set everything to free */
935 		(void) memset(cp->mt_freelist, 0xff, nblocks >> 3);
936 	}
937 
938 	if (debugopt & MTDEBUGPATTERN)
939 		copy_pattern(FREEPATTERN, cp->mt_arena, cp->mt_size * nblocks);
940 
941 	cp->mt_next = NULL;
942 }
943 
944 static void
945 reinit_cpu_list(void)
946 {
947 	oversize_t *wp = oversize_list.next_bysize;
948 	percpu_t *cpuptr;
949 	cache_t *thiscache;
950 	cache_head_t *cachehead;
951 
952 	/* Reinitialize free oversize blocks. */
953 	(void) mutex_lock(&oversize_lock);
954 	if (debugopt & MTDEBUGPATTERN)
955 		for (; wp != &oversize_list; wp = wp->next_bysize)
956 			copy_pattern(FREEPATTERN, wp->addr, wp->size);
957 	(void) mutex_unlock(&oversize_lock);
958 
959 	/* Reinitialize free blocks. */
960 	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
961 		(void) mutex_lock(&cpuptr->mt_parent_lock);
962 		for (cachehead = &cpuptr->mt_caches[0]; cachehead <
963 		    &cpuptr->mt_caches[NUM_CACHES]; cachehead++) {
964 			for (thiscache = cachehead->mt_cache; thiscache != NULL;
965 			    thiscache = thiscache->mt_next) {
966 				(void) mutex_lock(&thiscache->mt_cache_lock);
967 				if (thiscache->mt_nfree == 0) {
968 					(void) mutex_unlock(
969 					    &thiscache->mt_cache_lock);
970 					continue;
971 				}
972 				if (thiscache != NULL)
973 					reinit_cache(thiscache);
974 				(void) mutex_unlock(&thiscache->mt_cache_lock);
975 			}
976 		}
977 		(void) mutex_unlock(&cpuptr->mt_parent_lock);
978 	}
979 	reinit = 0;
980 }
981 
982 static void
983 reinit_cache(cache_t *thiscache)
984 {
985 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
986 	int32_t i, n;
987 	caddr_t ret;
988 
989 	freeblocks = (uint32_t *)thiscache->mt_freelist;
990 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
991 		if (*freeblocks & 0xffffffff) {
992 			for (i = 0; i < 32; i++) {
993 				if (FLIP_EM(*freeblocks) & (0x80000000 >> i)) {
994 					n = (uintptr_t)(((freeblocks -
995 					    (uint32_t *)thiscache->mt_freelist)
996 					    << 5) + i) * thiscache->mt_size;
997 					ret = thiscache->mt_arena + n;
998 					ret += OVERHEAD;
999 					copy_pattern(FREEPATTERN, ret,
1000 					    thiscache->mt_size);
1001 				}
1002 			}
1003 		}
1004 		freeblocks++;
1005 	}
1006 }
1007 
1008 static void *
1009 malloc_internal(size_t size, percpu_t *cpuptr)
1010 {
1011 	cache_head_t *cachehead;
1012 	cache_t *thiscache, *hintcache;
1013 	int32_t i, n, logsz, bucket;
1014 	uint32_t index;
1015 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
1016 	caddr_t ret;
1017 
1018 	logsz = MIN_CACHED_SHIFT;
1019 
1020 	while (size > (1 << logsz))
1021 		logsz++;
1022 
1023 	bucket = logsz - MIN_CACHED_SHIFT;
1024 
1025 	(void) mutex_lock(&cpuptr->mt_parent_lock);
1026 
1027 	/*
1028 	 * Find a cache of the appropriate size with free buffers.
1029 	 *
1030 	 * We don't need to lock each cache as we check their mt_nfree count,
1031 	 * since:
1032 	 *	1.  We are only looking for caches with mt_nfree > 0.  If a
1033 	 *	   free happens during our search, it will increment mt_nfree,
1034 	 *	   which will not effect the test.
1035 	 *	2.  Allocations can decrement mt_nfree, but they can't happen
1036 	 *	   as long as we hold mt_parent_lock.
1037 	 */
1038 
1039 	cachehead = &cpuptr->mt_caches[bucket];
1040 
1041 	/* Search through the list, starting at the mt_hint */
1042 	thiscache = cachehead->mt_hint;
1043 
1044 	while (thiscache != NULL && thiscache->mt_nfree == 0)
1045 		thiscache = thiscache->mt_next;
1046 
1047 	if (thiscache == NULL) {
1048 		/* wrap around -- search up to the hint */
1049 		thiscache = cachehead->mt_cache;
1050 		hintcache = cachehead->mt_hint;
1051 
1052 		while (thiscache != NULL && thiscache != hintcache &&
1053 		    thiscache->mt_nfree == 0)
1054 			thiscache = thiscache->mt_next;
1055 
1056 		if (thiscache == hintcache)
1057 			thiscache = NULL;
1058 	}
1059 
1060 
1061 	if (thiscache == NULL) { /* there are no free caches */
1062 		int32_t thisrequest = requestsize;
1063 		int32_t buffer_size = (1 << logsz) + OVERHEAD;
1064 
1065 		thiscache = (cache_t *)morecore(thisrequest * HUNKSIZE);
1066 
1067 		if (thiscache == (cache_t *)-1) {
1068 			(void) mutex_unlock(&cpuptr->mt_parent_lock);
1069 			errno = EAGAIN;
1070 			return (NULL);
1071 		}
1072 		create_cache(thiscache, buffer_size, thisrequest);
1073 
1074 		/* link in the new block at the beginning of the list */
1075 		thiscache->mt_next = cachehead->mt_cache;
1076 		cachehead->mt_cache = thiscache;
1077 	}
1078 
1079 	/* update the hint to the cache we found or created */
1080 	cachehead->mt_hint = thiscache;
1081 
1082 	/* thiscache now points to a cache with available space */
1083 	(void) mutex_lock(&thiscache->mt_cache_lock);
1084 
1085 	freeblocks = (uint32_t *)thiscache->mt_freelist;
1086 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
1087 		if (*freeblocks & 0xffffffff)
1088 			break;
1089 		freeblocks++;
1090 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1091 		    *freeblocks & 0xffffffff)
1092 			break;
1093 		freeblocks++;
1094 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1095 		    *freeblocks & 0xffffffff)
1096 			break;
1097 		freeblocks++;
1098 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1099 		    *freeblocks & 0xffffffff)
1100 			break;
1101 		freeblocks++;
1102 	}
1103 
1104 	/*
1105 	 * the offset from mt_freelist to freeblocks is the offset into
1106 	 * the arena. Be sure to include the offset into freeblocks
1107 	 * of the bitmask. n is the offset.
1108 	 */
1109 	for (i = 0; i < 32; ) {
1110 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1111 			break;
1112 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1113 			break;
1114 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1115 			break;
1116 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1117 			break;
1118 	}
1119 	index = 0x80000000 >> --i;
1120 
1121 
1122 	*freeblocks &= FLIP_EM(~index);
1123 
1124 	thiscache->mt_nfree--;
1125 
1126 	(void) mutex_unlock(&thiscache->mt_cache_lock);
1127 	(void) mutex_unlock(&cpuptr->mt_parent_lock);
1128 
1129 	n = (uintptr_t)(((freeblocks - (uint32_t *)thiscache->mt_freelist) << 5)
1130 	    + i) * thiscache->mt_size;
1131 	/*
1132 	 * Now you have the offset in n, you've changed the free mask
1133 	 * in the freelist. Nothing left to do but find the block
1134 	 * in the arena and put the value of thiscache in the word
1135 	 * ahead of the handed out address and return the memory
1136 	 * back to the user.
1137 	 */
1138 	ret = thiscache->mt_arena + n;
1139 
1140 	/* Store the cache addr for this buf. Makes free go fast. */
1141 	*(uintptr_t *)ret = (uintptr_t)thiscache;
1142 
1143 	/*
1144 	 * This assert makes sure we don't hand out memory that is not
1145 	 * owned by this cache.
1146 	 */
1147 	assert(ret + thiscache->mt_size <= thiscache->mt_freelist +
1148 	    thiscache->mt_span);
1149 
1150 	ret += OVERHEAD;
1151 
1152 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1153 
1154 	if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1155 		if (verify_pattern(FREEPATTERN, ret, size))
1156 			abort();	/* reference after free */
1157 
1158 	if (debugopt & MTINITBUFFER)
1159 		copy_pattern(INITPATTERN, ret, size);
1160 	return ((void *)ret);
1161 }
1162 
1163 static void *
1164 morecore(size_t bytes)
1165 {
1166 	void * ret;
1167 
1168 	if (bytes > LONG_MAX) {
1169 		intptr_t wad;
1170 		/*
1171 		 * The request size is too big. We need to do this in
1172 		 * chunks. Sbrk only takes an int for an arg.
1173 		 */
1174 		if (bytes == ULONG_MAX)
1175 			return ((void *)-1);
1176 
1177 		ret = sbrk(0);
1178 		wad = LONG_MAX;
1179 		while (wad > 0) {
1180 			if (sbrk(wad) == (void *)-1) {
1181 				if (ret != sbrk(0))
1182 					(void) sbrk(-LONG_MAX);
1183 				return ((void *)-1);
1184 			}
1185 			bytes -= LONG_MAX;
1186 			wad = bytes;
1187 		}
1188 	} else
1189 		ret = sbrk(bytes);
1190 
1191 	return (ret);
1192 }
1193 
1194 
1195 static void *
1196 oversize(size_t size)
1197 {
1198 	caddr_t ret;
1199 	oversize_t *big;
1200 
1201 	/* make sure we will not overflow */
1202 	if (size > MAX_MTMALLOC) {
1203 		errno = ENOMEM;
1204 		return (NULL);
1205 	}
1206 
1207 	/*
1208 	 * Since we ensure every address we hand back is
1209 	 * MTMALLOC_MIN_ALIGN-byte aligned, ALIGNing size ensures that the
1210 	 * memory handed out is MTMALLOC_MIN_ALIGN-byte aligned at both ends.
1211 	 * This eases the implementation of MTDEBUGPATTERN and MTINITPATTERN,
1212 	 * particularly where coalescing occurs.
1213 	 */
1214 	size = ALIGN(size, MTMALLOC_MIN_ALIGN);
1215 
1216 	/*
1217 	 * The idea with the global lock is that we are sure to
1218 	 * block in the kernel anyway since given an oversize alloc
1219 	 * we are sure to have to call morecore();
1220 	 */
1221 	(void) mutex_lock(&oversize_lock);
1222 
1223 	if ((big = find_oversize(size)) != NULL) {
1224 		if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1225 			if (verify_pattern(FREEPATTERN, big->addr, size))
1226 				abort();	/* reference after free */
1227 	} else {
1228 		/* Get more 8-byte aligned memory from heap */
1229 		ret = morecore(size + OVSZ_HEADER_SIZE);
1230 		if (ret == (caddr_t)-1) {
1231 			(void) mutex_unlock(&oversize_lock);
1232 			errno = ENOMEM;
1233 			return (NULL);
1234 		}
1235 		big = oversize_header_alloc((uintptr_t)ret, size);
1236 	}
1237 	ret = big->addr;
1238 
1239 	insert_hash(big);
1240 
1241 	if (debugopt & MTINITBUFFER)
1242 		copy_pattern(INITPATTERN, ret, size);
1243 
1244 	(void) mutex_unlock(&oversize_lock);
1245 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1246 	return ((void *)ret);
1247 }
1248 
1249 static void
1250 insert_oversize(oversize_t *op, oversize_t *nx)
1251 {
1252 	oversize_t *sp;
1253 
1254 	/* locate correct insertion point in size-ordered list */
1255 	for (sp = oversize_list.next_bysize;
1256 	    sp != &oversize_list && (op->size > sp->size);
1257 	    sp = sp->next_bysize)
1258 		;
1259 
1260 	/* link into size-ordered list */
1261 	op->next_bysize = sp;
1262 	op->prev_bysize = sp->prev_bysize;
1263 	op->prev_bysize->next_bysize = op;
1264 	op->next_bysize->prev_bysize = op;
1265 
1266 	/*
1267 	 * link item into address-ordered list
1268 	 * (caller provides insertion point as an optimization)
1269 	 */
1270 	op->next_byaddr = nx;
1271 	op->prev_byaddr = nx->prev_byaddr;
1272 	op->prev_byaddr->next_byaddr = op;
1273 	op->next_byaddr->prev_byaddr = op;
1274 
1275 }
1276 
1277 static void
1278 unlink_oversize(oversize_t *lp)
1279 {
1280 	/* unlink from address list */
1281 	lp->prev_byaddr->next_byaddr = lp->next_byaddr;
1282 	lp->next_byaddr->prev_byaddr = lp->prev_byaddr;
1283 
1284 	/* unlink from size list */
1285 	lp->prev_bysize->next_bysize = lp->next_bysize;
1286 	lp->next_bysize->prev_bysize = lp->prev_bysize;
1287 }
1288 
1289 static void
1290 position_oversize_by_size(oversize_t *op)
1291 {
1292 	oversize_t *sp;
1293 
1294 	if (op->size > op->next_bysize->size ||
1295 	    op->size < op->prev_bysize->size) {
1296 
1297 		/* unlink from size list */
1298 		op->prev_bysize->next_bysize = op->next_bysize;
1299 		op->next_bysize->prev_bysize = op->prev_bysize;
1300 
1301 		/* locate correct insertion point in size-ordered list */
1302 		for (sp = oversize_list.next_bysize;
1303 		    sp != &oversize_list && (op->size > sp->size);
1304 		    sp = sp->next_bysize)
1305 			;
1306 
1307 		/* link into size-ordered list */
1308 		op->next_bysize = sp;
1309 		op->prev_bysize = sp->prev_bysize;
1310 		op->prev_bysize->next_bysize = op;
1311 		op->next_bysize->prev_bysize = op;
1312 	}
1313 }
1314 
1315 static void
1316 add_oversize(oversize_t *lp)
1317 {
1318 	int merge_flags = INSERT_ONLY;
1319 	oversize_t *nx;  	/* ptr to item right of insertion point */
1320 	oversize_t *pv;  	/* ptr to item left of insertion point */
1321 	uint_t size_lp, size_pv, size_nx;
1322 	uintptr_t endp_lp, endp_pv, endp_nx;
1323 
1324 	/*
1325 	 * Locate insertion point in address-ordered list
1326 	 */
1327 
1328 	for (nx = oversize_list.next_byaddr;
1329 	    nx != &oversize_list && (lp->addr > nx->addr);
1330 	    nx = nx->next_byaddr)
1331 		;
1332 
1333 	/*
1334 	 * Determine how to add chunk to oversize freelist
1335 	 */
1336 
1337 	size_lp = OVSZ_HEADER_SIZE + lp->size;
1338 	endp_lp = ALIGN((uintptr_t)lp + size_lp, MTMALLOC_MIN_ALIGN);
1339 	size_lp = endp_lp - (uintptr_t)lp;
1340 
1341 	pv = nx->prev_byaddr;
1342 
1343 	if (pv->size) {
1344 
1345 		size_pv = OVSZ_HEADER_SIZE + pv->size;
1346 		endp_pv = ALIGN((uintptr_t)pv + size_pv,
1347 		    MTMALLOC_MIN_ALIGN);
1348 		size_pv = endp_pv - (uintptr_t)pv;
1349 
1350 		/* Check for adjacency with left chunk */
1351 		if ((uintptr_t)lp == endp_pv)
1352 			merge_flags |= COALESCE_LEFT;
1353 	}
1354 
1355 	if (nx->size) {
1356 
1357 		/* Check for adjacency with right chunk */
1358 		if ((uintptr_t)nx == endp_lp) {
1359 			size_nx = OVSZ_HEADER_SIZE + nx->size;
1360 			endp_nx = ALIGN((uintptr_t)nx + size_nx,
1361 			    MTMALLOC_MIN_ALIGN);
1362 			size_nx = endp_nx - (uintptr_t)nx;
1363 			merge_flags |= COALESCE_RIGHT;
1364 		}
1365 	}
1366 
1367 	/*
1368 	 * If MTDEBUGPATTERN==1, lp->addr will have been overwritten with
1369 	 * FREEPATTERN for lp->size bytes. If we can merge, the oversize
1370 	 * header(s) that will also become part of the memory available for
1371 	 * reallocation (ie lp and/or nx) must also be overwritten with
1372 	 * FREEPATTERN or we will SIGABRT when this memory is next reallocated.
1373 	 */
1374 	switch (merge_flags) {
1375 
1376 	case INSERT_ONLY:		/* Coalescing not possible */
1377 		insert_oversize(lp, nx);
1378 		break;
1379 	case COALESCE_LEFT:
1380 		pv->size += size_lp;
1381 		position_oversize_by_size(pv);
1382 		if (debugopt & MTDEBUGPATTERN)
1383 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1384 		break;
1385 	case COALESCE_RIGHT:
1386 		unlink_oversize(nx);
1387 		lp->size += size_nx;
1388 		insert_oversize(lp, pv->next_byaddr);
1389 		if (debugopt & MTDEBUGPATTERN)
1390 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1391 		break;
1392 	case COALESCE_WITH_BOTH_SIDES:	/* Merge (with right) to the left */
1393 		pv->size += size_lp + size_nx;
1394 		unlink_oversize(nx);
1395 		position_oversize_by_size(pv);
1396 		if (debugopt & MTDEBUGPATTERN) {
1397 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1398 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1399 		}
1400 		break;
1401 	}
1402 }
1403 
1404 /*
1405  * Find memory on our list that is at least size big. If we find a block that is
1406  * big enough, we break it up and return the associated oversize_t struct back
1407  * to the calling client. Any leftover piece of that block is returned to the
1408  * freelist.
1409  */
1410 static oversize_t *
1411 find_oversize(size_t size)
1412 {
1413 	oversize_t *wp = oversize_list.next_bysize;
1414 	while (wp != &oversize_list && size > wp->size)
1415 		wp = wp->next_bysize;
1416 
1417 	if (wp == &oversize_list) /* empty list or nothing big enough */
1418 		return (NULL);
1419 	/* breaking up a chunk of memory */
1420 	if ((long)((wp->size - (size + OVSZ_HEADER_SIZE + MTMALLOC_MIN_ALIGN)))
1421 	    > MAX_CACHED) {
1422 		caddr_t off;
1423 		oversize_t *np;
1424 		size_t osize;
1425 		off = (caddr_t)ALIGN(wp->addr + size,
1426 		    MTMALLOC_MIN_ALIGN);
1427 		osize = wp->size;
1428 		wp->size = (size_t)(off - wp->addr);
1429 		np = oversize_header_alloc((uintptr_t)off,
1430 		    osize - (wp->size + OVSZ_HEADER_SIZE));
1431 		if ((long)np->size < 0)
1432 			abort();
1433 		unlink_oversize(wp);
1434 		add_oversize(np);
1435 	} else {
1436 		unlink_oversize(wp);
1437 	}
1438 	return (wp);
1439 }
1440 
1441 static void
1442 copy_pattern(uint32_t pattern, void *buf_arg, size_t size)
1443 {
1444 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1445 	uint32_t *buf = buf_arg;
1446 
1447 	while (buf < bufend - 3) {
1448 		buf[3] = buf[2] = buf[1] = buf[0] = pattern;
1449 		buf += 4;
1450 	}
1451 	while (buf < bufend)
1452 		*buf++ = pattern;
1453 }
1454 
1455 static void *
1456 verify_pattern(uint32_t pattern, void *buf_arg, size_t size)
1457 {
1458 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1459 	uint32_t *buf;
1460 
1461 	for (buf = buf_arg; buf < bufend; buf++)
1462 		if (*buf != pattern)
1463 			return (buf);
1464 	return (NULL);
1465 }
1466 
1467 static void
1468 free_oversize(oversize_t *ovp)
1469 {
1470 	assert(((uintptr_t)ovp->addr & 7) == 0); /* are we 8 byte aligned */
1471 	assert(ovp->size > MAX_CACHED);
1472 
1473 	ovp->next_bysize = ovp->prev_bysize = NULL;
1474 	ovp->next_byaddr = ovp->prev_byaddr = NULL;
1475 	(void) mutex_lock(&oversize_lock);
1476 	add_oversize(ovp);
1477 	(void) mutex_unlock(&oversize_lock);
1478 }
1479 
1480 static oversize_t *
1481 oversize_header_alloc(uintptr_t mem, size_t size)
1482 {
1483 	oversize_t *ovsz_hdr;
1484 
1485 	assert(size > MAX_CACHED);
1486 
1487 	ovsz_hdr = (oversize_t *)mem;
1488 	ovsz_hdr->prev_bysize = NULL;
1489 	ovsz_hdr->next_bysize = NULL;
1490 	ovsz_hdr->prev_byaddr = NULL;
1491 	ovsz_hdr->next_byaddr = NULL;
1492 	ovsz_hdr->hash_next = NULL;
1493 	ovsz_hdr->size = size;
1494 	mem += OVSZ_SIZE;
1495 	*(uintptr_t *)mem = MTMALLOC_OVERSIZE_MAGIC;
1496 	mem += OVERHEAD;
1497 	assert(((uintptr_t)mem & 7) == 0); /* are we 8 byte aligned */
1498 	ovsz_hdr->addr = (caddr_t)mem;
1499 	return (ovsz_hdr);
1500 }
1501 
1502 static void
1503 malloc_prepare()
1504 {
1505 	percpu_t *cpuptr;
1506 	cache_head_t *cachehead;
1507 	cache_t *thiscache;
1508 
1509 	(void) mutex_lock(&oversize_lock);
1510 	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
1511 		(void) mutex_lock(&cpuptr->mt_parent_lock);
1512 		for (cachehead = &cpuptr->mt_caches[0];
1513 		    cachehead < &cpuptr->mt_caches[NUM_CACHES];
1514 		    cachehead++) {
1515 			for (thiscache = cachehead->mt_cache;
1516 			    thiscache != NULL;
1517 			    thiscache = thiscache->mt_next) {
1518 				(void) mutex_lock(
1519 				    &thiscache->mt_cache_lock);
1520 			}
1521 		}
1522 	}
1523 }
1524 
1525 static void
1526 malloc_release()
1527 {
1528 	percpu_t *cpuptr;
1529 	cache_head_t *cachehead;
1530 	cache_t *thiscache;
1531 
1532 	for (cpuptr = &cpu_list[ncpus - 1]; cpuptr >= &cpu_list[0]; cpuptr--) {
1533 		for (cachehead = &cpuptr->mt_caches[NUM_CACHES - 1];
1534 		    cachehead >= &cpuptr->mt_caches[0];
1535 		    cachehead--) {
1536 			for (thiscache = cachehead->mt_cache;
1537 			    thiscache != NULL;
1538 			    thiscache = thiscache->mt_next) {
1539 				(void) mutex_unlock(
1540 				    &thiscache->mt_cache_lock);
1541 			}
1542 		}
1543 		(void) mutex_unlock(&cpuptr->mt_parent_lock);
1544 	}
1545 	(void) mutex_unlock(&oversize_lock);
1546 }
1547 
1548 #pragma init(malloc_init)
1549 static void
1550 malloc_init(void)
1551 {
1552 	/*
1553 	 * This works in the init section for this library
1554 	 * because setup_caches() doesn't call anything in libc
1555 	 * that calls malloc().  If it did, disaster would ensue.
1556 	 *
1557 	 * For this to work properly, this library must be the first
1558 	 * one to have its init section called (after libc) by the
1559 	 * dynamic linker.  If some other library's init section
1560 	 * ran first and called malloc(), disaster would ensue.
1561 	 * Because this is an interposer library for malloc(), the
1562 	 * dynamic linker arranges for its init section to run first.
1563 	 */
1564 	(void) setup_caches();
1565 
1566 	(void) pthread_atfork(malloc_prepare, malloc_release, malloc_release);
1567 }
1568