xref: /titanic_52/usr/src/lib/libmtmalloc/common/mtmalloc.c (revision 65d61b8c5bb0da98e137e8d74245d8f53d690237)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <c_synonyms.h>
30 #include <mtmalloc.h>
31 #include "mtmalloc_impl.h"
32 #include <unistd.h>
33 #include <synch.h>
34 #include <thread.h>
35 #include <pthread.h>
36 #include <stdio.h>
37 #include <limits.h>
38 #include <errno.h>
39 #include <string.h>
40 #include <strings.h>
41 #include <sys/param.h>
42 #include <sys/sysmacros.h>
43 
44 /*
45  * To turn on the asserts just compile -DDEBUG
46  */
47 
48 #ifndef	DEBUG
49 #define	NDEBUG
50 #endif
51 
52 #include <assert.h>
53 
54 /*
55  * The MT hot malloc implementation contained herein is designed to be
56  * plug-compatible with the libc version of malloc. It is not intended
57  * to replace that implementation until we decide that it is ok to break
58  * customer apps (Solaris 3.0).
59  *
60  * For requests up to 2^^16, the allocator initializes itself into NCPUS
61  * worth of chains of caches. When a memory request is made, the calling thread
62  * is vectored into one of NCPUS worth of caches.  The LWP id gives us a cheap,
63  * contention-reducing index to use, eventually, this should be replaced with
64  * the actual CPU sequence number, when an interface to get it is available.
65  *
66  * Once the thread is vectored into one of the list of caches the real
67  * allocation of the memory begins. The size is determined to figure out which
68  * bucket the allocation should be satisfied from. The management of free
69  * buckets is done via a bitmask. A free bucket is represented by a 1. The
70  * first free bit represents the first free bucket. The position of the bit,
71  * represents the position of the bucket in the arena.
72  *
73  * When the memory from the arena is handed out, the address of the cache
74  * control structure is written in the word preceeding the returned memory.
75  * This cache control address is used during free() to mark the buffer free
76  * in the cache control structure.
77  *
78  * When all available memory in a cache has been depleted, a new chunk of memory
79  * is allocated via sbrk(). The new cache is allocated from this chunk of memory
80  * and initialized in the function create_cache(). New caches are installed at
81  * the front of a singly linked list of the same size memory pools. This helps
82  * to ensure that there will tend to be available memory in the beginning of the
83  * list.
84  *
85  * Long linked lists hurt performance. To decrease this effect, there is a
86  * tunable, requestsize, that bumps up the sbrk allocation size and thus
87  * increases the number of available blocks within an arena.  We also keep
88  * a "hint" for each cache list, which is the last cache in the list allocated
89  * from.  This lowers the cost of searching if there are a lot of fully
90  * allocated blocks at the front of the list.
91  *
92  * For requests greater than 2^^16 (oversize allocations), there are two pieces
93  * of overhead. There is the OVERHEAD used to hold the cache addr
94  * (&oversize_list), plus an oversize_t structure to further describe the block.
95  *
96  * The oversize list is kept as defragmented as possible by coalescing
97  * freed oversized allocations with adjacent neighbors.
98  *
99  * Addresses handed out are stored in a hash table, and are aligned on
100  * MTMALLOC_MIN_ALIGN-byte boundaries at both ends. Request sizes are rounded-up
101  * where necessary in order to achieve this. This eases the implementation of
102  * MTDEBUGPATTERN and MTINITPATTERN, particularly where coalescing occurs.
103  *
104  * A memalign allocation takes memalign header overhead.  There's two
105  * types of memalign headers distinguished by MTMALLOC_MEMALIGN_MAGIC
106  * and MTMALLOC_MEMALIGN_MIN_MAGIC.  When the size of memory taken to
107  * get to the aligned address from malloc'ed address is the minimum size
108  * OVERHEAD, we create a header taking only one OVERHEAD space with magic
109  * number MTMALLOC_MEMALIGN_MIN_MAGIC, and we know by subtracting OVERHEAD
110  * from memaligned address, we can get to the malloc'ed address. Otherwise,
111  * we create a memalign header taking two OVERHEAD space, one stores
112  * MTMALLOC_MEMALIGN_MAGIC magic number, the other one points back to the
113  * malloc'ed address.
114  */
115 
116 #if defined(__i386) || defined(__amd64)
117 #include <arpa/inet.h>	/* for htonl() */
118 #endif
119 
120 static void * morecore(size_t);
121 static void create_cache(cache_t *, size_t bufsize, uint_t hunks);
122 static void * malloc_internal(size_t, percpu_t *);
123 static void * oversize(size_t);
124 static oversize_t *find_oversize(size_t);
125 static void add_oversize(oversize_t *);
126 static void copy_pattern(uint32_t, void *, size_t);
127 static void * verify_pattern(uint32_t, void *, size_t);
128 static void reinit_cpu_list(void);
129 static void reinit_cache(cache_t *);
130 static void free_oversize(oversize_t *);
131 static oversize_t *oversize_header_alloc(uintptr_t, size_t);
132 
133 /*
134  * oversize hash table stuff
135  */
136 #define	NUM_BUCKETS	67	/* must be prime */
137 #define	HASH_OVERSIZE(caddr)	((uintptr_t)(caddr) % NUM_BUCKETS)
138 oversize_t *ovsz_hashtab[NUM_BUCKETS];
139 
140 #define	ALIGN(x, a)	((((uintptr_t)(x) + ((uintptr_t)(a) - 1)) \
141 			& ~((uintptr_t)(a) - 1)))
142 
143 /* need this to deal with little endianess of x86 */
144 #if defined(__i386) || defined(__amd64)
145 #define	FLIP_EM(x)	htonl((x))
146 #else
147 #define	FLIP_EM(x)	(x)
148 #endif
149 
150 #define	INSERT_ONLY			0
151 #define	COALESCE_LEFT			0x00000001
152 #define	COALESCE_RIGHT			0x00000002
153 #define	COALESCE_WITH_BOTH_SIDES	(COALESCE_LEFT | COALESCE_RIGHT)
154 
155 #define	OVERHEAD	8	/* size needed to write cache addr */
156 #define	HUNKSIZE	8192	/* just a multiplier */
157 
158 #define	MAX_CACHED_SHIFT	16	/* 64K is the max cached size */
159 #define	MAX_CACHED		(1 << MAX_CACHED_SHIFT)
160 #define	MIN_CACHED_SHIFT	4	/* smaller requests rounded up */
161 #define	MTMALLOC_MIN_ALIGN	8	/* min guaranteed alignment */
162 
163 /* maximum size before overflow */
164 #define	MAX_MTMALLOC	(SIZE_MAX - (SIZE_MAX % MTMALLOC_MIN_ALIGN) \
165 			- OVSZ_HEADER_SIZE)
166 
167 #define	NUM_CACHES	(MAX_CACHED_SHIFT - MIN_CACHED_SHIFT + 1)
168 #define	CACHELIST_SIZE	ALIGN(NUM_CACHES * sizeof (cache_head_t), \
169     CACHE_COHERENCY_UNIT)
170 
171 #define	MINSIZE		9	/* for requestsize, tunable */
172 #define	MAXSIZE		256	/* arbitrary, big enough, for requestsize */
173 
174 #define	FREEPATTERN	0xdeadbeef /* debug fill pattern for free buf */
175 #define	INITPATTERN	0xbaddcafe /* debug fill pattern for new buf */
176 
177 #define	misaligned(p)	((unsigned)(p) & (sizeof (int) - 1))
178 #define	IS_OVERSIZE(x, y)	(((x) < (y)) && (((x) > MAX_CACHED)? 1 : 0))
179 
180 static long requestsize = MINSIZE; /* 9 pages per cache; tunable; 9 is min */
181 
182 static uint_t cpu_mask;
183 static curcpu_func curcpu;
184 
185 static int32_t debugopt;
186 static int32_t reinit;
187 
188 static percpu_t *cpu_list;
189 static oversize_t oversize_list;
190 static mutex_t oversize_lock = DEFAULTMUTEX;
191 
192 static int ncpus = 0;
193 
194 #define	MTMALLOC_OVERSIZE_MAGIC		((uintptr_t)&oversize_list)
195 #define	MTMALLOC_MEMALIGN_MAGIC		((uintptr_t)&oversize_list + 1)
196 #define	MTMALLOC_MEMALIGN_MIN_MAGIC	((uintptr_t)&oversize_list + 2)
197 
198 /*
199  * We require allocations handed out to be aligned on MTMALLOC_MIN_ALIGN-byte
200  * boundaries. We round up sizeof (oversize_t) (when necessary) to ensure that
201  * this is achieved.
202  */
203 #define	OVSZ_SIZE		(ALIGN(sizeof (oversize_t), MTMALLOC_MIN_ALIGN))
204 #define	OVSZ_HEADER_SIZE	(OVSZ_SIZE + OVERHEAD)
205 
206 /*
207  * memalign header takes 2 OVERHEAD space.  One for memalign magic, and the
208  * other one points back to the start address of originally allocated space.
209  */
210 #define	MEMALIGN_HEADER_SIZE	2 * OVERHEAD
211 #define	MEMALIGN_HEADER_ALLOC(x, shift, malloc_addr)\
212 	if (shift == OVERHEAD)\
213 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
214 			MTMALLOC_MEMALIGN_MIN_MAGIC; \
215 	else {\
216 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
217 			MTMALLOC_MEMALIGN_MAGIC; \
218 		*((uintptr_t *)((caddr_t)x - 2 * OVERHEAD)) = \
219 			(uintptr_t)malloc_addr; \
220 	}
221 
222 void *
223 malloc(size_t bytes)
224 {
225 	percpu_t *list_rotor;
226 	uint_t	list_index;
227 
228 	if (bytes > MAX_CACHED)
229 		return (oversize(bytes));
230 
231 	list_index = (curcpu() & cpu_mask);
232 
233 	list_rotor = &cpu_list[list_index];
234 
235 	return (malloc_internal(bytes, list_rotor));
236 }
237 
238 void *
239 realloc(void * ptr, size_t bytes)
240 {
241 	void *new, *data_ptr;
242 	cache_t *cacheptr;
243 	caddr_t mem;
244 	size_t shift = 0;
245 
246 	if (ptr == NULL)
247 		return (malloc(bytes));
248 
249 	if (bytes == 0) {
250 		free(ptr);
251 		return (NULL);
252 	}
253 
254 	data_ptr = ptr;
255 	mem = (caddr_t)ptr - OVERHEAD;
256 
257 	new = malloc(bytes);
258 
259 	if (new == NULL)
260 		return (NULL);
261 
262 	/*
263 	 * If new == ptr, ptr has previously been freed. Passing a freed pointer
264 	 * to realloc() is not allowed - unless the caller specifically states
265 	 * otherwise, in which case we must avoid freeing ptr (ie new) before we
266 	 * return new. There is (obviously) no requirement to memcpy() ptr to
267 	 * new before we return.
268 	 */
269 	if (new == ptr) {
270 		if (!(debugopt & MTDOUBLEFREE))
271 			abort();
272 		return (new);
273 	}
274 
275 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
276 		mem -= OVERHEAD;
277 		ptr = (void *)*(uintptr_t *)mem;
278 		mem = (caddr_t)ptr - OVERHEAD;
279 		shift = (size_t)((uintptr_t)data_ptr - (uintptr_t)ptr);
280 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
281 		ptr = (void *) mem;
282 		mem -= OVERHEAD;
283 		shift = OVERHEAD;
284 	}
285 
286 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
287 		oversize_t *old;
288 
289 		old = (oversize_t *)(mem - OVSZ_SIZE);
290 		(void) memcpy(new, data_ptr, MIN(bytes, old->size - shift));
291 		free(ptr);
292 		return (new);
293 	}
294 
295 	cacheptr = (cache_t *)*(uintptr_t *)mem;
296 
297 	(void) memcpy(new, data_ptr,
298 		MIN(cacheptr->mt_size - OVERHEAD - shift, bytes));
299 	free(ptr);
300 
301 	return (new);
302 }
303 
304 void *
305 calloc(size_t nelem, size_t bytes)
306 {
307 	void * ptr;
308 	size_t size = nelem * bytes;
309 
310 	ptr = malloc(size);
311 	if (ptr == NULL)
312 		return (NULL);
313 	(void) memset(ptr, 0, size);
314 
315 	return (ptr);
316 }
317 
318 void
319 free(void * ptr)
320 {
321 	cache_t *cacheptr;
322 	caddr_t mem;
323 	int32_t i;
324 	caddr_t freeblocks;
325 	uintptr_t offset;
326 	uchar_t mask;
327 	int32_t which_bit, num_bytes;
328 
329 	if (ptr == NULL)
330 		return;
331 
332 	mem = (caddr_t)ptr - OVERHEAD;
333 
334 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
335 		mem -= OVERHEAD;
336 		ptr = (void *)*(uintptr_t *)mem;
337 		mem = (caddr_t)ptr - OVERHEAD;
338 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
339 		ptr = (void *) mem;
340 		mem -= OVERHEAD;
341 	}
342 
343 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
344 		oversize_t *big, **opp;
345 		int bucket;
346 
347 		big = (oversize_t *)(mem - OVSZ_SIZE);
348 		(void) mutex_lock(&oversize_lock);
349 
350 		bucket = HASH_OVERSIZE(big->addr);
351 		for (opp = &ovsz_hashtab[bucket]; *opp != NULL;
352 		    opp = &(*opp)->hash_next)
353 			if (*opp == big)
354 				break;
355 
356 		if (*opp == NULL) {
357 			if (!(debugopt & MTDOUBLEFREE))
358 				abort();
359 			(void) mutex_unlock(&oversize_lock);
360 			return;
361 		}
362 
363 		*opp = big->hash_next;	/* remove big from the hash table */
364 		big->hash_next = NULL;
365 
366 		if (debugopt & MTDEBUGPATTERN)
367 			copy_pattern(FREEPATTERN, ptr, big->size);
368 		add_oversize(big);
369 		(void) mutex_unlock(&oversize_lock);
370 		return;
371 	}
372 
373 	cacheptr = (cache_t *)*(uintptr_t *)mem;
374 	freeblocks = cacheptr->mt_freelist;
375 
376 	/*
377 	 * This is the distance measured in bits into the arena.
378 	 * The value of offset is in bytes but there is a 1-1 correlation
379 	 * between distance into the arena and distance into the
380 	 * freelist bitmask.
381 	 */
382 	offset = mem - cacheptr->mt_arena;
383 
384 	/*
385 	 * i is total number of bits to offset into freelist bitmask.
386 	 */
387 
388 	i = offset / cacheptr->mt_size;
389 
390 	num_bytes = i >> 3;
391 
392 	/*
393 	 * which_bit is the bit offset into the byte in the freelist.
394 	 * if our freelist bitmask looks like 0xf3 and we are freeing
395 	 * block 5 (ie: the 6th block) our mask will be 0xf7 after
396 	 * the free. Things go left to right that's why the mask is 0x80
397 	 * and not 0x01.
398 	 */
399 	which_bit = i - (num_bytes << 3);
400 
401 	mask = 0x80 >> which_bit;
402 
403 	freeblocks += num_bytes;
404 
405 	if (debugopt & MTDEBUGPATTERN)
406 		copy_pattern(FREEPATTERN, ptr, cacheptr->mt_size - OVERHEAD);
407 
408 	(void) mutex_lock(&cacheptr->mt_cache_lock);
409 
410 	if (*freeblocks & mask) {
411 		if (!(debugopt & MTDOUBLEFREE))
412 			abort();
413 	} else {
414 		*freeblocks |= mask;
415 		cacheptr->mt_nfree++;
416 	}
417 
418 	(void) mutex_unlock(&cacheptr->mt_cache_lock);
419 }
420 
421 void *
422 memalign(size_t alignment, size_t size)
423 {
424 	size_t alloc_size;
425 	uintptr_t offset;
426 	void *alloc_buf;
427 	void *ret_buf;
428 
429 	if (size == 0 || alignment == 0 ||
430 		misaligned(alignment) ||
431 		(alignment & (alignment - 1)) != 0) {
432 		errno = EINVAL;
433 		return (NULL);
434 	}
435 
436 	/* <= MTMALLOC_MIN_ALIGN, malloc can provide directly */
437 	if (alignment <= MTMALLOC_MIN_ALIGN)
438 		return (malloc(size));
439 
440 	alloc_size = size + alignment - MTMALLOC_MIN_ALIGN;
441 
442 	if (alloc_size < size) { /* overflow */
443 		errno = ENOMEM;
444 		return (NULL);
445 	}
446 
447 	alloc_buf = malloc(alloc_size);
448 
449 	if (alloc_buf == NULL)
450 		/* malloc sets errno */
451 		return (NULL);
452 
453 	/*
454 	 * If alloc_size > MAX_CACHED, malloc() will have returned a multiple of
455 	 * MTMALLOC_MIN_ALIGN, having rounded-up alloc_size if necessary. Since
456 	 * we will use alloc_size to return the excess fragments to the free
457 	 * list, we also round-up alloc_size if necessary.
458 	 */
459 	if ((alloc_size > MAX_CACHED) &&
460 	    (alloc_size & (MTMALLOC_MIN_ALIGN - 1)))
461 		alloc_size = ALIGN(alloc_size, MTMALLOC_MIN_ALIGN);
462 
463 	if ((offset = (uintptr_t)alloc_buf & (alignment - 1)) == 0) {
464 		/* aligned correctly */
465 
466 		size_t frag_size = alloc_size -
467 			(size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
468 
469 		/*
470 		 * If the leftover piece of the memory > MAX_CACHED,
471 		 * split off the piece and return it back to the freelist.
472 		 */
473 		if (IS_OVERSIZE(frag_size, alloc_size)) {
474 			oversize_t *orig, *tail;
475 			uintptr_t taddr;
476 			size_t data_size;
477 			taddr = ALIGN((uintptr_t)alloc_buf + size,
478 					MTMALLOC_MIN_ALIGN);
479 			data_size = taddr - (uintptr_t)alloc_buf;
480 			orig = (oversize_t *)((uintptr_t)alloc_buf -
481 					OVSZ_HEADER_SIZE);
482 			frag_size = orig->size - data_size -
483 					OVSZ_HEADER_SIZE;
484 			orig->size = data_size;
485 			tail = oversize_header_alloc(taddr, frag_size);
486 			free_oversize(tail);
487 		}
488 		ret_buf = alloc_buf;
489 	} else {
490 		uchar_t	oversize_bits = 0;
491 		size_t	head_sz, data_sz, tail_sz;
492 		uintptr_t ret_addr, taddr, shift, tshift;
493 		oversize_t *orig, *tail;
494 		size_t tsize;
495 
496 		/* needs to be aligned */
497 		shift = alignment - offset;
498 
499 		assert(shift >= MTMALLOC_MIN_ALIGN);
500 
501 		ret_addr = ((uintptr_t)alloc_buf + shift);
502 		ret_buf = (void *)ret_addr;
503 
504 		if (alloc_size <= MAX_CACHED) {
505 			MEMALIGN_HEADER_ALLOC(ret_addr, shift, alloc_buf);
506 			return (ret_buf);
507 		}
508 
509 		/*
510 		 * Only check for the fragments when the memory is allocted
511 		 * from oversize_list.  Split off a fragment and return it
512 		 * to the oversize freelist when it's > MAX_CACHED.
513 		 */
514 
515 		head_sz = shift - MAX(MEMALIGN_HEADER_SIZE, OVSZ_HEADER_SIZE);
516 
517 		tail_sz = alloc_size -
518 			(shift + size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
519 
520 		oversize_bits |= IS_OVERSIZE(head_sz, alloc_size) |
521 				IS_OVERSIZE(size, alloc_size) << DATA_SHIFT |
522 				IS_OVERSIZE(tail_sz, alloc_size) << TAIL_SHIFT;
523 
524 		switch (oversize_bits) {
525 			case NONE_OVERSIZE:
526 			case DATA_OVERSIZE:
527 				MEMALIGN_HEADER_ALLOC(ret_addr, shift,
528 					alloc_buf);
529 				break;
530 			case HEAD_OVERSIZE:
531 				/*
532 				 * If we can extend data > MAX_CACHED and have
533 				 * head still > MAX_CACHED, we split head-end
534 				 * as the case of head-end and data oversized,
535 				 * otherwise just create memalign header.
536 				 */
537 				tsize = (shift + size) - (MAX_CACHED + 8 +
538 					MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
539 
540 				if (!IS_OVERSIZE(tsize, alloc_size)) {
541 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
542 						alloc_buf);
543 					break;
544 				} else {
545 					tsize += OVSZ_HEADER_SIZE;
546 					taddr = ALIGN((uintptr_t)alloc_buf +
547 						tsize, MTMALLOC_MIN_ALIGN);
548 					tshift = ret_addr - taddr;
549 					MEMALIGN_HEADER_ALLOC(ret_addr, tshift,
550 						taddr);
551 					ret_addr = taddr;
552 					shift = ret_addr - (uintptr_t)alloc_buf;
553 				}
554 				/* FALLTHROUGH */
555 			case HEAD_AND_DATA_OVERSIZE:
556 				/*
557 				 * Split off the head fragment and
558 				 * return it back to oversize freelist.
559 				 * Create oversize header for the piece
560 				 * of (data + tail fragment).
561 				 */
562 				orig = (oversize_t *)((uintptr_t)alloc_buf -
563 						OVSZ_HEADER_SIZE);
564 				(void) oversize_header_alloc(ret_addr -
565 						OVSZ_HEADER_SIZE,
566 						(orig->size - shift));
567 				orig->size = shift - OVSZ_HEADER_SIZE;
568 
569 				/* free up the head fragment */
570 				free_oversize(orig);
571 				break;
572 			case TAIL_OVERSIZE:
573 				/*
574 				 * If we can extend data > MAX_CACHED and have
575 				 * tail-end still > MAX_CACHED, we split tail
576 				 * end, otherwise just create memalign header.
577 				 */
578 				orig = (oversize_t *)((uintptr_t)alloc_buf -
579 						OVSZ_HEADER_SIZE);
580 				tsize =  orig->size - (MAX_CACHED + 8 +
581 					shift + OVSZ_HEADER_SIZE +
582 					MTMALLOC_MIN_ALIGN);
583 				if (!IS_OVERSIZE(tsize, alloc_size)) {
584 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
585 						alloc_buf);
586 					break;
587 				} else {
588 					size = MAX_CACHED + 8;
589 				}
590 				/* FALLTHROUGH */
591 			case DATA_AND_TAIL_OVERSIZE:
592 				/*
593 				 * Split off the tail fragment and
594 				 * return it back to oversize freelist.
595 				 * Create memalign header and adjust
596 				 * the size for the piece of
597 				 * (head fragment + data).
598 				 */
599 				taddr = ALIGN(ret_addr + size,
600 						MTMALLOC_MIN_ALIGN);
601 				data_sz = (size_t)(taddr -
602 						(uintptr_t)alloc_buf);
603 				orig = (oversize_t *)((uintptr_t)alloc_buf -
604 						OVSZ_HEADER_SIZE);
605 				tsize = orig->size - data_sz;
606 				orig->size = data_sz;
607 				MEMALIGN_HEADER_ALLOC(ret_buf, shift,
608 					alloc_buf);
609 				tsize -= OVSZ_HEADER_SIZE;
610 				tail = oversize_header_alloc(taddr,  tsize);
611 				free_oversize(tail);
612 				break;
613 			case HEAD_AND_TAIL_OVERSIZE:
614 				/*
615 				 * Split off the head fragment.
616 				 * We try to free up tail-end when we can
617 				 * extend data size to (MAX_CACHED + 8)
618 				 * and remain tail-end oversized.
619 				 * The bottom line is all split pieces
620 				 * should be oversize in size.
621 				 */
622 				orig = (oversize_t *)((uintptr_t)alloc_buf -
623 					OVSZ_HEADER_SIZE);
624 				tsize =  orig->size - (MAX_CACHED + 8 +
625 					OVSZ_HEADER_SIZE + shift +
626 					MTMALLOC_MIN_ALIGN);
627 
628 				if (!IS_OVERSIZE(tsize, alloc_size)) {
629 					/*
630 					 * If the chunk is not big enough
631 					 * to make both data and tail oversize
632 					 * we just keep them as one piece.
633 					 */
634 					(void) oversize_header_alloc(ret_addr -
635 						OVSZ_HEADER_SIZE,
636 						orig->size - shift);
637 					orig->size = shift -
638 						OVSZ_HEADER_SIZE;
639 					free_oversize(orig);
640 					break;
641 				} else {
642 					/*
643 					 * extend data size > MAX_CACHED
644 					 * and handle it as head, data, tail
645 					 * are all oversized.
646 					 */
647 					size = MAX_CACHED + 8;
648 				}
649 				/* FALLTHROUGH */
650 			case ALL_OVERSIZE:
651 				/*
652 				 * split off the head and tail fragments,
653 				 * return them back to the oversize freelist.
654 				 * Alloc oversize header for data seg.
655 				 */
656 				orig = (oversize_t *)((uintptr_t)alloc_buf -
657 					OVSZ_HEADER_SIZE);
658 				tsize = orig->size;
659 				orig->size = shift - OVSZ_HEADER_SIZE;
660 				free_oversize(orig);
661 
662 				taddr = ALIGN(ret_addr + size,
663 					MTMALLOC_MIN_ALIGN);
664 				data_sz = taddr - ret_addr;
665 				assert(tsize > (shift + data_sz +
666 					OVSZ_HEADER_SIZE));
667 				tail_sz = tsize -
668 					(shift + data_sz + OVSZ_HEADER_SIZE);
669 
670 				/* create oversize header for data seg */
671 				(void) oversize_header_alloc(ret_addr -
672 					OVSZ_HEADER_SIZE, data_sz);
673 
674 				/* create oversize header for tail fragment */
675 				tail = oversize_header_alloc(taddr, tail_sz);
676 				free_oversize(tail);
677 				break;
678 			default:
679 				/* should not reach here */
680 				assert(0);
681 		}
682 	}
683 	return (ret_buf);
684 }
685 
686 
687 void *
688 valloc(size_t size)
689 {
690 	static unsigned pagesize;
691 
692 	if (size == 0)
693 		return (NULL);
694 
695 	if (!pagesize)
696 		pagesize = sysconf(_SC_PAGESIZE);
697 
698 	return (memalign(pagesize, size));
699 }
700 
701 void
702 mallocctl(int cmd, long value)
703 {
704 	switch (cmd) {
705 
706 	case MTDEBUGPATTERN:
707 		/*
708 		 * Reinitialize free blocks in case malloc() is called prior
709 		 * to mallocctl().
710 		 */
711 		if (value && !(debugopt & cmd)) {
712 			reinit++;
713 			debugopt |= cmd;
714 			reinit_cpu_list();
715 		}
716 		/*FALLTHRU*/
717 	case MTDOUBLEFREE:
718 	case MTINITBUFFER:
719 		if (value)
720 			debugopt |= cmd;
721 		else
722 			debugopt &= ~cmd;
723 		break;
724 	case MTCHUNKSIZE:
725 		if (value >= MINSIZE && value <= MAXSIZE)
726 			requestsize = value;
727 		break;
728 	default:
729 		break;
730 	}
731 }
732 
733 /*
734  * Initialization function, called from the init section of the library.
735  * No locking is required here because we are single-threaded during
736  * library initialization.
737  */
738 static void
739 setup_caches(void)
740 {
741 	uintptr_t oldbrk;
742 	uintptr_t newbrk;
743 
744 	size_t cache_space_needed;
745 	size_t padding;
746 
747 	curcpu_func new_curcpu;
748 	uint_t new_cpu_mask;
749 	percpu_t *new_cpu_list;
750 
751 	uint_t i, j;
752 	uintptr_t list_addr;
753 
754 	/*
755 	 * Get a decent "current cpu identifier", to be used to reduce
756 	 * contention.  Eventually, this should be replaced by an interface
757 	 * to get the actual CPU sequence number in libthread/liblwp.
758 	 */
759 	new_curcpu = (curcpu_func)thr_self;
760 	if ((ncpus = 2 * sysconf(_SC_NPROCESSORS_CONF)) <= 0)
761 		ncpus = 4; /* decent default value */
762 
763 	/* round ncpus up to a power of 2 */
764 	while (ncpus & (ncpus - 1))
765 		ncpus++;
766 
767 	new_cpu_mask = ncpus - 1;	/* create the cpu mask */
768 
769 	/*
770 	 * We now do some magic with the brk.  What we want to get in the
771 	 * end is a bunch of well-aligned stuff in a big initial allocation.
772 	 * Along the way, we do sanity checks to make sure no one else has
773 	 * touched the brk (which shouldn't happen, but it's always good to
774 	 * check)
775 	 *
776 	 * First, make sure sbrk is sane, and store the current brk in oldbrk.
777 	 */
778 	oldbrk = (uintptr_t)sbrk(0);
779 	if ((void *)oldbrk == (void *)-1)
780 		abort();	/* sbrk is broken -- we're doomed. */
781 
782 	/*
783 	 * Now, align the brk to a multiple of CACHE_COHERENCY_UNIT, so that
784 	 * the percpu structures and cache lists will be properly aligned.
785 	 *
786 	 *   2.  All hunks will be page-aligned, assuming HUNKSIZE >= PAGESIZE,
787 	 *	so they can be paged out individually.
788 	 */
789 	newbrk = ALIGN(oldbrk, CACHE_COHERENCY_UNIT);
790 	if (newbrk != oldbrk && (uintptr_t)sbrk(newbrk - oldbrk) != oldbrk)
791 		abort();	/* sbrk is broken -- we're doomed. */
792 
793 	/*
794 	 * For each cpu, there is one percpu_t and a list of caches
795 	 */
796 	cache_space_needed = ncpus * (sizeof (percpu_t) + CACHELIST_SIZE);
797 
798 	new_cpu_list = (percpu_t *)sbrk(cache_space_needed);
799 
800 	if (new_cpu_list == (percpu_t *)-1 ||
801 	    (uintptr_t)new_cpu_list != newbrk)
802 		abort();	/* sbrk is broken -- we're doomed. */
803 
804 	/*
805 	 * Finally, align the brk to HUNKSIZE so that all hunks are
806 	 * page-aligned, to avoid edge-effects.
807 	 */
808 
809 	newbrk = (uintptr_t)new_cpu_list + cache_space_needed;
810 
811 	padding = ALIGN(newbrk, HUNKSIZE) - newbrk;
812 
813 	if (padding > 0 && (uintptr_t)sbrk(padding) != newbrk)
814 		abort();	/* sbrk is broken -- we're doomed. */
815 
816 	list_addr = ((uintptr_t)new_cpu_list + (sizeof (percpu_t) * ncpus));
817 
818 	/* initialize the percpu list */
819 	for (i = 0; i < ncpus; i++) {
820 		new_cpu_list[i].mt_caches = (cache_head_t *)list_addr;
821 		for (j = 0; j < NUM_CACHES; j++) {
822 			new_cpu_list[i].mt_caches[j].mt_cache = NULL;
823 			new_cpu_list[i].mt_caches[j].mt_hint = NULL;
824 		}
825 
826 		(void) mutex_init(&new_cpu_list[i].mt_parent_lock,
827 		    USYNC_THREAD, NULL);
828 
829 		/* get the correct cache list alignment */
830 		list_addr += CACHELIST_SIZE;
831 	}
832 
833 	/*
834 	 * Initialize oversize listhead
835 	 */
836 	oversize_list.next_bysize = &oversize_list;
837 	oversize_list.prev_bysize = &oversize_list;
838 	oversize_list.next_byaddr = &oversize_list;
839 	oversize_list.prev_byaddr = &oversize_list;
840 	oversize_list.addr = NULL;
841 	oversize_list.size = 0;		/* sentinal */
842 
843 	/*
844 	 * Now install the global variables.
845 	 */
846 	curcpu = new_curcpu;
847 	cpu_mask = new_cpu_mask;
848 	cpu_list = new_cpu_list;
849 }
850 
851 static void
852 create_cache(cache_t *cp, size_t size, uint_t chunksize)
853 {
854 	long nblocks;
855 
856 	(void) mutex_init(&cp->mt_cache_lock, USYNC_THREAD, NULL);
857 	cp->mt_size = size;
858 	cp->mt_freelist = ((caddr_t)cp + sizeof (cache_t));
859 	cp->mt_span = chunksize * HUNKSIZE - sizeof (cache_t);
860 	cp->mt_hunks = chunksize;
861 	/*
862 	 * rough calculation. We will need to adjust later.
863 	 */
864 	nblocks = cp->mt_span / cp->mt_size;
865 	nblocks >>= 3;
866 	if (nblocks == 0) { /* less than 8 free blocks in this pool */
867 		int32_t numblocks = 0;
868 		long i = cp->mt_span;
869 		size_t sub = cp->mt_size;
870 		uchar_t mask = 0;
871 
872 		while (i > sub) {
873 			numblocks++;
874 			i -= sub;
875 		}
876 		nblocks = numblocks;
877 		cp->mt_arena = (caddr_t)ALIGN(cp->mt_freelist + 8, 8);
878 		cp->mt_nfree = numblocks;
879 		while (numblocks--) {
880 			mask |= 0x80 >> numblocks;
881 		}
882 		*(cp->mt_freelist) = mask;
883 	} else {
884 		cp->mt_arena = (caddr_t)ALIGN((caddr_t)cp->mt_freelist +
885 			nblocks, 32);
886 		/* recompute nblocks */
887 		nblocks = (uintptr_t)((caddr_t)cp->mt_freelist +
888 			cp->mt_span - cp->mt_arena) / cp->mt_size;
889 		cp->mt_nfree = ((nblocks >> 3) << 3);
890 		/* Set everything to free */
891 		(void) memset(cp->mt_freelist, 0xff, nblocks >> 3);
892 	}
893 
894 	if (debugopt & MTDEBUGPATTERN)
895 		copy_pattern(FREEPATTERN, cp->mt_arena, cp->mt_size * nblocks);
896 
897 	cp->mt_next = NULL;
898 }
899 
900 static void
901 reinit_cpu_list(void)
902 {
903 	oversize_t *wp = oversize_list.next_bysize;
904 	percpu_t *cpuptr;
905 	cache_t *thiscache;
906 	cache_head_t *cachehead;
907 
908 	/* Reinitialize free oversize blocks. */
909 	(void) mutex_lock(&oversize_lock);
910 	if (debugopt & MTDEBUGPATTERN)
911 		for (; wp != &oversize_list; wp = wp->next_bysize)
912 			copy_pattern(FREEPATTERN, wp->addr, wp->size);
913 	(void) mutex_unlock(&oversize_lock);
914 
915 	/* Reinitialize free blocks. */
916 	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
917 		(void) mutex_lock(&cpuptr->mt_parent_lock);
918 		for (cachehead = &cpuptr->mt_caches[0]; cachehead <
919 			&cpuptr->mt_caches[NUM_CACHES]; cachehead++) {
920 			for (thiscache = cachehead->mt_cache; thiscache != NULL;
921 				thiscache = thiscache->mt_next) {
922 				(void) mutex_lock(&thiscache->mt_cache_lock);
923 				if (thiscache->mt_nfree == 0) {
924 					(void) mutex_unlock(
925 					    &thiscache->mt_cache_lock);
926 					continue;
927 				}
928 				if (thiscache != NULL)
929 					reinit_cache(thiscache);
930 				(void) mutex_unlock(&thiscache->mt_cache_lock);
931 			}
932 		}
933 		(void) mutex_unlock(&cpuptr->mt_parent_lock);
934 	}
935 	reinit = 0;
936 }
937 
938 static void
939 reinit_cache(cache_t *thiscache)
940 {
941 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
942 	int32_t i, n;
943 	caddr_t ret;
944 
945 	freeblocks = (uint32_t *)thiscache->mt_freelist;
946 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
947 		if (*freeblocks & 0xffffffff) {
948 		    for (i = 0; i < 32; i++) {
949 			if (FLIP_EM(*freeblocks) & (0x80000000 >> i)) {
950 				n = (uintptr_t)(((freeblocks -
951 				    (uint32_t *)thiscache->mt_freelist) << 5)
952 				    + i) * thiscache->mt_size;
953 				ret = thiscache->mt_arena + n;
954 				ret += OVERHEAD;
955 				copy_pattern(FREEPATTERN, ret,
956 				    thiscache->mt_size);
957 			}
958 		    }
959 		}
960 		freeblocks++;
961 	}
962 }
963 
964 static void *
965 malloc_internal(size_t size, percpu_t *cpuptr)
966 {
967 	cache_head_t *cachehead;
968 	cache_t *thiscache, *hintcache;
969 	int32_t i, n, logsz, bucket;
970 	uint32_t index;
971 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
972 	caddr_t ret;
973 
974 	logsz = MIN_CACHED_SHIFT;
975 
976 	while (size > (1 << logsz))
977 		logsz++;
978 
979 	bucket = logsz - MIN_CACHED_SHIFT;
980 
981 	(void) mutex_lock(&cpuptr->mt_parent_lock);
982 
983 	/*
984 	 * Find a cache of the appropriate size with free buffers.
985 	 *
986 	 * We don't need to lock each cache as we check their mt_nfree count,
987 	 * since:
988 	 *	1.  We are only looking for caches with mt_nfree > 0.  If a
989 	 *	   free happens during our search, it will increment mt_nfree,
990 	 *	   which will not effect the test.
991 	 *	2.  Allocations can decrement mt_nfree, but they can't happen
992 	 *	   as long as we hold mt_parent_lock.
993 	 */
994 
995 	cachehead = &cpuptr->mt_caches[bucket];
996 
997 	/* Search through the list, starting at the mt_hint */
998 	thiscache = cachehead->mt_hint;
999 
1000 	while (thiscache != NULL && thiscache->mt_nfree == 0)
1001 		thiscache = thiscache->mt_next;
1002 
1003 	if (thiscache == NULL) {
1004 		/* wrap around -- search up to the hint */
1005 		thiscache = cachehead->mt_cache;
1006 		hintcache = cachehead->mt_hint;
1007 
1008 		while (thiscache != NULL && thiscache != hintcache &&
1009 		    thiscache->mt_nfree == 0)
1010 			thiscache = thiscache->mt_next;
1011 
1012 		if (thiscache == hintcache)
1013 			thiscache = NULL;
1014 	}
1015 
1016 
1017 	if (thiscache == NULL) { /* there are no free caches */
1018 		int32_t thisrequest = requestsize;
1019 		int32_t buffer_size = (1 << logsz) + OVERHEAD;
1020 
1021 		thiscache = (cache_t *)morecore(thisrequest * HUNKSIZE);
1022 
1023 		if (thiscache == (cache_t *)-1) {
1024 		    (void) mutex_unlock(&cpuptr->mt_parent_lock);
1025 		    errno = EAGAIN;
1026 		    return (NULL);
1027 		}
1028 		create_cache(thiscache, buffer_size, thisrequest);
1029 
1030 		/* link in the new block at the beginning of the list */
1031 		thiscache->mt_next = cachehead->mt_cache;
1032 		cachehead->mt_cache = thiscache;
1033 	}
1034 
1035 	/* update the hint to the cache we found or created */
1036 	cachehead->mt_hint = thiscache;
1037 
1038 	/* thiscache now points to a cache with available space */
1039 	(void) mutex_lock(&thiscache->mt_cache_lock);
1040 
1041 	freeblocks = (uint32_t *)thiscache->mt_freelist;
1042 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
1043 		if (*freeblocks & 0xffffffff)
1044 			break;
1045 		freeblocks++;
1046 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1047 		    *freeblocks & 0xffffffff)
1048 			break;
1049 		freeblocks++;
1050 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1051 		    *freeblocks & 0xffffffff)
1052 			break;
1053 		freeblocks++;
1054 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1055 		    *freeblocks & 0xffffffff)
1056 			break;
1057 		freeblocks++;
1058 	}
1059 
1060 	/*
1061 	 * the offset from mt_freelist to freeblocks is the offset into
1062 	 * the arena. Be sure to include the offset into freeblocks
1063 	 * of the bitmask. n is the offset.
1064 	 */
1065 	for (i = 0; i < 32; ) {
1066 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1067 			break;
1068 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1069 			break;
1070 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1071 			break;
1072 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1073 			break;
1074 	}
1075 	index = 0x80000000 >> --i;
1076 
1077 
1078 	*freeblocks &= FLIP_EM(~index);
1079 
1080 	thiscache->mt_nfree--;
1081 
1082 	(void) mutex_unlock(&thiscache->mt_cache_lock);
1083 	(void) mutex_unlock(&cpuptr->mt_parent_lock);
1084 
1085 	n = (uintptr_t)(((freeblocks - (uint32_t *)thiscache->mt_freelist) << 5)
1086 		+ i) * thiscache->mt_size;
1087 	/*
1088 	 * Now you have the offset in n, you've changed the free mask
1089 	 * in the freelist. Nothing left to do but find the block
1090 	 * in the arena and put the value of thiscache in the word
1091 	 * ahead of the handed out address and return the memory
1092 	 * back to the user.
1093 	 */
1094 	ret = thiscache->mt_arena + n;
1095 
1096 	/* Store the cache addr for this buf. Makes free go fast. */
1097 	*(uintptr_t *)ret = (uintptr_t)thiscache;
1098 
1099 	/*
1100 	 * This assert makes sure we don't hand out memory that is not
1101 	 * owned by this cache.
1102 	 */
1103 	assert(ret + thiscache->mt_size <= thiscache->mt_freelist +
1104 		thiscache->mt_span);
1105 
1106 	ret += OVERHEAD;
1107 
1108 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1109 
1110 	if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1111 		if (verify_pattern(FREEPATTERN, ret, size))
1112 			abort();	/* reference after free */
1113 
1114 	if (debugopt & MTINITBUFFER)
1115 		copy_pattern(INITPATTERN, ret, size);
1116 	return ((void *)ret);
1117 }
1118 
1119 static void *
1120 morecore(size_t bytes)
1121 {
1122 	void * ret;
1123 
1124 	if (bytes > LONG_MAX) {
1125 		intptr_t wad;
1126 		/*
1127 		 * The request size is too big. We need to do this in
1128 		 * chunks. Sbrk only takes an int for an arg.
1129 		 */
1130 		if (bytes == ULONG_MAX)
1131 			return ((void *)-1);
1132 
1133 		ret = sbrk(0);
1134 		wad = LONG_MAX;
1135 		while (wad > 0) {
1136 			if (sbrk(wad) == (void *)-1) {
1137 				if (ret != sbrk(0))
1138 					(void) sbrk(-LONG_MAX);
1139 				return ((void *)-1);
1140 			}
1141 			bytes -= LONG_MAX;
1142 			wad = bytes;
1143 		}
1144 	} else
1145 		ret = sbrk(bytes);
1146 
1147 	return (ret);
1148 }
1149 
1150 
1151 static void *
1152 oversize(size_t size)
1153 {
1154 	caddr_t ret;
1155 	oversize_t *big;
1156 	int bucket;
1157 
1158 	/* make sure we will not overflow */
1159 	if (size > MAX_MTMALLOC) {
1160 		errno = ENOMEM;
1161 		return (NULL);
1162 	}
1163 
1164 	/*
1165 	 * Since we ensure every address we hand back is
1166 	 * MTMALLOC_MIN_ALIGN-byte aligned, ALIGNing size ensures that the
1167 	 * memory handed out is MTMALLOC_MIN_ALIGN-byte aligned at both ends.
1168 	 * This eases the implementation of MTDEBUGPATTERN and MTINITPATTERN,
1169 	 * particularly where coalescing occurs.
1170 	 */
1171 	size = ALIGN(size, MTMALLOC_MIN_ALIGN);
1172 
1173 	/*
1174 	 * The idea with the global lock is that we are sure to
1175 	 * block in the kernel anyway since given an oversize alloc
1176 	 * we are sure to have to call morecore();
1177 	 */
1178 	(void) mutex_lock(&oversize_lock);
1179 
1180 	if ((big = find_oversize(size)) != NULL) {
1181 		if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1182 			if (verify_pattern(FREEPATTERN, big->addr, size))
1183 				abort();	/* reference after free */
1184 	} else {
1185 		/* Get more 8-byte aligned memory from heap */
1186 		ret = morecore(size + OVSZ_HEADER_SIZE);
1187 		if (ret == (caddr_t)-1) {
1188 			(void) mutex_unlock(&oversize_lock);
1189 			errno = ENOMEM;
1190 			return (NULL);
1191 		}
1192 		big = oversize_header_alloc((uintptr_t)ret, size);
1193 	}
1194 	ret = big->addr;
1195 
1196 	/* Add big to the hash table at the head of the relevant bucket. */
1197 	bucket = HASH_OVERSIZE(ret);
1198 	big->hash_next = ovsz_hashtab[bucket];
1199 	ovsz_hashtab[bucket] = big;
1200 
1201 	if (debugopt & MTINITBUFFER)
1202 		copy_pattern(INITPATTERN, ret, size);
1203 
1204 	(void) mutex_unlock(&oversize_lock);
1205 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1206 	return ((void *)ret);
1207 }
1208 
1209 static void
1210 insert_oversize(oversize_t *op, oversize_t *nx)
1211 {
1212 	oversize_t *sp;
1213 
1214 	/* locate correct insertion point in size-ordered list */
1215 	for (sp = oversize_list.next_bysize;
1216 	    sp != &oversize_list && (op->size > sp->size);
1217 	    sp = sp->next_bysize)
1218 		;
1219 
1220 	/* link into size-ordered list */
1221 	op->next_bysize = sp;
1222 	op->prev_bysize = sp->prev_bysize;
1223 	op->prev_bysize->next_bysize = op;
1224 	op->next_bysize->prev_bysize = op;
1225 
1226 	/*
1227 	 * link item into address-ordered list
1228 	 * (caller provides insertion point as an optimization)
1229 	 */
1230 	op->next_byaddr = nx;
1231 	op->prev_byaddr = nx->prev_byaddr;
1232 	op->prev_byaddr->next_byaddr = op;
1233 	op->next_byaddr->prev_byaddr = op;
1234 
1235 }
1236 
1237 static void
1238 unlink_oversize(oversize_t *lp)
1239 {
1240 	/* unlink from address list */
1241 	lp->prev_byaddr->next_byaddr = lp->next_byaddr;
1242 	lp->next_byaddr->prev_byaddr = lp->prev_byaddr;
1243 
1244 	/* unlink from size list */
1245 	lp->prev_bysize->next_bysize = lp->next_bysize;
1246 	lp->next_bysize->prev_bysize = lp->prev_bysize;
1247 }
1248 
1249 static void
1250 position_oversize_by_size(oversize_t *op)
1251 {
1252 	oversize_t *sp;
1253 
1254 	if (op->size > op->next_bysize->size ||
1255 	    op->size < op->prev_bysize->size) {
1256 
1257 		/* unlink from size list */
1258 		op->prev_bysize->next_bysize = op->next_bysize;
1259 		op->next_bysize->prev_bysize = op->prev_bysize;
1260 
1261 		/* locate correct insertion point in size-ordered list */
1262 		for (sp = oversize_list.next_bysize;
1263 		    sp != &oversize_list && (op->size > sp->size);
1264 		    sp = sp->next_bysize)
1265 			;
1266 
1267 		/* link into size-ordered list */
1268 		op->next_bysize = sp;
1269 		op->prev_bysize = sp->prev_bysize;
1270 		op->prev_bysize->next_bysize = op;
1271 		op->next_bysize->prev_bysize = op;
1272 	}
1273 }
1274 
1275 static void
1276 add_oversize(oversize_t *lp)
1277 {
1278 	int merge_flags = INSERT_ONLY;
1279 	oversize_t *nx;  	/* ptr to item right of insertion point */
1280 	oversize_t *pv;  	/* ptr to item left of insertion point */
1281 	uint_t size_lp, size_pv, size_nx;
1282 	uintptr_t endp_lp, endp_pv, endp_nx;
1283 
1284 	/*
1285 	 * Locate insertion point in address-ordered list
1286 	 */
1287 
1288 	for (nx = oversize_list.next_byaddr;
1289 	    nx != &oversize_list && (lp->addr > nx->addr);
1290 	    nx = nx->next_byaddr)
1291 		;
1292 
1293 	/*
1294 	 * Determine how to add chunk to oversize freelist
1295 	 */
1296 
1297 	size_lp = OVSZ_HEADER_SIZE + lp->size;
1298 	endp_lp = ALIGN((uintptr_t)lp + size_lp, MTMALLOC_MIN_ALIGN);
1299 	size_lp = endp_lp - (uintptr_t)lp;
1300 
1301 	pv = nx->prev_byaddr;
1302 
1303 	if (pv->size) {
1304 
1305 		size_pv = OVSZ_HEADER_SIZE + pv->size;
1306 		endp_pv = ALIGN((uintptr_t)pv + size_pv,
1307 		    MTMALLOC_MIN_ALIGN);
1308 		size_pv = endp_pv - (uintptr_t)pv;
1309 
1310 		/* Check for adjacency with left chunk */
1311 		if ((uintptr_t)lp == endp_pv)
1312 			merge_flags |= COALESCE_LEFT;
1313 	}
1314 
1315 	if (nx->size) {
1316 
1317 	    /* Check for adjacency with right chunk */
1318 	    if ((uintptr_t)nx == endp_lp) {
1319 		size_nx = OVSZ_HEADER_SIZE + nx->size;
1320 		endp_nx = ALIGN((uintptr_t)nx + size_nx,
1321 		    MTMALLOC_MIN_ALIGN);
1322 		size_nx = endp_nx - (uintptr_t)nx;
1323 		merge_flags |= COALESCE_RIGHT;
1324 	    }
1325 	}
1326 
1327 	/*
1328 	 * If MTDEBUGPATTERN==1, lp->addr will have been overwritten with
1329 	 * FREEPATTERN for lp->size bytes. If we can merge, the oversize
1330 	 * header(s) that will also become part of the memory available for
1331 	 * reallocation (ie lp and/or nx) must also be overwritten with
1332 	 * FREEPATTERN or we will SIGABRT when this memory is next reallocated.
1333 	 */
1334 	switch (merge_flags) {
1335 
1336 	case INSERT_ONLY:		/* Coalescing not possible */
1337 		insert_oversize(lp, nx);
1338 		break;
1339 	case COALESCE_LEFT:
1340 		pv->size += size_lp;
1341 		position_oversize_by_size(pv);
1342 		if (debugopt & MTDEBUGPATTERN)
1343 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1344 		break;
1345 	case COALESCE_RIGHT:
1346 		unlink_oversize(nx);
1347 		lp->size += size_nx;
1348 		insert_oversize(lp, pv->next_byaddr);
1349 		if (debugopt & MTDEBUGPATTERN)
1350 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1351 		break;
1352 	case COALESCE_WITH_BOTH_SIDES:	/* Merge (with right) to the left */
1353 		pv->size += size_lp + size_nx;
1354 		unlink_oversize(nx);
1355 		position_oversize_by_size(pv);
1356 		if (debugopt & MTDEBUGPATTERN) {
1357 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1358 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1359 		}
1360 		break;
1361 	}
1362 }
1363 
1364 /*
1365  * Find memory on our list that is at least size big. If we find a block that is
1366  * big enough, we break it up and return the associated oversize_t struct back
1367  * to the calling client. Any leftover piece of that block is returned to the
1368  * freelist.
1369  */
1370 static oversize_t *
1371 find_oversize(size_t size)
1372 {
1373 	oversize_t *wp = oversize_list.next_bysize;
1374 	while (wp != &oversize_list && size > wp->size)
1375 		wp = wp->next_bysize;
1376 
1377 	if (wp == &oversize_list) /* empty list or nothing big enough */
1378 		return (NULL);
1379 	/* breaking up a chunk of memory */
1380 	if ((long)((wp->size - (size + OVSZ_HEADER_SIZE + MTMALLOC_MIN_ALIGN)))
1381 	    > MAX_CACHED) {
1382 		caddr_t off;
1383 		oversize_t *np;
1384 		size_t osize;
1385 		off = (caddr_t)ALIGN(wp->addr + size,
1386 		    MTMALLOC_MIN_ALIGN);
1387 		osize = wp->size;
1388 		wp->size = (size_t)(off - wp->addr);
1389 		np = oversize_header_alloc((uintptr_t)off,
1390 		    osize - (wp->size + OVSZ_HEADER_SIZE));
1391 		if ((long)np->size < 0)
1392 			abort();
1393 		unlink_oversize(wp);
1394 		add_oversize(np);
1395 	} else {
1396 		unlink_oversize(wp);
1397 	}
1398 	return (wp);
1399 }
1400 
1401 static void
1402 copy_pattern(uint32_t pattern, void *buf_arg, size_t size)
1403 {
1404 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1405 	uint32_t *buf = buf_arg;
1406 
1407 	while (buf < bufend - 3) {
1408 		buf[3] = buf[2] = buf[1] = buf[0] = pattern;
1409 		buf += 4;
1410 	}
1411 	while (buf < bufend)
1412 		*buf++ = pattern;
1413 }
1414 
1415 static void *
1416 verify_pattern(uint32_t pattern, void *buf_arg, size_t size)
1417 {
1418 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1419 	uint32_t *buf;
1420 
1421 	for (buf = buf_arg; buf < bufend; buf++)
1422 		if (*buf != pattern)
1423 			return (buf);
1424 	return (NULL);
1425 }
1426 
1427 static void
1428 free_oversize(oversize_t *ovp)
1429 {
1430 	assert(((uintptr_t)ovp->addr & 7) == 0); /* are we 8 byte aligned */
1431 	assert(ovp->size > MAX_CACHED);
1432 
1433 	ovp->next_bysize = ovp->prev_bysize = NULL;
1434 	ovp->next_byaddr = ovp->prev_byaddr = NULL;
1435 	(void) mutex_lock(&oversize_lock);
1436 	add_oversize(ovp);
1437 	(void) mutex_unlock(&oversize_lock);
1438 }
1439 
1440 static oversize_t *
1441 oversize_header_alloc(uintptr_t mem, size_t size)
1442 {
1443 	oversize_t *ovsz_hdr;
1444 
1445 	assert(size > MAX_CACHED);
1446 
1447 	ovsz_hdr = (oversize_t *)mem;
1448 	ovsz_hdr->prev_bysize = NULL;
1449 	ovsz_hdr->next_bysize = NULL;
1450 	ovsz_hdr->prev_byaddr = NULL;
1451 	ovsz_hdr->next_byaddr = NULL;
1452 	ovsz_hdr->hash_next = NULL;
1453 	ovsz_hdr->size = size;
1454 	mem += OVSZ_SIZE;
1455 	*(uintptr_t *)mem = MTMALLOC_OVERSIZE_MAGIC;
1456 	mem += OVERHEAD;
1457 	assert(((uintptr_t)mem & 7) == 0); /* are we 8 byte aligned */
1458 	ovsz_hdr->addr = (caddr_t)mem;
1459 	return (ovsz_hdr);
1460 }
1461 
1462 static void
1463 malloc_prepare()
1464 {
1465 	percpu_t *cpuptr;
1466 	cache_head_t *cachehead;
1467 	cache_t *thiscache;
1468 
1469 	(void) mutex_lock(&oversize_lock);
1470 	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
1471 		(void) mutex_lock(&cpuptr->mt_parent_lock);
1472 		for (cachehead = &cpuptr->mt_caches[0];
1473 		    cachehead < &cpuptr->mt_caches[NUM_CACHES];
1474 		    cachehead++) {
1475 			for (thiscache = cachehead->mt_cache;
1476 			    thiscache != NULL;
1477 			    thiscache = thiscache->mt_next) {
1478 				(void) mutex_lock(
1479 				    &thiscache->mt_cache_lock);
1480 			}
1481 		}
1482 	}
1483 }
1484 
1485 static void
1486 malloc_release()
1487 {
1488 	percpu_t *cpuptr;
1489 	cache_head_t *cachehead;
1490 	cache_t *thiscache;
1491 
1492 	for (cpuptr = &cpu_list[ncpus - 1]; cpuptr >= &cpu_list[0]; cpuptr--) {
1493 		for (cachehead = &cpuptr->mt_caches[NUM_CACHES - 1];
1494 		    cachehead >= &cpuptr->mt_caches[0];
1495 		    cachehead--) {
1496 			for (thiscache = cachehead->mt_cache;
1497 			    thiscache != NULL;
1498 			    thiscache = thiscache->mt_next) {
1499 				(void) mutex_unlock(
1500 				    &thiscache->mt_cache_lock);
1501 			}
1502 		}
1503 		(void) mutex_unlock(&cpuptr->mt_parent_lock);
1504 	}
1505 	(void) mutex_unlock(&oversize_lock);
1506 }
1507 
1508 #pragma init(malloc_init)
1509 static void
1510 malloc_init(void)
1511 {
1512 	/*
1513 	 * This works in the init section for this library
1514 	 * because setup_caches() doesn't call anything in libc
1515 	 * that calls malloc().  If it did, disaster would ensue.
1516 	 *
1517 	 * For this to work properly, this library must be the first
1518 	 * one to have its init section called (after libc) by the
1519 	 * dynamic linker.  If some other library's init section
1520 	 * ran first and called malloc(), disaster would ensue.
1521 	 * Because this is an interposer library for malloc(), the
1522 	 * dynamic linker arranges for its init section to run first.
1523 	 */
1524 	(void) setup_caches();
1525 
1526 	(void) pthread_atfork(malloc_prepare, malloc_release, malloc_release);
1527 }
1528