xref: /titanic_44/usr/src/lib/libmtmalloc/common/mtmalloc.c (revision b369f4b871a39ef94e220443957975f445f52eb6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <mtmalloc.h>
30 #include "mtmalloc_impl.h"
31 #include <unistd.h>
32 #include <synch.h>
33 #include <thread.h>
34 #include <stdio.h>
35 #include <limits.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <strings.h>
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 
42 /*
43  * To turn on the asserts just compile -DDEBUG
44  */
45 
46 #ifndef	DEBUG
47 #define	NDEBUG
48 #endif
49 
50 #include <assert.h>
51 
52 /*
53  * The MT hot malloc implementation contained herein is designed to be
54  * plug-compatible with the libc version of malloc. It is not intended
55  * to replace that implementation until we decide that it is ok to break
56  * customer apps (Solaris 3.0).
57  *
58  * For requests up to 2^^16, the allocator initializes itself into NCPUS
59  * worth of chains of caches. When a memory request is made, the calling thread
60  * is vectored into one of NCPUS worth of caches.  The LWP id gives us a cheap,
61  * contention-reducing index to use, eventually, this should be replaced with
62  * the actual CPU sequence number, when an interface to get it is available.
63  *
64  * Once the thread is vectored into one of the list of caches the real
65  * allocation of the memory begins. The size is determined to figure out which
66  * bucket the allocation should be satisfied from. The management of free
67  * buckets is done via a bitmask. A free bucket is represented by a 1. The
68  * first free bit represents the first free bucket. The position of the bit,
69  * represents the position of the bucket in the arena.
70  *
71  * When the memory from the arena is handed out, the address of the cache
72  * control structure is written in the word preceeding the returned memory.
73  * This cache control address is used during free() to mark the buffer free
74  * in the cache control structure.
75  *
76  * When all available memory in a cache has been depleted, a new chunk of memory
77  * is allocated via sbrk(). The new cache is allocated from this chunk of memory
78  * and initialized in the function create_cache(). New caches are installed at
79  * the front of a singly linked list of the same size memory pools. This helps
80  * to ensure that there will tend to be available memory in the beginning of the
81  * list.
82  *
83  * Long linked lists hurt performance. To decrease this effect, there is a
84  * tunable, requestsize, that bumps up the sbrk allocation size and thus
85  * increases the number of available blocks within an arena.  We also keep
86  * a "hint" for each cache list, which is the last cache in the list allocated
87  * from.  This lowers the cost of searching if there are a lot of fully
88  * allocated blocks at the front of the list.
89  *
90  * For requests greater than 2^^16 (oversize allocations), there are two pieces
91  * of overhead. There is the OVERHEAD used to hold the cache addr
92  * (&oversize_list), plus an oversize_t structure to further describe the block.
93  *
94  * The oversize list is kept as defragmented as possible by coalescing
95  * freed oversized allocations with adjacent neighbors.
96  *
97  * Addresses handed out are stored in a hash table, and are aligned on
98  * MTMALLOC_MIN_ALIGN-byte boundaries at both ends. Request sizes are rounded-up
99  * where necessary in order to achieve this. This eases the implementation of
100  * MTDEBUGPATTERN and MTINITPATTERN, particularly where coalescing occurs.
101  *
102  * A memalign allocation takes memalign header overhead.  There's two
103  * types of memalign headers distinguished by MTMALLOC_MEMALIGN_MAGIC
104  * and MTMALLOC_MEMALIGN_MIN_MAGIC.  When the size of memory taken to
105  * get to the aligned address from malloc'ed address is the minimum size
106  * OVERHEAD, we create a header taking only one OVERHEAD space with magic
107  * number MTMALLOC_MEMALIGN_MIN_MAGIC, and we know by subtracting OVERHEAD
108  * from memaligned address, we can get to the malloc'ed address. Otherwise,
109  * we create a memalign header taking two OVERHEAD space, one stores
110  * MTMALLOC_MEMALIGN_MAGIC magic number, the other one points back to the
111  * malloc'ed address.
112  */
113 
114 #if defined(__i386) || defined(__amd64)
115 #include <arpa/inet.h>	/* for htonl() */
116 #endif
117 
118 static void * morecore(size_t);
119 static int setup_caches(void);
120 static void create_cache(cache_t *, size_t bufsize, uint_t hunks);
121 static void * malloc_internal(size_t, percpu_t *);
122 static void * oversize(size_t);
123 static oversize_t *find_oversize(size_t);
124 static void add_oversize(oversize_t *);
125 static void copy_pattern(uint32_t, void *, size_t);
126 static void * verify_pattern(uint32_t, void *, size_t);
127 static void reinit_cpu_list(void);
128 static void reinit_cache(cache_t *);
129 static void free_oversize(oversize_t *);
130 static oversize_t *oversize_header_alloc(uintptr_t, size_t);
131 
132 /*
133  * oversize hash table stuff
134  */
135 #define	NUM_BUCKETS	67	/* must be prime */
136 #define	HASH_OVERSIZE(caddr)	((uintptr_t)(caddr) % NUM_BUCKETS)
137 oversize_t *ovsz_hashtab[NUM_BUCKETS];
138 
139 /*
140  * Gets a decent "current cpu identifier", to be used to reduce contention.
141  * Eventually, this should be replaced by an interface to get the actual
142  * CPU sequence number in libthread/liblwp.
143  */
144 extern uint_t _thr_self();
145 #pragma weak _thr_self
146 #define	get_curcpu_func() (curcpu_func)_thr_self
147 
148 #define	ALIGN(x, a)	((((uintptr_t)(x) + ((uintptr_t)(a) - 1)) \
149 			& ~((uintptr_t)(a) - 1)))
150 
151 /* need this to deal with little endianess of x86 */
152 #if defined(__i386) || defined(__amd64)
153 #define	FLIP_EM(x)	htonl((x))
154 #else
155 #define	FLIP_EM(x)	(x)
156 #endif
157 
158 #define	INSERT_ONLY			0
159 #define	COALESCE_LEFT			0x00000001
160 #define	COALESCE_RIGHT			0x00000002
161 #define	COALESCE_WITH_BOTH_SIDES	(COALESCE_LEFT | COALESCE_RIGHT)
162 
163 #define	OVERHEAD	8	/* size needed to write cache addr */
164 #define	HUNKSIZE	8192	/* just a multiplier */
165 
166 #define	MAX_CACHED_SHIFT	16	/* 64K is the max cached size */
167 #define	MAX_CACHED		(1 << MAX_CACHED_SHIFT)
168 #define	MIN_CACHED_SHIFT	4	/* smaller requests rounded up */
169 #define	MTMALLOC_MIN_ALIGN	8	/* min guaranteed alignment */
170 
171 /* maximum size before overflow */
172 #define	MAX_MTMALLOC	(SIZE_MAX - (SIZE_MAX % MTMALLOC_MIN_ALIGN) \
173 			- OVSZ_HEADER_SIZE)
174 
175 #define	NUM_CACHES	(MAX_CACHED_SHIFT - MIN_CACHED_SHIFT + 1)
176 #define	CACHELIST_SIZE	ALIGN(NUM_CACHES * sizeof (cache_head_t), \
177     CACHE_COHERENCY_UNIT)
178 
179 #define	MINSIZE		9	/* for requestsize, tunable */
180 #define	MAXSIZE		256	/* arbitrary, big enough, for requestsize */
181 
182 #define	FREEPATTERN	0xdeadbeef /* debug fill pattern for free buf */
183 #define	INITPATTERN	0xbaddcafe /* debug fill pattern for new buf */
184 
185 #define	misaligned(p)	((unsigned)(p) & (sizeof (int) - 1))
186 #define	IS_OVERSIZE(x, y)	(((x) < (y)) && (((x) > MAX_CACHED)? 1 : 0))
187 
188 static long requestsize = MINSIZE; /* 9 pages per cache; tunable; 9 is min */
189 
190 static uint_t cpu_mask;
191 static curcpu_func curcpu;
192 
193 static int32_t debugopt;
194 static int32_t reinit;
195 
196 static percpu_t *cpu_list;
197 static oversize_t oversize_list;
198 static mutex_t oversize_lock;
199 
200 static int ncpus;
201 
202 #define	MTMALLOC_OVERSIZE_MAGIC		((uintptr_t)&oversize_list)
203 #define	MTMALLOC_MEMALIGN_MAGIC		((uintptr_t)&oversize_list + 1)
204 #define	MTMALLOC_MEMALIGN_MIN_MAGIC	((uintptr_t)&oversize_list + 2)
205 
206 /*
207  * We require allocations handed out to be aligned on MTMALLOC_MIN_ALIGN-byte
208  * boundaries. We round up sizeof (oversize_t) (when necessary) to ensure that
209  * this is achieved.
210  */
211 #define	OVSZ_SIZE		(ALIGN(sizeof (oversize_t), MTMALLOC_MIN_ALIGN))
212 #define	OVSZ_HEADER_SIZE	(OVSZ_SIZE + OVERHEAD)
213 
214 /*
215  * memalign header takes 2 OVERHEAD space.  One for memalign magic, and the
216  * other one points back to the start address of originally allocated space.
217  */
218 #define	MEMALIGN_HEADER_SIZE	2 * OVERHEAD
219 #define	MEMALIGN_HEADER_ALLOC(x, shift, malloc_addr)\
220 	if (shift == OVERHEAD)\
221 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
222 			MTMALLOC_MEMALIGN_MIN_MAGIC; \
223 	else {\
224 		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
225 			MTMALLOC_MEMALIGN_MAGIC; \
226 		*((uintptr_t *)((caddr_t)x - 2 * OVERHEAD)) = \
227 			(uintptr_t)malloc_addr; \
228 	}
229 
230 void *
231 malloc(size_t bytes)
232 {
233 	percpu_t *list_rotor;
234 	uint_t	list_index;
235 
236 	/*
237 	 * this test is due to linking with libthread.
238 	 * There are malloc calls prior to this library
239 	 * being initialized.
240 	 *
241 	 * If setup_caches fails, we set ENOMEM and return NULL
242 	 */
243 	if (cpu_list == (percpu_t *)NULL) {
244 		if (setup_caches() == 0) {
245 			errno = ENOMEM;
246 			return (NULL);
247 		}
248 	}
249 
250 	if (bytes > MAX_CACHED)
251 		return (oversize(bytes));
252 
253 	list_index = (curcpu() & cpu_mask);
254 
255 	list_rotor = &cpu_list[list_index];
256 
257 	return (malloc_internal(bytes, list_rotor));
258 }
259 
260 void *
261 realloc(void * ptr, size_t bytes)
262 {
263 	void *new, *data_ptr;
264 	cache_t *cacheptr;
265 	caddr_t mem;
266 	size_t shift = 0;
267 
268 	if (ptr == NULL)
269 		return (malloc(bytes));
270 
271 	if (bytes == 0) {
272 		free(ptr);
273 		return (NULL);
274 	}
275 
276 	data_ptr = ptr;
277 	mem = (caddr_t)ptr - OVERHEAD;
278 
279 	new = malloc(bytes);
280 
281 	if (new == NULL)
282 		return (NULL);
283 
284 	/*
285 	 * If new == ptr, ptr has previously been freed. Passing a freed pointer
286 	 * to realloc() is not allowed - unless the caller specifically states
287 	 * otherwise, in which case we must avoid freeing ptr (ie new) before we
288 	 * return new. There is (obviously) no requirement to memcpy() ptr to
289 	 * new before we return.
290 	 */
291 	if (new == ptr) {
292 		if (!(debugopt & MTDOUBLEFREE))
293 			abort();
294 		return (new);
295 	}
296 
297 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
298 		mem -= OVERHEAD;
299 		ptr = (void *)*(uintptr_t *)mem;
300 		mem = (caddr_t)ptr - OVERHEAD;
301 		shift = (size_t)((uintptr_t)data_ptr - (uintptr_t)ptr);
302 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
303 		ptr = (void *) mem;
304 		mem -= OVERHEAD;
305 		shift = OVERHEAD;
306 	}
307 
308 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
309 		oversize_t *old;
310 
311 		old = (oversize_t *)(mem - OVSZ_SIZE);
312 		(void) memcpy(new, data_ptr, MIN(bytes, old->size - shift));
313 		free(ptr);
314 		return (new);
315 	}
316 
317 	cacheptr = (cache_t *)*(uintptr_t *)mem;
318 
319 	(void) memcpy(new, data_ptr,
320 		MIN(cacheptr->mt_size - OVERHEAD - shift, bytes));
321 	free(ptr);
322 
323 	return (new);
324 }
325 
326 void *
327 calloc(size_t nelem, size_t bytes)
328 {
329 	void * ptr;
330 	size_t size = nelem * bytes;
331 
332 	ptr = malloc(size);
333 	if (ptr == NULL)
334 		return (NULL);
335 	bzero(ptr, size);
336 
337 	return (ptr);
338 }
339 
340 void
341 free(void * ptr)
342 {
343 	cache_t *cacheptr;
344 	caddr_t mem;
345 	int32_t i;
346 	caddr_t freeblocks;
347 	uintptr_t offset;
348 	uchar_t mask;
349 	int32_t which_bit, num_bytes;
350 
351 	if (ptr == NULL)
352 		return;
353 
354 	mem = (caddr_t)ptr - OVERHEAD;
355 
356 	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
357 		mem -= OVERHEAD;
358 		ptr = (void *)*(uintptr_t *)mem;
359 		mem = (caddr_t)ptr - OVERHEAD;
360 	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
361 		ptr = (void *) mem;
362 		mem -= OVERHEAD;
363 	}
364 
365 	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
366 		oversize_t *big, **opp;
367 		int bucket;
368 
369 		big = (oversize_t *)(mem - OVSZ_SIZE);
370 		(void) mutex_lock(&oversize_lock);
371 
372 		bucket = HASH_OVERSIZE(big->addr);
373 		for (opp = &ovsz_hashtab[bucket]; *opp != NULL;
374 		    opp = &(*opp)->hash_next)
375 			if (*opp == big)
376 				break;
377 
378 		if (*opp == NULL) {
379 			if (!(debugopt & MTDOUBLEFREE))
380 				abort();
381 			(void) mutex_unlock(&oversize_lock);
382 			return;
383 		}
384 
385 		*opp = big->hash_next;	/* remove big from the hash table */
386 		big->hash_next = NULL;
387 
388 		if (debugopt & MTDEBUGPATTERN)
389 			copy_pattern(FREEPATTERN, ptr, big->size);
390 		add_oversize(big);
391 		(void) mutex_unlock(&oversize_lock);
392 		return;
393 	}
394 
395 	cacheptr = (cache_t *)*(uintptr_t *)mem;
396 	freeblocks = cacheptr->mt_freelist;
397 
398 	/*
399 	 * This is the distance measured in bits into the arena.
400 	 * The value of offset is in bytes but there is a 1-1 correlation
401 	 * between distance into the arena and distance into the
402 	 * freelist bitmask.
403 	 */
404 	offset = mem - cacheptr->mt_arena;
405 
406 	/*
407 	 * i is total number of bits to offset into freelist bitmask.
408 	 */
409 
410 	i = offset / cacheptr->mt_size;
411 
412 	num_bytes = i >> 3;
413 
414 	/*
415 	 * which_bit is the bit offset into the byte in the freelist.
416 	 * if our freelist bitmask looks like 0xf3 and we are freeing
417 	 * block 5 (ie: the 6th block) our mask will be 0xf7 after
418 	 * the free. Things go left to right that's why the mask is 0x80
419 	 * and not 0x01.
420 	 */
421 	which_bit = i - (num_bytes << 3);
422 
423 	mask = 0x80 >> which_bit;
424 
425 	freeblocks += num_bytes;
426 
427 	if (debugopt & MTDEBUGPATTERN)
428 		copy_pattern(FREEPATTERN, ptr, cacheptr->mt_size - OVERHEAD);
429 
430 	(void) mutex_lock(&cacheptr->mt_cache_lock);
431 
432 	if (*freeblocks & mask) {
433 		if (!(debugopt & MTDOUBLEFREE))
434 			abort();
435 	} else {
436 		*freeblocks |= mask;
437 		cacheptr->mt_nfree++;
438 	}
439 
440 	(void) mutex_unlock(&cacheptr->mt_cache_lock);
441 }
442 
443 void *
444 memalign(size_t alignment, size_t size)
445 {
446 	size_t alloc_size;
447 	uintptr_t offset;
448 	void *alloc_buf;
449 	void *ret_buf;
450 
451 	if (size == 0 || alignment == 0 ||
452 		misaligned(alignment) ||
453 		(alignment & (alignment - 1)) != 0) {
454 		errno = EINVAL;
455 		return (NULL);
456 	}
457 
458 	/* <= MTMALLOC_MIN_ALIGN, malloc can provide directly */
459 	if (alignment <= MTMALLOC_MIN_ALIGN)
460 		return (malloc(size));
461 
462 	alloc_size = size + alignment - MTMALLOC_MIN_ALIGN;
463 
464 	if (alloc_size < size) { /* overflow */
465 		errno = ENOMEM;
466 		return (NULL);
467 	}
468 
469 	alloc_buf = malloc(alloc_size);
470 
471 	if (alloc_buf == NULL)
472 		/* malloc sets errno */
473 		return (NULL);
474 
475 	/*
476 	 * If alloc_size > MAX_CACHED, malloc() will have returned a multiple of
477 	 * MTMALLOC_MIN_ALIGN, having rounded-up alloc_size if necessary. Since
478 	 * we will use alloc_size to return the excess fragments to the free
479 	 * list, we also round-up alloc_size if necessary.
480 	 */
481 	if ((alloc_size > MAX_CACHED) &&
482 	    (alloc_size & (MTMALLOC_MIN_ALIGN - 1)))
483 		alloc_size = ALIGN(alloc_size, MTMALLOC_MIN_ALIGN);
484 
485 	if ((offset = (uintptr_t)alloc_buf & (alignment - 1)) == 0) {
486 		/* aligned correctly */
487 
488 		size_t frag_size = alloc_size -
489 			(size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
490 
491 		/*
492 		 * If the leftover piece of the memory > MAX_CACHED,
493 		 * split off the piece and return it back to the freelist.
494 		 */
495 		if (IS_OVERSIZE(frag_size, alloc_size)) {
496 			oversize_t *orig, *tail;
497 			uintptr_t taddr;
498 			size_t data_size;
499 			taddr = ALIGN((uintptr_t)alloc_buf + size,
500 					MTMALLOC_MIN_ALIGN);
501 			data_size = taddr - (uintptr_t)alloc_buf;
502 			orig = (oversize_t *)((uintptr_t)alloc_buf -
503 					OVSZ_HEADER_SIZE);
504 			frag_size = orig->size - data_size -
505 					OVSZ_HEADER_SIZE;
506 			orig->size = data_size;
507 			tail = oversize_header_alloc(taddr, frag_size);
508 			free_oversize(tail);
509 		}
510 		ret_buf = alloc_buf;
511 	} else {
512 		uchar_t	oversize_bits = 0;
513 		size_t	head_sz, data_sz, tail_sz;
514 		uintptr_t ret_addr, taddr, shift, tshift;
515 		oversize_t *orig, *tail;
516 		size_t tsize;
517 
518 		/* needs to be aligned */
519 		shift = alignment - offset;
520 
521 		assert(shift >= MTMALLOC_MIN_ALIGN);
522 
523 		ret_addr = ((uintptr_t)alloc_buf + shift);
524 		ret_buf = (void *)ret_addr;
525 
526 		if (alloc_size <= MAX_CACHED) {
527 			MEMALIGN_HEADER_ALLOC(ret_addr, shift, alloc_buf);
528 			return (ret_buf);
529 		}
530 
531 		/*
532 		 * Only check for the fragments when the memory is allocted
533 		 * from oversize_list.  Split off a fragment and return it
534 		 * to the oversize freelist when it's > MAX_CACHED.
535 		 */
536 
537 		head_sz = shift - MAX(MEMALIGN_HEADER_SIZE, OVSZ_HEADER_SIZE);
538 
539 		tail_sz = alloc_size -
540 			(shift + size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
541 
542 		oversize_bits |= IS_OVERSIZE(head_sz, alloc_size) |
543 				IS_OVERSIZE(size, alloc_size) << DATA_SHIFT |
544 				IS_OVERSIZE(tail_sz, alloc_size) << TAIL_SHIFT;
545 
546 		switch (oversize_bits) {
547 			case NONE_OVERSIZE:
548 			case DATA_OVERSIZE:
549 				MEMALIGN_HEADER_ALLOC(ret_addr, shift,
550 					alloc_buf);
551 				break;
552 			case HEAD_OVERSIZE:
553 				/*
554 				 * If we can extend data > MAX_CACHED and have
555 				 * head still > MAX_CACHED, we split head-end
556 				 * as the case of head-end and data oversized,
557 				 * otherwise just create memalign header.
558 				 */
559 				tsize = (shift + size) - (MAX_CACHED + 8 +
560 					MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
561 
562 				if (!IS_OVERSIZE(tsize, alloc_size)) {
563 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
564 						alloc_buf);
565 					break;
566 				} else {
567 					tsize += OVSZ_HEADER_SIZE;
568 					taddr = ALIGN((uintptr_t)alloc_buf +
569 						tsize, MTMALLOC_MIN_ALIGN);
570 					tshift = ret_addr - taddr;
571 					MEMALIGN_HEADER_ALLOC(ret_addr, tshift,
572 						taddr);
573 					ret_addr = taddr;
574 					shift = ret_addr - (uintptr_t)alloc_buf;
575 				}
576 				/* FALLTHROUGH */
577 			case HEAD_AND_DATA_OVERSIZE:
578 				/*
579 				 * Split off the head fragment and
580 				 * return it back to oversize freelist.
581 				 * Create oversize header for the piece
582 				 * of (data + tail fragment).
583 				 */
584 				orig = (oversize_t *)((uintptr_t)alloc_buf -
585 						OVSZ_HEADER_SIZE);
586 				(void) oversize_header_alloc(ret_addr -
587 						OVSZ_HEADER_SIZE,
588 						(orig->size - shift));
589 				orig->size = shift - OVSZ_HEADER_SIZE;
590 
591 				/* free up the head fragment */
592 				free_oversize(orig);
593 				break;
594 			case TAIL_OVERSIZE:
595 				/*
596 				 * If we can extend data > MAX_CACHED and have
597 				 * tail-end still > MAX_CACHED, we split tail
598 				 * end, otherwise just create memalign header.
599 				 */
600 				orig = (oversize_t *)((uintptr_t)alloc_buf -
601 						OVSZ_HEADER_SIZE);
602 				tsize =  orig->size - (MAX_CACHED + 8 +
603 					shift + OVSZ_HEADER_SIZE +
604 					MTMALLOC_MIN_ALIGN);
605 				if (!IS_OVERSIZE(tsize, alloc_size)) {
606 					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
607 						alloc_buf);
608 					break;
609 				} else {
610 					size = MAX_CACHED + 8;
611 				}
612 				/* FALLTHROUGH */
613 			case DATA_AND_TAIL_OVERSIZE:
614 				/*
615 				 * Split off the tail fragment and
616 				 * return it back to oversize freelist.
617 				 * Create memalign header and adjust
618 				 * the size for the piece of
619 				 * (head fragment + data).
620 				 */
621 				taddr = ALIGN(ret_addr + size,
622 						MTMALLOC_MIN_ALIGN);
623 				data_sz = (size_t)(taddr -
624 						(uintptr_t)alloc_buf);
625 				orig = (oversize_t *)((uintptr_t)alloc_buf -
626 						OVSZ_HEADER_SIZE);
627 				tsize = orig->size - data_sz;
628 				orig->size = data_sz;
629 				MEMALIGN_HEADER_ALLOC(ret_buf, shift,
630 					alloc_buf);
631 				tsize -= OVSZ_HEADER_SIZE;
632 				tail = oversize_header_alloc(taddr,  tsize);
633 				free_oversize(tail);
634 				break;
635 			case HEAD_AND_TAIL_OVERSIZE:
636 				/*
637 				 * Split off the head fragment.
638 				 * We try to free up tail-end when we can
639 				 * extend data size to (MAX_CACHED + 8)
640 				 * and remain tail-end oversized.
641 				 * The bottom line is all split pieces
642 				 * should be oversize in size.
643 				 */
644 				orig = (oversize_t *)((uintptr_t)alloc_buf -
645 					OVSZ_HEADER_SIZE);
646 				tsize =  orig->size - (MAX_CACHED + 8 +
647 					OVSZ_HEADER_SIZE + shift +
648 					MTMALLOC_MIN_ALIGN);
649 
650 				if (!IS_OVERSIZE(tsize, alloc_size)) {
651 					/*
652 					 * If the chunk is not big enough
653 					 * to make both data and tail oversize
654 					 * we just keep them as one piece.
655 					 */
656 					(void) oversize_header_alloc(ret_addr -
657 						OVSZ_HEADER_SIZE,
658 						orig->size - shift);
659 					orig->size = shift -
660 						OVSZ_HEADER_SIZE;
661 					free_oversize(orig);
662 					break;
663 				} else {
664 					/*
665 					 * extend data size > MAX_CACHED
666 					 * and handle it as head, data, tail
667 					 * are all oversized.
668 					 */
669 					size = MAX_CACHED + 8;
670 				}
671 				/* FALLTHROUGH */
672 			case ALL_OVERSIZE:
673 				/*
674 				 * split off the head and tail fragments,
675 				 * return them back to the oversize freelist.
676 				 * Alloc oversize header for data seg.
677 				 */
678 				orig = (oversize_t *)((uintptr_t)alloc_buf -
679 					OVSZ_HEADER_SIZE);
680 				tsize = orig->size;
681 				orig->size = shift - OVSZ_HEADER_SIZE;
682 				free_oversize(orig);
683 
684 				taddr = ALIGN(ret_addr + size,
685 					MTMALLOC_MIN_ALIGN);
686 				data_sz = taddr - ret_addr;
687 				assert(tsize > (shift + data_sz +
688 					OVSZ_HEADER_SIZE));
689 				tail_sz = tsize -
690 					(shift + data_sz + OVSZ_HEADER_SIZE);
691 
692 				/* create oversize header for data seg */
693 				(void) oversize_header_alloc(ret_addr -
694 					OVSZ_HEADER_SIZE, data_sz);
695 
696 				/* create oversize header for tail fragment */
697 				tail = oversize_header_alloc(taddr, tail_sz);
698 				free_oversize(tail);
699 				break;
700 			default:
701 				/* should not reach here */
702 				assert(0);
703 		}
704 	}
705 	return (ret_buf);
706 }
707 
708 
709 void *
710 valloc(size_t size)
711 {
712 	static unsigned pagesize;
713 
714 	if (size == 0)
715 		return (NULL);
716 
717 	if (!pagesize)
718 		pagesize = sysconf(_SC_PAGESIZE);
719 
720 	return (memalign(pagesize, size));
721 }
722 
723 void
724 mallocctl(int cmd, long value)
725 {
726 	switch (cmd) {
727 
728 	case MTDEBUGPATTERN:
729 		/*
730 		 * Reinitialize free blocks in case malloc() is called prior
731 		 * to mallocctl().
732 		 */
733 		if (value && !(debugopt & cmd)) {
734 			reinit++;
735 			debugopt |= cmd;
736 			reinit_cpu_list();
737 		}
738 		/*FALLTHRU*/
739 	case MTDOUBLEFREE:
740 	case MTINITBUFFER:
741 		if (value)
742 			debugopt |= cmd;
743 		else
744 			debugopt &= ~cmd;
745 		break;
746 	case MTCHUNKSIZE:
747 		if (value >= MINSIZE && value <= MAXSIZE)
748 			requestsize = value;
749 		break;
750 	default:
751 		break;
752 	}
753 }
754 
755 /*
756  * if this function is changed, update the fallback code in setup_caches to
757  * set ncpus to the number of possible return values. (currently 1)
758  */
759 static uint_t
760 fallback_curcpu(void)
761 {
762 	return (0);
763 }
764 
765 /*
766  * Returns non-zero on success, zero on failure.
767  *
768  * This carefully doesn't set cpu_list until initialization is finished.
769  */
770 static int
771 setup_caches(void)
772 {
773 	static mutex_t init_lock = DEFAULTMUTEX;
774 
775 	uintptr_t oldbrk;
776 	uintptr_t newbrk;
777 
778 	size_t cache_space_needed;
779 	size_t padding;
780 
781 	curcpu_func new_curcpu;
782 	uint_t new_cpu_mask;
783 	percpu_t *new_cpu_list;
784 
785 	uint_t i, j;
786 	uintptr_t list_addr;
787 
788 	(void) mutex_lock(&init_lock);
789 	if (cpu_list != NULL) {
790 		(void) mutex_unlock(&init_lock);
791 		return (1); 		/* success -- already initialized */
792 	}
793 
794 	new_curcpu = get_curcpu_func();
795 	if (new_curcpu == NULL) {
796 		new_curcpu = fallback_curcpu;
797 		ncpus = 1;
798 	} else {
799 		if ((ncpus = 2 * sysconf(_SC_NPROCESSORS_CONF)) <= 0)
800 			ncpus = 4; /* decent default value */
801 	}
802 	assert(ncpus > 0);
803 
804 	/* round ncpus up to a power of 2 */
805 	while (ncpus & (ncpus - 1))
806 		ncpus++;
807 
808 	new_cpu_mask = ncpus - 1;	/* create the cpu mask */
809 
810 	/*
811 	 * We now do some magic with the brk.  What we want to get in the
812 	 * end is a bunch of well-aligned stuff in a big initial allocation.
813 	 * Along the way, we do sanity checks to make sure no one else has
814 	 * touched the brk (which shouldn't happen, but it's always good to
815 	 * check)
816 	 *
817 	 * First, make sure sbrk is sane, and store the current brk in oldbrk.
818 	 */
819 	oldbrk = (uintptr_t)sbrk(0);
820 	if ((void *)oldbrk == (void *)-1) {
821 		(void) mutex_unlock(&init_lock);
822 		return (0);	/* sbrk is broken -- we're doomed. */
823 	}
824 
825 	/*
826 	 * Now, align the brk to a multiple of CACHE_COHERENCY_UNIT, so that
827 	 * the percpu structures and cache lists will be properly aligned.
828 	 *
829 	 *   2.  All hunks will be page-aligned, assuming HUNKSIZE >= PAGESIZE,
830 	 *	so they can be paged out individually.
831 	 */
832 	newbrk = ALIGN(oldbrk, CACHE_COHERENCY_UNIT);
833 	if (newbrk != oldbrk && (uintptr_t)sbrk(newbrk - oldbrk) != oldbrk) {
834 		(void) mutex_unlock(&init_lock);
835 		return (0);	/* someone else sbrked */
836 	}
837 
838 	/*
839 	 * For each cpu, there is one percpu_t and a list of caches
840 	 */
841 	cache_space_needed = ncpus * (sizeof (percpu_t) + CACHELIST_SIZE);
842 
843 	new_cpu_list = (percpu_t *)sbrk(cache_space_needed);
844 
845 	if (new_cpu_list == (percpu_t *)-1 ||
846 	    (uintptr_t)new_cpu_list != newbrk) {
847 		(void) mutex_unlock(&init_lock);
848 		return (0);	/* someone else sbrked */
849 	}
850 
851 	/*
852 	 * Finally, align the brk to HUNKSIZE so that all hunks are
853 	 * page-aligned, to avoid edge-effects.
854 	 */
855 
856 	newbrk = (uintptr_t)new_cpu_list + cache_space_needed;
857 
858 	padding = ALIGN(newbrk, HUNKSIZE) - newbrk;
859 
860 	if (padding > 0 && (uintptr_t)sbrk(padding) != newbrk) {
861 		(void) mutex_unlock(&init_lock);
862 		return (0);	/* someone else sbrked */
863 	}
864 
865 	list_addr = ((uintptr_t)new_cpu_list + (sizeof (percpu_t) * ncpus));
866 
867 	/* initialize the percpu list */
868 	for (i = 0; i < ncpus; i++) {
869 		new_cpu_list[i].mt_caches = (cache_head_t *)list_addr;
870 		for (j = 0; j < NUM_CACHES; j++) {
871 			new_cpu_list[i].mt_caches[j].mt_cache = NULL;
872 			new_cpu_list[i].mt_caches[j].mt_hint = NULL;
873 		}
874 
875 		bzero(&new_cpu_list[i].mt_parent_lock, sizeof (mutex_t));
876 
877 		/* get the correct cache list alignment */
878 		list_addr += CACHELIST_SIZE;
879 	}
880 
881 	/*
882 	 * Initialize oversize listhead
883 	 */
884 	oversize_list.next_bysize = &oversize_list;
885 	oversize_list.prev_bysize = &oversize_list;
886 	oversize_list.next_byaddr = &oversize_list;
887 	oversize_list.prev_byaddr = &oversize_list;
888 	oversize_list.addr = NULL;
889 	oversize_list.size = 0;		/* sentinal */
890 
891 	/*
892 	 * now install the global variables, leaving cpu_list for last, so that
893 	 * there aren't any race conditions.
894 	 */
895 	curcpu = new_curcpu;
896 	cpu_mask = new_cpu_mask;
897 	cpu_list = new_cpu_list;
898 
899 	(void) mutex_unlock(&init_lock);
900 
901 	return (1);
902 }
903 
904 static void
905 create_cache(cache_t *cp, size_t size, uint_t chunksize)
906 {
907 	long nblocks;
908 
909 	bzero(&cp->mt_cache_lock, sizeof (mutex_t));
910 	cp->mt_size = size;
911 	cp->mt_freelist = ((caddr_t)cp + sizeof (cache_t));
912 	cp->mt_span = chunksize * HUNKSIZE - sizeof (cache_t);
913 	cp->mt_hunks = chunksize;
914 	/*
915 	 * rough calculation. We will need to adjust later.
916 	 */
917 	nblocks = cp->mt_span / cp->mt_size;
918 	nblocks >>= 3;
919 	if (nblocks == 0) { /* less than 8 free blocks in this pool */
920 		int32_t numblocks = 0;
921 		long i = cp->mt_span;
922 		size_t sub = cp->mt_size;
923 		uchar_t mask = 0;
924 
925 		while (i > sub) {
926 			numblocks++;
927 			i -= sub;
928 		}
929 		nblocks = numblocks;
930 		cp->mt_arena = (caddr_t)ALIGN(cp->mt_freelist + 8, 8);
931 		cp->mt_nfree = numblocks;
932 		while (numblocks--) {
933 			mask |= 0x80 >> numblocks;
934 		}
935 		*(cp->mt_freelist) = mask;
936 	} else {
937 		cp->mt_arena = (caddr_t)ALIGN((caddr_t)cp->mt_freelist +
938 			nblocks, 32);
939 		/* recompute nblocks */
940 		nblocks = (uintptr_t)((caddr_t)cp->mt_freelist +
941 			cp->mt_span - cp->mt_arena) / cp->mt_size;
942 		cp->mt_nfree = ((nblocks >> 3) << 3);
943 		/* Set everything to free */
944 		(void) memset(cp->mt_freelist, 0xff, nblocks >> 3);
945 	}
946 
947 	if (debugopt & MTDEBUGPATTERN)
948 		copy_pattern(FREEPATTERN, cp->mt_arena, cp->mt_size * nblocks);
949 
950 	cp->mt_next = NULL;
951 }
952 
953 static void
954 reinit_cpu_list(void)
955 {
956 	oversize_t *wp = oversize_list.next_bysize;
957 	percpu_t *cpuptr;
958 	cache_t *thiscache;
959 	cache_head_t *cachehead;
960 
961 	if (wp == NULL || cpu_list == NULL) {
962 		reinit = 0;
963 		return;
964 	}
965 
966 	/* Reinitialize free oversize blocks. */
967 	(void) mutex_lock(&oversize_lock);
968 	if (debugopt & MTDEBUGPATTERN)
969 		for (; wp != &oversize_list; wp = wp->next_bysize)
970 			copy_pattern(FREEPATTERN, wp->addr, wp->size);
971 	(void) mutex_unlock(&oversize_lock);
972 
973 	/* Reinitialize free blocks. */
974 	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
975 		(void) mutex_lock(&cpuptr->mt_parent_lock);
976 		for (cachehead = &cpuptr->mt_caches[0]; cachehead <
977 			&cpuptr->mt_caches[NUM_CACHES]; cachehead++) {
978 			for (thiscache = cachehead->mt_cache; thiscache != NULL;
979 				thiscache = thiscache->mt_next) {
980 				(void) mutex_lock(&thiscache->mt_cache_lock);
981 				if (thiscache->mt_nfree == 0) {
982 					(void) mutex_unlock(
983 					    &thiscache->mt_cache_lock);
984 					continue;
985 				}
986 				if (thiscache != NULL)
987 					reinit_cache(thiscache);
988 				(void) mutex_unlock(&thiscache->mt_cache_lock);
989 			}
990 		}
991 		(void) mutex_unlock(&cpuptr->mt_parent_lock);
992 	}
993 	reinit = 0;
994 }
995 
996 static void
997 reinit_cache(cache_t *thiscache)
998 {
999 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
1000 	int32_t i, n;
1001 	caddr_t ret;
1002 
1003 	freeblocks = (uint32_t *)thiscache->mt_freelist;
1004 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
1005 		if (*freeblocks & 0xffffffff) {
1006 		    for (i = 0; i < 32; i++) {
1007 			if (FLIP_EM(*freeblocks) & (0x80000000 >> i)) {
1008 				n = (uintptr_t)(((freeblocks -
1009 				    (uint32_t *)thiscache->mt_freelist) << 5)
1010 				    + i) * thiscache->mt_size;
1011 				ret = thiscache->mt_arena + n;
1012 				ret += OVERHEAD;
1013 				copy_pattern(FREEPATTERN, ret,
1014 				    thiscache->mt_size);
1015 			}
1016 		    }
1017 		}
1018 		freeblocks++;
1019 	}
1020 }
1021 
1022 static void *
1023 malloc_internal(size_t size, percpu_t *cpuptr)
1024 {
1025 	cache_head_t *cachehead;
1026 	cache_t *thiscache, *hintcache;
1027 	int32_t i, n, logsz, bucket;
1028 	uint32_t index;
1029 	uint32_t *freeblocks; /* not a uintptr_t on purpose */
1030 	caddr_t ret;
1031 
1032 	logsz = MIN_CACHED_SHIFT;
1033 
1034 	while (size > (1 << logsz))
1035 		logsz++;
1036 
1037 	bucket = logsz - MIN_CACHED_SHIFT;
1038 
1039 	(void) mutex_lock(&cpuptr->mt_parent_lock);
1040 
1041 	/*
1042 	 * Find a cache of the appropriate size with free buffers.
1043 	 *
1044 	 * We don't need to lock each cache as we check their mt_nfree count,
1045 	 * since:
1046 	 *	1.  We are only looking for caches with mt_nfree > 0.  If a
1047 	 *	   free happens during our search, it will increment mt_nfree,
1048 	 *	   which will not effect the test.
1049 	 *	2.  Allocations can decrement mt_nfree, but they can't happen
1050 	 *	   as long as we hold mt_parent_lock.
1051 	 */
1052 
1053 	cachehead = &cpuptr->mt_caches[bucket];
1054 
1055 	/* Search through the list, starting at the mt_hint */
1056 	thiscache = cachehead->mt_hint;
1057 
1058 	while (thiscache != NULL && thiscache->mt_nfree == 0)
1059 		thiscache = thiscache->mt_next;
1060 
1061 	if (thiscache == NULL) {
1062 		/* wrap around -- search up to the hint */
1063 		thiscache = cachehead->mt_cache;
1064 		hintcache = cachehead->mt_hint;
1065 
1066 		while (thiscache != NULL && thiscache != hintcache &&
1067 		    thiscache->mt_nfree == 0)
1068 			thiscache = thiscache->mt_next;
1069 
1070 		if (thiscache == hintcache)
1071 			thiscache = NULL;
1072 	}
1073 
1074 
1075 	if (thiscache == NULL) { /* there are no free caches */
1076 		int32_t thisrequest = requestsize;
1077 		int32_t buffer_size = (1 << logsz) + OVERHEAD;
1078 
1079 		thiscache = (cache_t *)morecore(thisrequest * HUNKSIZE);
1080 
1081 		if (thiscache == (cache_t *)-1) {
1082 		    (void) mutex_unlock(&cpuptr->mt_parent_lock);
1083 		    errno = EAGAIN;
1084 		    return (NULL);
1085 		}
1086 		create_cache(thiscache, buffer_size, thisrequest);
1087 
1088 		/* link in the new block at the beginning of the list */
1089 		thiscache->mt_next = cachehead->mt_cache;
1090 		cachehead->mt_cache = thiscache;
1091 	}
1092 
1093 	/* update the hint to the cache we found or created */
1094 	cachehead->mt_hint = thiscache;
1095 
1096 	/* thiscache now points to a cache with available space */
1097 	(void) mutex_lock(&thiscache->mt_cache_lock);
1098 
1099 	freeblocks = (uint32_t *)thiscache->mt_freelist;
1100 	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
1101 		if (*freeblocks & 0xffffffff)
1102 			break;
1103 		freeblocks++;
1104 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1105 		    *freeblocks & 0xffffffff)
1106 			break;
1107 		freeblocks++;
1108 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1109 		    *freeblocks & 0xffffffff)
1110 			break;
1111 		freeblocks++;
1112 		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1113 		    *freeblocks & 0xffffffff)
1114 			break;
1115 		freeblocks++;
1116 	}
1117 
1118 	/*
1119 	 * the offset from mt_freelist to freeblocks is the offset into
1120 	 * the arena. Be sure to include the offset into freeblocks
1121 	 * of the bitmask. n is the offset.
1122 	 */
1123 	for (i = 0; i < 32; ) {
1124 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1125 			break;
1126 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1127 			break;
1128 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1129 			break;
1130 		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1131 			break;
1132 	}
1133 	index = 0x80000000 >> --i;
1134 
1135 
1136 	*freeblocks &= FLIP_EM(~index);
1137 
1138 	thiscache->mt_nfree--;
1139 
1140 	(void) mutex_unlock(&thiscache->mt_cache_lock);
1141 	(void) mutex_unlock(&cpuptr->mt_parent_lock);
1142 
1143 	n = (uintptr_t)(((freeblocks - (uint32_t *)thiscache->mt_freelist) << 5)
1144 		+ i) * thiscache->mt_size;
1145 	/*
1146 	 * Now you have the offset in n, you've changed the free mask
1147 	 * in the freelist. Nothing left to do but find the block
1148 	 * in the arena and put the value of thiscache in the word
1149 	 * ahead of the handed out address and return the memory
1150 	 * back to the user.
1151 	 */
1152 	ret = thiscache->mt_arena + n;
1153 
1154 	/* Store the cache addr for this buf. Makes free go fast. */
1155 	*(uintptr_t *)ret = (uintptr_t)thiscache;
1156 
1157 	/*
1158 	 * This assert makes sure we don't hand out memory that is not
1159 	 * owned by this cache.
1160 	 */
1161 	assert(ret + thiscache->mt_size <= thiscache->mt_freelist +
1162 		thiscache->mt_span);
1163 
1164 	ret += OVERHEAD;
1165 
1166 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1167 
1168 	if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1169 		if (verify_pattern(FREEPATTERN, ret, size))
1170 			abort();	/* reference after free */
1171 
1172 	if (debugopt & MTINITBUFFER)
1173 		copy_pattern(INITPATTERN, ret, size);
1174 	return ((void *)ret);
1175 }
1176 
1177 static void *
1178 morecore(size_t bytes)
1179 {
1180 	void * ret;
1181 
1182 	if (bytes > LONG_MAX) {
1183 		intptr_t wad;
1184 		/*
1185 		 * The request size is too big. We need to do this in
1186 		 * chunks. Sbrk only takes an int for an arg.
1187 		 */
1188 		if (bytes == ULONG_MAX)
1189 			return ((void *)-1);
1190 
1191 		ret = sbrk(0);
1192 		wad = LONG_MAX;
1193 		while (wad > 0) {
1194 			if (sbrk(wad) == (void *)-1) {
1195 				if (ret != sbrk(0))
1196 					(void) sbrk(-LONG_MAX);
1197 				return ((void *)-1);
1198 			}
1199 			bytes -= LONG_MAX;
1200 			wad = bytes;
1201 		}
1202 	} else
1203 		ret = sbrk(bytes);
1204 
1205 	return (ret);
1206 }
1207 
1208 
1209 static void *
1210 oversize(size_t size)
1211 {
1212 	caddr_t ret;
1213 	oversize_t *big;
1214 	int bucket;
1215 
1216 	/* make sure we will not overflow */
1217 	if (size > MAX_MTMALLOC) {
1218 		errno = ENOMEM;
1219 		return (NULL);
1220 	}
1221 
1222 	/*
1223 	 * Since we ensure every address we hand back is
1224 	 * MTMALLOC_MIN_ALIGN-byte aligned, ALIGNing size ensures that the
1225 	 * memory handed out is MTMALLOC_MIN_ALIGN-byte aligned at both ends.
1226 	 * This eases the implementation of MTDEBUGPATTERN and MTINITPATTERN,
1227 	 * particularly where coalescing occurs.
1228 	 */
1229 	size = ALIGN(size, MTMALLOC_MIN_ALIGN);
1230 
1231 	/*
1232 	 * The idea with the global lock is that we are sure to
1233 	 * block in the kernel anyway since given an oversize alloc
1234 	 * we are sure to have to call morecore();
1235 	 */
1236 	(void) mutex_lock(&oversize_lock);
1237 
1238 	if ((big = find_oversize(size)) != NULL) {
1239 		if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1240 			if (verify_pattern(FREEPATTERN, big->addr, size))
1241 				abort();	/* reference after free */
1242 	} else {
1243 		/* Get more 8-byte aligned memory from heap */
1244 		ret = morecore(size + OVSZ_HEADER_SIZE);
1245 		if (ret == (caddr_t)-1) {
1246 			(void) mutex_unlock(&oversize_lock);
1247 			errno = ENOMEM;
1248 			return (NULL);
1249 		}
1250 		big = oversize_header_alloc((uintptr_t)ret, size);
1251 	}
1252 	ret = big->addr;
1253 
1254 	/* Add big to the hash table at the head of the relevant bucket. */
1255 	bucket = HASH_OVERSIZE(ret);
1256 	big->hash_next = ovsz_hashtab[bucket];
1257 	ovsz_hashtab[bucket] = big;
1258 
1259 	if (debugopt & MTINITBUFFER)
1260 		copy_pattern(INITPATTERN, ret, size);
1261 
1262 	(void) mutex_unlock(&oversize_lock);
1263 	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1264 	return ((void *)ret);
1265 }
1266 
1267 static void
1268 insert_oversize(oversize_t *op, oversize_t *nx)
1269 {
1270 	oversize_t *sp;
1271 
1272 	/* locate correct insertion point in size-ordered list */
1273 	for (sp = oversize_list.next_bysize;
1274 	    sp != &oversize_list && (op->size > sp->size);
1275 	    sp = sp->next_bysize)
1276 		;
1277 
1278 	/* link into size-ordered list */
1279 	op->next_bysize = sp;
1280 	op->prev_bysize = sp->prev_bysize;
1281 	op->prev_bysize->next_bysize = op;
1282 	op->next_bysize->prev_bysize = op;
1283 
1284 	/*
1285 	 * link item into address-ordered list
1286 	 * (caller provides insertion point as an optimization)
1287 	 */
1288 	op->next_byaddr = nx;
1289 	op->prev_byaddr = nx->prev_byaddr;
1290 	op->prev_byaddr->next_byaddr = op;
1291 	op->next_byaddr->prev_byaddr = op;
1292 
1293 }
1294 
1295 static void
1296 unlink_oversize(oversize_t *lp)
1297 {
1298 	/* unlink from address list */
1299 	lp->prev_byaddr->next_byaddr = lp->next_byaddr;
1300 	lp->next_byaddr->prev_byaddr = lp->prev_byaddr;
1301 
1302 	/* unlink from size list */
1303 	lp->prev_bysize->next_bysize = lp->next_bysize;
1304 	lp->next_bysize->prev_bysize = lp->prev_bysize;
1305 }
1306 
1307 static void
1308 position_oversize_by_size(oversize_t *op)
1309 {
1310 	oversize_t *sp;
1311 
1312 	if (op->size > op->next_bysize->size ||
1313 	    op->size < op->prev_bysize->size) {
1314 
1315 		/* unlink from size list */
1316 		op->prev_bysize->next_bysize = op->next_bysize;
1317 		op->next_bysize->prev_bysize = op->prev_bysize;
1318 
1319 		/* locate correct insertion point in size-ordered list */
1320 		for (sp = oversize_list.next_bysize;
1321 		    sp != &oversize_list && (op->size > sp->size);
1322 		    sp = sp->next_bysize)
1323 			;
1324 
1325 		/* link into size-ordered list */
1326 		op->next_bysize = sp;
1327 		op->prev_bysize = sp->prev_bysize;
1328 		op->prev_bysize->next_bysize = op;
1329 		op->next_bysize->prev_bysize = op;
1330 	}
1331 }
1332 
1333 static void
1334 add_oversize(oversize_t *lp)
1335 {
1336 	int merge_flags = INSERT_ONLY;
1337 	oversize_t *nx;  	/* ptr to item right of insertion point */
1338 	oversize_t *pv;  	/* ptr to item left of insertion point */
1339 	uint_t size_lp, size_pv, size_nx;
1340 	uintptr_t endp_lp, endp_pv, endp_nx;
1341 
1342 	/*
1343 	 * Locate insertion point in address-ordered list
1344 	 */
1345 
1346 	for (nx = oversize_list.next_byaddr;
1347 	    nx != &oversize_list && (lp->addr > nx->addr);
1348 	    nx = nx->next_byaddr)
1349 		;
1350 
1351 	/*
1352 	 * Determine how to add chunk to oversize freelist
1353 	 */
1354 
1355 	size_lp = OVSZ_HEADER_SIZE + lp->size;
1356 	endp_lp = ALIGN((uintptr_t)lp + size_lp, MTMALLOC_MIN_ALIGN);
1357 	size_lp = endp_lp - (uintptr_t)lp;
1358 
1359 	pv = nx->prev_byaddr;
1360 
1361 	if (pv->size) {
1362 
1363 		size_pv = OVSZ_HEADER_SIZE + pv->size;
1364 		endp_pv = ALIGN((uintptr_t)pv + size_pv,
1365 		    MTMALLOC_MIN_ALIGN);
1366 		size_pv = endp_pv - (uintptr_t)pv;
1367 
1368 		/* Check for adjacency with left chunk */
1369 		if ((uintptr_t)lp == endp_pv)
1370 			merge_flags |= COALESCE_LEFT;
1371 	}
1372 
1373 	if (nx->size) {
1374 
1375 	    /* Check for adjacency with right chunk */
1376 	    if ((uintptr_t)nx == endp_lp) {
1377 		size_nx = OVSZ_HEADER_SIZE + nx->size;
1378 		endp_nx = ALIGN((uintptr_t)nx + size_nx,
1379 		    MTMALLOC_MIN_ALIGN);
1380 		size_nx = endp_nx - (uintptr_t)nx;
1381 		merge_flags |= COALESCE_RIGHT;
1382 	    }
1383 	}
1384 
1385 	/*
1386 	 * If MTDEBUGPATTERN==1, lp->addr will have been overwritten with
1387 	 * FREEPATTERN for lp->size bytes. If we can merge, the oversize
1388 	 * header(s) that will also become part of the memory available for
1389 	 * reallocation (ie lp and/or nx) must also be overwritten with
1390 	 * FREEPATTERN or we will SIGABRT when this memory is next reallocated.
1391 	 */
1392 	switch (merge_flags) {
1393 
1394 	case INSERT_ONLY:		/* Coalescing not possible */
1395 		insert_oversize(lp, nx);
1396 		break;
1397 	case COALESCE_LEFT:
1398 		pv->size += size_lp;
1399 		position_oversize_by_size(pv);
1400 		if (debugopt & MTDEBUGPATTERN)
1401 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1402 		break;
1403 	case COALESCE_RIGHT:
1404 		unlink_oversize(nx);
1405 		lp->size += size_nx;
1406 		insert_oversize(lp, pv->next_byaddr);
1407 		if (debugopt & MTDEBUGPATTERN)
1408 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1409 		break;
1410 	case COALESCE_WITH_BOTH_SIDES:	/* Merge (with right) to the left */
1411 		pv->size += size_lp + size_nx;
1412 		unlink_oversize(nx);
1413 		position_oversize_by_size(pv);
1414 		if (debugopt & MTDEBUGPATTERN) {
1415 			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1416 			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1417 		}
1418 		break;
1419 	}
1420 }
1421 
1422 /*
1423  * Find memory on our list that is at least size big. If we find a block that is
1424  * big enough, we break it up and return the associated oversize_t struct back
1425  * to the calling client. Any leftover piece of that block is returned to the
1426  * freelist.
1427  */
1428 static oversize_t *
1429 find_oversize(size_t size)
1430 {
1431 	oversize_t *wp = oversize_list.next_bysize;
1432 	while (wp != &oversize_list && size > wp->size)
1433 		wp = wp->next_bysize;
1434 
1435 	if (wp == &oversize_list) /* empty list or nothing big enough */
1436 		return (NULL);
1437 	/* breaking up a chunk of memory */
1438 	if ((long)((wp->size - (size + OVSZ_HEADER_SIZE + MTMALLOC_MIN_ALIGN)))
1439 	    > MAX_CACHED) {
1440 		caddr_t off;
1441 		oversize_t *np;
1442 		size_t osize;
1443 		off = (caddr_t)ALIGN(wp->addr + size,
1444 		    MTMALLOC_MIN_ALIGN);
1445 		osize = wp->size;
1446 		wp->size = (size_t)(off - wp->addr);
1447 		np = oversize_header_alloc((uintptr_t)off,
1448 		    osize - (wp->size + OVSZ_HEADER_SIZE));
1449 		if ((long)np->size < 0)
1450 			abort();
1451 		unlink_oversize(wp);
1452 		add_oversize(np);
1453 	} else {
1454 		unlink_oversize(wp);
1455 	}
1456 	return (wp);
1457 }
1458 
1459 static void
1460 copy_pattern(uint32_t pattern, void *buf_arg, size_t size)
1461 {
1462 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1463 	uint32_t *buf = buf_arg;
1464 
1465 	while (buf < bufend - 3) {
1466 		buf[3] = buf[2] = buf[1] = buf[0] = pattern;
1467 		buf += 4;
1468 	}
1469 	while (buf < bufend)
1470 		*buf++ = pattern;
1471 }
1472 
1473 static void *
1474 verify_pattern(uint32_t pattern, void *buf_arg, size_t size)
1475 {
1476 	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1477 	uint32_t *buf;
1478 
1479 	for (buf = buf_arg; buf < bufend; buf++)
1480 		if (*buf != pattern)
1481 			return (buf);
1482 	return (NULL);
1483 }
1484 
1485 static void
1486 free_oversize(oversize_t *ovp)
1487 {
1488 	assert(((uintptr_t)ovp->addr & 7) == 0); /* are we 8 byte aligned */
1489 	assert(ovp->size > MAX_CACHED);
1490 
1491 	ovp->next_bysize = ovp->prev_bysize = NULL;
1492 	ovp->next_byaddr = ovp->prev_byaddr = NULL;
1493 	(void) mutex_lock(&oversize_lock);
1494 	add_oversize(ovp);
1495 	(void) mutex_unlock(&oversize_lock);
1496 }
1497 
1498 static oversize_t *
1499 oversize_header_alloc(uintptr_t mem, size_t size)
1500 {
1501 	oversize_t *ovsz_hdr;
1502 
1503 	assert(size > MAX_CACHED);
1504 
1505 	ovsz_hdr = (oversize_t *)mem;
1506 	ovsz_hdr->prev_bysize = NULL;
1507 	ovsz_hdr->next_bysize = NULL;
1508 	ovsz_hdr->prev_byaddr = NULL;
1509 	ovsz_hdr->next_byaddr = NULL;
1510 	ovsz_hdr->hash_next = NULL;
1511 	ovsz_hdr->size = size;
1512 	mem += OVSZ_SIZE;
1513 	*(uintptr_t *)mem = MTMALLOC_OVERSIZE_MAGIC;
1514 	mem += OVERHEAD;
1515 	assert(((uintptr_t)mem & 7) == 0); /* are we 8 byte aligned */
1516 	ovsz_hdr->addr = (caddr_t)mem;
1517 	return (ovsz_hdr);
1518 }
1519