xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 1a220b56b93ff1dc80855691548503117af4cc10)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * This file contains common functions to access and manage the page lists.
38  * Many of these routines originated from platform dependent modules
39  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
40  * a platform independent manner.
41  *
42  * vm/vm_dep.h provides for platform specific support.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/atomic.h>
50 #include <sys/sysmacros.h>
51 #include <vm/as.h>
52 #include <vm/page.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/seg_vn.h>
55 #include <sys/memnode.h>
56 #include <vm/vm_dep.h>
57 #include <sys/lgrp.h>
58 #include <sys/mem_config.h>
59 #include <sys/callb.h>
60 #include <sys/mem_cage.h>
61 #include <sys/sdt.h>
62 
63 extern uint_t	vac_colors;
64 
65 #define	MAX_PRAGMA_ALIGN	128
66 
67 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
68 
69 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
70 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
71 #else
72 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
73 #endif
74 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
75 
76 /*
77  * number of page colors equivalent to requested color in page_get routines.
78  * If set, keeps large pages intact longer and keeps MPO allocations on
79  * the local mnode rather than acquiring the 'correct' page color from
80  * a demoted large page or from a remote mnode.
81  */
82 int	colorequiv;
83 
84 /*
85  * if set, specifies the percentage of pages within a large page region that
86  * must be free before attempting to lock those pages for
87  * page_get_contig_pages processing.
88  *
89  * Should be turned on when kpr is available, since page_trylock_contig_pages
90  * can then be more selective.
91  */
92 
93 int	ptcpthreshold;
94 
95 /*
96  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
97  * Limit page get contig page search based on failure counts in pgcpfailcnt[].
98  *
99  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
100  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
101  * bound. This upper bound range guarantees:
102  *    - all large page 'slots' will be searched over time
103  *    - the minimum (1) large page candidates considered on each pgcp call
104  *    - at least one large page candidate is considered on each pgcp call
105  */
106 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
107 int	pgcplimitsearch = 1;
108 
109 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
110 #define	SETPGCPFAILCNT(szc)						\
111 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
112 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
113 
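/*
 * Illustrative sketch of the bound above (the numbers are hypothetical and
 * not from any particular platform): with physinstalled at 3145728 base
 * pages, highbit() returns 22, so PGCPFAILMAX is 1 << 21 == 2097152, which
 * is at least half of installed memory.  A failing contiguous-page search
 * would then do:
 *
 *	SETPGCPFAILCNT(szc);
 *
 * which expands to roughly:
 *
 *	if (++pgcpfailcnt[szc] >= 2097152)
 *		pgcpfailcnt[szc] = 1048576;
 *
 * so the per-szc failure count saturates between PGCPFAILMAX / 2 and
 * PGCPFAILMAX instead of wrapping to 0, preserving the guarantees listed
 * above when pgcplimitsearch bounds the search.
 */
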
114 #ifdef VM_STATS
115 struct vmm_vmstats_str  vmm_vmstats;
116 
117 #endif /* VM_STATS */
118 
119 #if defined(__sparc)
120 #define	LPGCREATE	0
121 #else
122 /* enable page_get_contig_pages */
123 #define	LPGCREATE	1
124 #endif
125 
126 int pg_contig_disable;
127 int pg_lpgcreate_nocage = LPGCREATE;
128 
129 /*
130  * page_freelist_fill pfn flag to signify no hi pfn requirement.
131  */
132 #define	PFNNULL		0
133 
134 /* Flags involved in promotion and demotion routines */
135 #define	PC_FREE		0x1	/* put page on freelist */
136 #define	PC_ALLOC	0x2	/* return page for allocation */
137 
138 /*
139  * Flag for page_demote to be used with PC_FREE to denote that we don't care
140  * what the color is as the color parameter to the function is ignored.
141  */
142 #define	PC_NO_COLOR	(-1)
143 
144 /*
145  * page counters candidates info
146  * See page_ctrs_cands comment below for more details.
147  * fields are as follows:
148  *	pcc_pages_free:		# pages which freelist coalesce can create
149  *	pcc_color_free_len:	number of elements in pcc_color_free array
150  *	pcc_color_free:		pointer to page free counts per color
151  */
152 typedef struct pcc_info {
153 	pgcnt_t	pcc_pages_free;
154 	int	pcc_color_free_len;
155 	pgcnt_t	*pcc_color_free;
156 } pcc_info_t;
157 
158 /*
159  * On big machines it can take a long time to check page_counters
160  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
161  * updated sum of all elements of the corresponding page_counters arrays.
162  * page_freelist_coalesce() searches page_counters only if an appropriate
163  * element of page_ctrs_cands array is greater than 0.
164  *
165  * An extra dimension is used for page_ctrs_cands to spread the elements
166  * over a few e$ cache lines to avoid serialization during the array
167  * updates.
168  */
169 #pragma	align 64(page_ctrs_cands)
170 
171 static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
172 
173 /*
174  * Return in val the total number of free pages which can be created
175  * for the given mnode (m) and region size (r)
176  */
177 #define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
178 	int i;								\
179 	val = 0;							\
180 	for (i = 0; i < NPC_MUTEX; i++) {				\
181 	    val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;		\
182 	}								\
183 }
184 
185 /*
186  * Return in val the total number of free pages which can be created
187  * for the given mnode (m), region size (r), and color (c)
188  */
189 #define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
190 	int i;								\
191 	val = 0;							\
192 	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
193 	for (i = 0; i < NPC_MUTEX; i++) {				\
194 	    val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)];	\
195 	}								\
196 }
197 
198 /*
199  * We can only allow a single thread to update a counter within the physical
200  * range of the largest supported page size. That is the finest granularity
201  * possible since the counter values are dependent on each other
202  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
203  * ctr_mutex lock index for a particular physical range.
204  */
205 static kmutex_t	*ctr_mutex[NPC_MUTEX];
206 
207 #define	PP_CTR_LOCK_INDX(pp)						\
208 	(((pp)->p_pagenum >>					\
209 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
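
/*
 * Worked example of the lock hashing above (the values are hypothetical,
 * only to show the arithmetic): suppose the largest page size spans 512
 * base pages, so PAGE_BSZS_SHIFT(mmu_page_sizes - 1) is 9, and suppose
 * NPC_MUTEX is 8.  Then for a page with p_pagenum 0x12345:
 *
 *	(0x12345 >> 9) & (8 - 1) == 0x91 & 0x7 == 1
 *
 * Every page within the same largest-page-sized physical range hashes to
 * the same ctr_mutex entry, while neighboring ranges spread across the
 * NPC_MUTEX locks.
 */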
210 
211 /*
212  * Local functions prototypes.
213  * Local function prototypes.
214 
215 void page_ctr_add(int, int, page_t *, int);
216 void page_ctr_add_internal(int, int, page_t *, int);
217 void page_ctr_sub(int, int, page_t *, int);
218 uint_t  page_convert_color(uchar_t, uchar_t, uint_t);
219 void page_freelist_lock(int);
220 void page_freelist_unlock(int);
221 page_t *page_promote(int, pfn_t, uchar_t, int);
222 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
223 page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
224 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
225 static int page_trylock_cons(page_t *pp, se_t se);
226 
227 #define	PNUM_SIZE(szc)							\
228 	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
229 #define	PNUM_SHIFT(szc)							\
230 	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
231 
232 /*
233  * The page_counters array below is used to keep track of free contiguous
234  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
235  * This contains an array of counters, the size of the array, a shift value
236  * used to convert a pagenum into a counter array index or vice versa, as
237  * well as a cache of the last successful index to be promoted to a larger
238  * page size.  As an optimization, we keep track of the last successful index
239  * to be promoted per page color for the given size region, and this is
240  * allocated dynamically based upon the number of colors for a given
241  * region size.
242  *
243  * Conceptually, the page counters are represented as:
244  *
245  *	page_counters[region_size][mnode]
246  *
247  *	region_size:	size code of a candidate larger page made up
248  *			of contiguous free smaller pages.
249  *
250  *	page_counters[region_size][mnode].hpm_counters[index]:
251  *		represents how many (region_size - 1) pages either
252  *		exist or can be created within the given index range.
253  *
254  * Let's look at a sparc example:
255  *	If we want to create a free 512k page, we look at region_size 2
256  *	for the mnode we want.  We calculate the index and look at a specific
257  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
258  *	this location, it means that 8 64k pages either exist or can be created
259  *	from 8K pages in order to make a single free 512k page at the given
260  *	index.  Note that when a region is full, it will contribute to the
261  *	counts in the region above it.  Thus we will not know what page
262  *	size the free pages will be which can be promoted to this new free
263  *	page unless we look at all regions below the current region.
264  */
265 
266 /*
267  * Note: hpmctr_t is defined in platform vm_dep.h
268  * hw_page_map_t contains all the information needed for the page_counters
269  * logic. The fields are as follows:
270  *
271  *	hpm_counters:	dynamically allocated array to hold counter data
272  *	hpm_entries:	entries in hpm_counters
273  *	hpm_shift:	shift for pnum/array index conv
274  *	hpm_base:	PFN mapped to counter index 0
275  *	hpm_color_current_len:	# of elements in hpm_color_current "array" below
276  *	hpm_color_current:	last index in counter array for this color at
277  *				which we successfully created a large page
278  */
279 typedef struct hw_page_map {
280 	hpmctr_t	*hpm_counters;
281 	size_t		hpm_entries;
282 	int		hpm_shift;
283 	pfn_t		hpm_base;
284 	size_t		hpm_color_current_len;
285 	size_t 		*hpm_color_current;
286 } hw_page_map_t;
287 
288 /*
289  * Element zero is not used, but is allocated for convenience.
290  */
291 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
292 
293 /*
294  * The following macros are convenient ways to get access to the individual
295  * elements of the page_counters arrays.  They can be used on both
296  * the left side and right side of equations.
297  */
298 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
299 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
300 
301 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
302 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
303 
304 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
305 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
306 
307 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
308 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
309 
310 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
311 	(page_counters[(rg_szc)][(mnode)].hpm_base)
312 
313 #define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
314 	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)
315 
316 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
317 	(page_counters[(rg_szc)][(mnode)].hpm_color_current)
318 
319 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
320 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])
321 
322 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
323 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
324 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
325 
326 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
327 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
328 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
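
/*
 * Round-trip sketch for the two macros above (hypothetical values): if
 * PAGE_COUNTERS_BASE(mnode, rg_szc) is 0x80000 and
 * PAGE_COUNTERS_SHIFT(mnode, rg_szc) is 3 (each counter covers 8 base
 * pages), then
 *
 *	PNUM_TO_IDX(mnode, rg_szc, 0x80018) == 0x18 >> 3 == 3
 *	IDX_TO_PNUM(mnode, rg_szc, 3) == 0x80000 + (3 << 3) == 0x80018
 *
 * which is the identity that page_ctrs_alloc() and page_ctrs_adjust()
 * ASSERT below.
 */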
329 
330 /*
331  * Protects the hpm_counters and hpm_color_current memory from changing while
332  * looking at page counters information.
333  * Grab the write lock to modify what these fields point at.
334  * Grab the read lock to prevent any pointers from changing.
335  * The write lock cannot be held during memory allocation because the
336  * allocation may recursively try to grab the read lock while the
337  * write lock is already held, which would deadlock.
338  */
339 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
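
/*
 * A minimal sketch of the read-side protocol described above (as used by
 * page_freelist_coalesce() later in this file); the write side, e.g.
 * page_ctrs_adjust(), preallocates its memory first and only then takes
 * the lock as RW_WRITER:
 *
 *	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
 *	(walk PAGE_COUNTERS(mnode, r, idx) safely here)
 *	rw_exit(&page_ctrs_rwlock[mnode]);
 */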
340 
341 
342 /*
343  * initialize cpu_vm_data to point at a cache-aligned vm_cpu_data_t.
344  */
345 void
346 cpu_vm_data_init(struct cpu *cp)
347 {
348 	if (cp == CPU0) {
349 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
350 	} else {
351 		void	*kmptr;
352 		int	align;
353 		size_t	sz;
354 
355 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
356 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
357 		kmptr = kmem_zalloc(sz, KM_SLEEP);
358 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
359 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
360 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
361 	}
362 }
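
/*
 * Alignment sketch for the non-boot-cpu path above (the sizes are
 * hypothetical): with sizeof (vm_cpu_data_t) == 200 and align == 64, sz is
 * P2ROUNDUP(200, 64) + 64 == 256 + 64 == 320.  If kmem_zalloc() returns
 * 0x30000028, cpu_vm_data becomes P2ROUNDUP(0x30000028, 64) == 0x30000040,
 * which still leaves the full rounded-up structure inside the allocation;
 * vc_kmptr and vc_kmsize remember the raw pointer and size so that
 * cpu_vm_data_destroy() can free the allocation later.
 */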
363 
364 /*
365  * free cpu_vm_data
366  */
367 void
368 cpu_vm_data_destroy(struct cpu *cp)
369 {
370 	if (cp->cpu_seqid && cp->cpu_vm_data) {
371 		ASSERT(cp != CPU0);
372 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
373 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
374 	}
375 	cp->cpu_vm_data = NULL;
376 }
377 
378 
379 /*
380  * page size to page size code
381  */
382 int
383 page_szc(size_t pagesize)
384 {
385 	int	i = 0;
386 
387 	while (hw_page_array[i].hp_size) {
388 		if (pagesize == hw_page_array[i].hp_size)
389 			return (i);
390 		i++;
391 	}
392 	return (-1);
393 }
394 
395 /*
396  * page size to page size code with the restriction that it be a supported
397  * user page size.  If it's not a supported user page size, -1 will be returned.
398  */
399 int
400 page_szc_user_filtered(size_t pagesize)
401 {
402 	int szc = page_szc(pagesize);
403 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
404 		return (szc);
405 	}
406 	return (-1);
407 }
408 
409 /*
410  * Return how many page sizes are available for the user to use.  This is
411  * what the hardware supports and not based upon how the OS implements the
412  * support of different page sizes.
413  */
414 uint_t
415 page_num_user_pagesizes(void)
416 {
417 	return (mmu_exported_page_sizes);
418 }
419 
420 uint_t
421 page_num_pagesizes(void)
422 {
423 	return (mmu_page_sizes);
424 }
425 
426 /*
427  * returns the count of the number of base pagesize pages associated with szc
428  * returns the number of base pagesize pages associated with szc
429 pgcnt_t
430 page_get_pagecnt(uint_t szc)
431 {
432 	if (szc >= mmu_page_sizes)
433 		panic("page_get_pagecnt: out of range %d", szc);
434 	return (hw_page_array[szc].hp_pgcnt);
435 }
436 
437 size_t
438 page_get_pagesize(uint_t szc)
439 {
440 	if (szc >= mmu_page_sizes)
441 		panic("page_get_pagesize: out of range %d", szc);
442 	return (hw_page_array[szc].hp_size);
443 }
444 
445 /*
446  * Return the size of a page based upon the index passed in.  An index of
447  * zero refers to the smallest page size in the system, and as index increases
448  * it refers to the next larger supported page size in the system.
449  * Note that szc and userszc may not be the same due to unsupported szc's on
450  * some systems.
451  */
452 size_t
453 page_get_user_pagesize(uint_t userszc)
454 {
455 	uint_t szc = USERSZC_2_SZC(userszc);
456 
457 	if (szc >= mmu_page_sizes)
458 		panic("page_get_user_pagesize: out of range %d", szc);
459 	return (hw_page_array[szc].hp_size);
460 }
461 
462 uint_t
463 page_get_shift(uint_t szc)
464 {
465 	if (szc >= mmu_page_sizes)
466 		panic("page_get_shift: out of range %d", szc);
467 	return (hw_page_array[szc].hp_shift);
468 }
469 
470 uint_t
471 page_get_pagecolors(uint_t szc)
472 {
473 	ASSERT(page_colors != 0);
474 	return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
475 }
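
/*
 * Worked example (hypothetical values): with page_colors == 64 and
 * PAGE_BSZS_SHIFT(1) == 3, page_get_pagecolors(1) returns
 * MAX(64 >> 3, 1) == 8; for a size code whose shift exceeds 6 the MAX()
 * clamp keeps the result at 1.
 */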
476 
477 /*
478  * Called by startup().
479  * Size up the per page size free list counters based on physmax
480  * of each node and max_mem_nodes.
481  */
482 size_t
483 page_ctrs_sz(void)
484 {
485 	int	r;		/* region size */
486 	int	mnode;
487 	uint_t	ctrs_sz = 0;
488 	int 	i;
489 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
490 
491 	/*
492 	 * We need to determine how many page colors there are for each
493 	 * page size in order to allocate memory for any color specific
494 	 * arrays.
495 	 */
496 	colors_per_szc[0] = page_colors;
497 	for (i = 1; i < mmu_page_sizes; i++) {
498 		colors_per_szc[i] =
499 		    page_convert_color(0, i, page_colors - 1) + 1;
500 	}
501 
502 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
503 
504 		pgcnt_t r_pgcnt;
505 		pfn_t   r_base;
506 		pgcnt_t r_align;
507 
508 		if (mem_node_config[mnode].exists == 0)
509 			continue;
510 
511 		/*
512 		 * determine size needed for page counter arrays with
513 		 * base aligned to large page size.
514 		 */
515 		for (r = 1; r < mmu_page_sizes; r++) {
516 			/* add in space for hpm_counters */
517 			r_align = page_get_pagecnt(r);
518 			r_base = mem_node_config[mnode].physbase;
519 			r_base &= ~(r_align - 1);
520 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
521 			    r_base + 1, r_align);
522 			/*
523 			 * Round up to always allocate on pointer sized
524 			 * boundaries.
525 			 */
526 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
527 			    sizeof (hpmctr_t *));
528 
529 			/* add in space for hpm_color_current */
530 			ctrs_sz += (colors_per_szc[r] *
531 			    sizeof (size_t));
532 		}
533 	}
534 
535 	for (r = 1; r < mmu_page_sizes; r++) {
536 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
537 
538 		/* add in space for page_ctrs_cands */
539 		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
540 		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
541 		    sizeof (pgcnt_t);
542 	}
543 
544 	/* ctr_mutex */
545 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
546 
547 	/* size for page list counts */
548 	PLCNT_SZ(ctrs_sz);
549 
550 	/*
551 	 * add some slop for roundups. page_ctrs_alloc will round up the start
552 	 * address of the counters to an ecache_alignsize boundary for every
553 	 * memory node.
554 	 */
555 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
556 }
557 
558 caddr_t
559 page_ctrs_alloc(caddr_t alloc_base)
560 {
561 	int	mnode;
562 	int	r;		/* region size */
563 	int	i;
564 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
565 
566 	/*
567 	 * We need to determine how many page colors there are for each
568 	 * page size in order to allocate memory for any color specific
569 	 * arrays.
570 	 */
571 	colors_per_szc[0] = page_colors;
572 	for (i = 1; i < mmu_page_sizes; i++) {
573 		colors_per_szc[i] =
574 		    page_convert_color(0, i, page_colors - 1) + 1;
575 	}
576 
577 	for (r = 1; r < mmu_page_sizes; r++) {
578 		page_counters[r] = (hw_page_map_t *)alloc_base;
579 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
580 	}
581 
582 	/* page_ctrs_cands */
583 	for (r = 1; r < mmu_page_sizes; r++) {
584 		for (i = 0; i < NPC_MUTEX; i++) {
585 			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
586 			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));
587 
588 		}
589 	}
590 
591 	/* page_ctrs_cands pcc_color_free array */
592 	for (r = 1; r < mmu_page_sizes; r++) {
593 		for (i = 0; i < NPC_MUTEX; i++) {
594 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
595 				page_ctrs_cands[i][r][mnode].pcc_color_free_len
596 				    = colors_per_szc[r];
597 				page_ctrs_cands[i][r][mnode].pcc_color_free =
598 				    (pgcnt_t *)alloc_base;
599 				alloc_base += colors_per_szc[r] *
600 				    sizeof (pgcnt_t);
601 			}
602 		}
603 	}
604 
605 	/* ctr_mutex */
606 	for (i = 0; i < NPC_MUTEX; i++) {
607 		ctr_mutex[i] = (kmutex_t *)alloc_base;
608 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
609 	}
610 
611 	/* initialize page list counts */
612 	PLCNT_INIT(alloc_base);
613 
614 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
615 
616 		pgcnt_t r_pgcnt;
617 		pfn_t	r_base;
618 		pgcnt_t r_align;
619 		int	r_shift;
620 
621 		if (mem_node_config[mnode].exists == 0)
622 			continue;
623 
624 		for (r = 1; r < mmu_page_sizes; r++) {
625 			/*
626 			 * the page_counters base has to be aligned to the
627 			 * page count of page size code r, otherwise the counts
628 			 * will cross large page boundaries.
629 			 */
630 			r_align = page_get_pagecnt(r);
631 			r_base = mem_node_config[mnode].physbase;
632 			/* base needs to be aligned - lower to aligned value */
633 			r_base &= ~(r_align - 1);
634 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
635 			    r_base + 1, r_align);
636 			r_shift = PAGE_BSZS_SHIFT(r);
637 
638 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
639 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
640 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
641 			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
642 			    colors_per_szc[r];
643 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
644 			    (size_t *)alloc_base;
645 			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
646 			for (i = 0; i < colors_per_szc[r]; i++) {
647 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
648 			}
649 			PAGE_COUNTERS_COUNTERS(mnode, r) =
650 			    (hpmctr_t *)alloc_base;
651 			/*
652 			 * Round up to make alloc_base always be aligned on
653 			 * a pointer boundary.
654 			 */
655 			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
656 			    sizeof (hpmctr_t *));
657 
658 			/*
659 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
660 			 * satisfy the identity requirement.
661 			 * We should be able to go from one to the other
662 			 * and get consistent values.
663 			 */
664 			ASSERT(PNUM_TO_IDX(mnode, r,
665 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
666 			ASSERT(IDX_TO_PNUM(mnode, r,
667 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
668 		}
669 		/*
670 		 * Roundup the start address of the page_counters to
671 		 * cache aligned boundary for every memory node.
672 		 * page_ctrs_sz() has added some slop for these roundups.
673 		 */
674 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
675 			L2CACHE_ALIGN);
676 	}
677 
678 	/* Initialize other page counter specific data structures. */
679 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
680 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
681 	}
682 
683 	return (alloc_base);
684 }
685 
686 /*
687  * Functions to adjust region counters for each size free list.
688  * The caller is responsible for acquiring the ctr_mutex lock if necessary;
689  * thus these can be called during startup without locks.
690  */
691 /* ARGSUSED */
692 void
693 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
694 {
695 	ssize_t		r;	/* region size */
696 	ssize_t		idx;
697 	pfn_t		pfnum;
698 	int		lckidx;
699 
700 	ASSERT(mnode == PP_2_MEM_NODE(pp));
701 	ASSERT(mtype == PP_2_MTYPE(pp));
702 
703 	ASSERT(pp->p_szc < mmu_page_sizes);
704 
705 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
706 
707 	/* no counter update needed for largest page size */
708 	if (pp->p_szc >= mmu_page_sizes - 1) {
709 		return;
710 	}
711 
712 	r = pp->p_szc + 1;
713 	pfnum = pp->p_pagenum;
714 	lckidx = PP_CTR_LOCK_INDX(pp);
715 
716 	/*
717 	 * Increment the count of free pages for the current
718 	 * region. Continue looping up in region size, incrementing the
719 	 * count, as long as the preceding region is full.
720 	 */
721 	while (r < mmu_page_sizes) {
722 		idx = PNUM_TO_IDX(mnode, r, pfnum);
723 
724 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
725 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
726 
727 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
728 			break;
729 
730 		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
731 		page_ctrs_cands[lckidx][r][mnode].
732 		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
733 		r++;
734 	}
735 }
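
/*
 * Example of the roll-up performed above (the counts are hypothetical):
 * when a szc 0 page is freed, the loop starts at r == 1.  If the r == 1
 * counter for the page's index goes from FULL_REGION_CNT(1) - 1 to
 * FULL_REGION_CNT(1), that region is now entirely free, so
 * pcc_pages_free and the matching pcc_color_free[] entry for r == 1 are
 * bumped and the loop continues with r == 2; otherwise the loop stops,
 * since a partially free region cannot contribute to any region above it.
 */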
736 
737 void
738 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
739 {
740 	int		lckidx = PP_CTR_LOCK_INDX(pp);
741 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
742 
743 	mutex_enter(lock);
744 	page_ctr_add_internal(mnode, mtype, pp, flags);
745 	mutex_exit(lock);
746 }
747 
748 void
749 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
750 {
751 	int		lckidx;
752 	kmutex_t	*lock;
753 	ssize_t		r;	/* region size */
754 	ssize_t		idx;
755 	pfn_t		pfnum;
756 
757 	ASSERT(mnode == PP_2_MEM_NODE(pp));
758 	ASSERT(mtype == PP_2_MTYPE(pp));
759 
760 	ASSERT(pp->p_szc < mmu_page_sizes);
761 
762 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
763 
764 	/* no counter update needed for largest page size */
765 	if (pp->p_szc >= mmu_page_sizes - 1) {
766 		return;
767 	}
768 
769 	r = pp->p_szc + 1;
770 	pfnum = pp->p_pagenum;
771 	lckidx = PP_CTR_LOCK_INDX(pp);
772 	lock = &ctr_mutex[lckidx][mnode];
773 
774 	/*
775 	 * Decrement the count of free pages for the current
776 	 * region. Continue looping up in region size, decrementing the
777 	 * count, as long as the preceding region was full.
778 	 */
779 	mutex_enter(lock);
780 	while (r < mmu_page_sizes) {
781 		idx = PNUM_TO_IDX(mnode, r, pfnum);
782 
783 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
784 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
785 
786 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
787 			break;
788 		}
789 		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
790 		ASSERT(page_ctrs_cands[lckidx][r][mnode].
791 		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
792 
793 		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
794 		page_ctrs_cands[lckidx][r][mnode].
795 		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
796 		r++;
797 	}
798 	mutex_exit(lock);
799 }
800 
801 /*
802  * Adjust page counters following a memory attach, since typically the
803  * size of the array needs to change, and the PFN to counter index
804  * mapping needs to change.
805  */
806 uint_t
807 page_ctrs_adjust(int mnode)
808 {
809 	pgcnt_t npgs;
810 	int	r;		/* region size */
811 	int	i;
812 	size_t	pcsz, old_csz;
813 	hpmctr_t *new_ctr, *old_ctr;
814 	pfn_t	oldbase, newbase;
815 	size_t	old_npgs;
816 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
817 	size_t	size_cache[MMU_PAGE_SIZES];
818 	size_t	*color_cache[MMU_PAGE_SIZES];
819 	size_t	*old_color_array;
820 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
821 
822 	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
823 	npgs = roundup(mem_node_config[mnode].physmax,
824 	    PC_BASE_ALIGN) - newbase;
825 
826 	/*
827 	 * We need to determine how many page colors there are for each
828 	 * page size in order to allocate memory for any color specific
829 	 * arrays.
830 	 */
831 	colors_per_szc[0] = page_colors;
832 	for (r = 1; r < mmu_page_sizes; r++) {
833 		colors_per_szc[r] =
834 		    page_convert_color(0, r, page_colors - 1) + 1;
835 	}
836 
837 	/*
838 	 * Preallocate all of the new hpm_counters arrays as we can't
839 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
840 	 * If we can't allocate all of the arrays, undo our work so far
841 	 * and return failure.
842 	 */
843 	for (r = 1; r < mmu_page_sizes; r++) {
844 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
845 
846 		ctr_cache[r] = kmem_zalloc(pcsz *
847 		    sizeof (hpmctr_t), KM_NOSLEEP);
848 		if (ctr_cache[r] == NULL) {
849 			while (--r >= 1) {
850 				kmem_free(ctr_cache[r],
851 				    size_cache[r] * sizeof (hpmctr_t));
852 			}
853 			return (ENOMEM);
854 		}
855 		size_cache[r] = pcsz;
856 	}
857 	/*
858 	 * Preallocate all of the new color current arrays as we can't
859 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
860 	 * If we can't allocate all of the arrays, undo our work so far
861 	 * and return failure.
862 	 */
863 	for (r = 1; r < mmu_page_sizes; r++) {
864 		color_cache[r] = kmem_zalloc(sizeof (size_t) *
865 		    colors_per_szc[r], KM_NOSLEEP);
866 		if (color_cache[r] == NULL) {
867 			while (--r >= 1) {
868 				kmem_free(color_cache[r],
869 				    colors_per_szc[r] * sizeof (size_t));
870 			}
871 			for (r = 1; r < mmu_page_sizes; r++) {
872 				kmem_free(ctr_cache[r],
873 				    size_cache[r] * sizeof (hpmctr_t));
874 			}
875 			return (ENOMEM);
876 		}
877 	}
878 
879 	/*
880 	 * Grab the write lock to prevent others from walking these arrays
881 	 * while we are modifying them.
882 	 */
883 	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
884 	page_freelist_lock(mnode);
885 	for (r = 1; r < mmu_page_sizes; r++) {
886 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
887 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
888 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
889 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
890 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
891 		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);
892 
893 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
894 		new_ctr = ctr_cache[r];
895 		ctr_cache[r] = NULL;
896 		if (old_ctr != NULL &&
897 		    (oldbase + old_npgs > newbase) &&
898 		    (newbase + npgs > oldbase)) {
899 			/*
900 			 * Map the intersection of the old and new
901 			 * counters into the new array.
902 			 */
903 			size_t offset;
904 			if (newbase > oldbase) {
905 				offset = (newbase - oldbase) >>
906 				    PAGE_COUNTERS_SHIFT(mnode, r);
907 				bcopy(old_ctr + offset, new_ctr,
908 				    MIN(pcsz, (old_csz - offset)) *
909 				    sizeof (hpmctr_t));
910 			} else {
911 				offset = (oldbase - newbase) >>
912 				    PAGE_COUNTERS_SHIFT(mnode, r);
913 				bcopy(old_ctr, new_ctr + offset,
914 				    MIN(pcsz - offset, old_csz) *
915 				    sizeof (hpmctr_t));
916 			}
917 		}
918 
919 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
920 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
921 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
922 		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
923 		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
924 		color_cache[r] = NULL;
925 		/*
926 		 * for now, just reset on these events as it's probably
927 		 * not worthwhile to try to optimize this.
928 		 */
929 		for (i = 0; i < colors_per_szc[r]; i++) {
930 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
931 		}
932 
933 		/* cache info for freeing out of the critical path */
934 		if ((caddr_t)old_ctr >= kernelheap &&
935 		    (caddr_t)old_ctr < ekernelheap) {
936 			ctr_cache[r] = old_ctr;
937 			size_cache[r] = old_csz;
938 		}
939 		if ((caddr_t)old_color_array >= kernelheap &&
940 		    (caddr_t)old_color_array < ekernelheap) {
941 			color_cache[r] = old_color_array;
942 		}
943 		/*
944 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
945 		 * satisfy the identity requirement.
946 		 * We should be able to go from one to the other
947 		 * and get consistent values.
948 		 */
949 		ASSERT(PNUM_TO_IDX(mnode, r,
950 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
951 		ASSERT(IDX_TO_PNUM(mnode, r,
952 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
953 	}
954 	page_freelist_unlock(mnode);
955 	rw_exit(&page_ctrs_rwlock[mnode]);
956 
957 	/*
958 	 * Now that we have dropped the write lock, it is safe to free all
959 	 * of the memory we have cached above.
960 	 */
961 	for (r = 1; r < mmu_page_sizes; r++) {
962 		if (ctr_cache[r] != NULL) {
963 			kmem_free(ctr_cache[r],
964 			    size_cache[r] * sizeof (hpmctr_t));
965 		}
966 		if (color_cache[r] != NULL) {
967 			kmem_free(color_cache[r],
968 			    colors_per_szc[r] * sizeof (size_t));
969 		}
970 	}
971 	return (0);
972 }
973 
974 /*
975  * color contains a valid color index or bin for cur_szc
976  */
977 uint_t
978 page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
979 {
980 	uint_t shift;
981 
982 	if (cur_szc > new_szc) {
983 		shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
984 		return (color << shift);
985 	} else if (cur_szc < new_szc) {
986 		shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
987 		return (color >> shift);
988 	}
989 	return (color);
990 }
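
/*
 * Worked example (the shift values are hypothetical): if
 * page_get_shift(1) - page_get_shift(0) == 3, then
 * page_convert_color(0, 1, 13) == 13 >> 3 == 1, and
 * page_convert_color(1, 0, 1) == 1 << 3 == 8, i.e. a bin at the larger
 * size maps back to the first of the 8 base-size bins it covers.
 */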
991 
992 #ifdef DEBUG
993 
994 /*
995  * confirm pp is a large page corresponding to szc
996  */
997 void
998 chk_lpg(page_t *pp, uchar_t szc)
999 {
1000 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1001 	uint_t noreloc;
1002 
1003 	if (npgs == 1) {
1004 		ASSERT(pp->p_szc == 0);
1005 		ASSERT(pp->p_next == pp);
1006 		ASSERT(pp->p_prev == pp);
1007 		return;
1008 	}
1009 
1010 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1011 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1012 
1013 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1014 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1015 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1016 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1017 
1018 	/*
1019 	 * Check list of pages.
1020 	 */
1021 	noreloc = PP_ISNORELOC(pp);
1022 	while (npgs--) {
1023 		if (npgs != 0) {
1024 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1025 			ASSERT(pp->p_next == (pp + 1));
1026 		}
1027 		ASSERT(pp->p_szc == szc);
1028 		ASSERT(PP_ISFREE(pp));
1029 		ASSERT(PP_ISAGED(pp));
1030 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1031 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1032 		ASSERT(pp->p_vnode  == NULL);
1033 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1034 
1035 		pp = pp->p_next;
1036 	}
1037 }
1038 #endif /* DEBUG */
1039 
1040 void
1041 page_freelist_lock(int mnode)
1042 {
1043 	int i;
1044 	for (i = 0; i < NPC_MUTEX; i++) {
1045 		mutex_enter(FPC_MUTEX(mnode, i));
1046 		mutex_enter(CPC_MUTEX(mnode, i));
1047 	}
1048 }
1049 
1050 void
1051 page_freelist_unlock(int mnode)
1052 {
1053 	int i;
1054 	for (i = 0; i < NPC_MUTEX; i++) {
1055 		mutex_exit(FPC_MUTEX(mnode, i));
1056 		mutex_exit(CPC_MUTEX(mnode, i));
1057 	}
1058 }
1059 
1060 /*
1061  * add pp to the specified page list. Defaults to head of the page list
1062  * unless PG_LIST_TAIL is specified.
1063  */
1064 void
1065 page_list_add(page_t *pp, int flags)
1066 {
1067 	page_t		**ppp;
1068 	kmutex_t	*pcm;
1069 	uint_t		bin, mtype;
1070 	int		mnode;
1071 
1072 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1073 	ASSERT(PP_ISFREE(pp));
1074 	ASSERT(!hat_page_is_mapped(pp));
1075 	ASSERT(hat_page_getshare(pp) == 0);
1076 
1077 	/*
1078 	 * Large pages should be freed via page_list_add_pages().
1079 	 */
1080 	ASSERT(pp->p_szc == 0);
1081 
1082 	/*
1083 	 * Don't need to lock the freelist first here
1084 	 * because the page isn't on the freelist yet.
1085 	 * This means p_szc can't change on us.
1086 	 */
1087 
1088 	bin = PP_2_BIN(pp);
1089 	mnode = PP_2_MEM_NODE(pp);
1090 	mtype = PP_2_MTYPE(pp);
1091 
1092 	if (flags & PG_LIST_ISINIT) {
1093 		/*
1094 		 * PG_LIST_ISINIT is set during system startup (i.e. single
1095 		 * threaded), so add the page to the free list and to the
1096 		 * free region counters w/o any locking
1097 		 */
1098 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1099 
1100 		/* inline version of page_add() */
1101 		if (*ppp != NULL) {
1102 			pp->p_next = *ppp;
1103 			pp->p_prev = (*ppp)->p_prev;
1104 			(*ppp)->p_prev = pp;
1105 			pp->p_prev->p_next = pp;
1106 		} else
1107 			*ppp = pp;
1108 
1109 		page_ctr_add_internal(mnode, mtype, pp, flags);
1110 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1111 	} else {
1112 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1113 
1114 		if (flags & PG_FREE_LIST) {
1115 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1116 			ASSERT(PP_ISAGED(pp));
1117 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1118 
1119 		} else {
1120 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1121 			ASSERT(pp->p_vnode);
1122 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1123 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1124 		}
1125 		mutex_enter(pcm);
1126 		page_add(ppp, pp);
1127 
1128 		if (flags & PG_LIST_TAIL)
1129 			*ppp = (*ppp)->p_next;
1130 		/*
1131 		 * Add counters before releasing pcm mutex to avoid a race with
1132 		 * page_freelist_coalesce and page_freelist_fill.
1133 		 */
1134 		page_ctr_add(mnode, mtype, pp, flags);
1135 		mutex_exit(pcm);
1136 	}
1137 
1138 
1139 #if defined(__sparc)
1140 	if (PP_ISNORELOC(pp)) {
1141 		kcage_freemem_add(1);
1142 	}
1143 #endif
1144 	/*
1145 	 * It is up to the caller to unlock the page!
1146 	 */
1147 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1148 }
1149 
1150 
1151 #ifdef __sparc
1152 /*
1153  * This routine is only used by kcage_init during system startup.
1154  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1155  * without the overhead of taking locks and updating counters.
1156  */
1157 void
1158 page_list_noreloc_startup(page_t *pp)
1159 {
1160 	page_t		**ppp;
1161 	uint_t		bin;
1162 	int		mnode;
1163 	int		mtype;
1164 	int		flags = 0;
1165 
1166 	/*
1167 	 * If this is a large page on the freelist then
1168 	 * break it up into smaller pages.
1169 	 */
1170 	if (pp->p_szc != 0)
1171 		page_boot_demote(pp);
1172 
1173 	/*
1174 	 * Get the list the page is currently on.
1175 	 */
1176 	bin = PP_2_BIN(pp);
1177 	mnode = PP_2_MEM_NODE(pp);
1178 	mtype = PP_2_MTYPE(pp);
1179 	ASSERT(mtype == MTYPE_RELOC);
1180 	ASSERT(pp->p_szc == 0);
1181 
1182 	if (PP_ISAGED(pp)) {
1183 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1184 		flags |= PG_FREE_LIST;
1185 	} else {
1186 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1187 		flags |= PG_CACHE_LIST;
1188 	}
1189 
1190 	ASSERT(*ppp != NULL);
1191 
1192 	/*
1193 	 * Delete page from current list.
1194 	 */
1195 	if (*ppp == pp)
1196 		*ppp = pp->p_next;		/* go to next page */
1197 	if (*ppp == pp) {
1198 		*ppp = NULL;			/* page list is gone */
1199 	} else {
1200 		pp->p_prev->p_next = pp->p_next;
1201 		pp->p_next->p_prev = pp->p_prev;
1202 	}
1203 
1204 	/* LINTED */
1205 	PLCNT_DECR(pp, mnode, mtype, 0, flags);
1206 
1207 	/*
1208 	 * Set no reloc for cage initted pages.
1209 	 */
1210 	PP_SETNORELOC(pp);
1211 
1212 	mtype = PP_2_MTYPE(pp);
1213 	ASSERT(mtype == MTYPE_NORELOC);
1214 
1215 	/*
1216 	 * Get new list for page.
1217 	 */
1218 	if (PP_ISAGED(pp)) {
1219 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1220 	} else {
1221 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1222 	}
1223 
1224 	/*
1225 	 * Insert page on new list.
1226 	 */
1227 	if (*ppp == NULL) {
1228 		*ppp = pp;
1229 		pp->p_next = pp->p_prev = pp;
1230 	} else {
1231 		pp->p_next = *ppp;
1232 		pp->p_prev = (*ppp)->p_prev;
1233 		(*ppp)->p_prev = pp;
1234 		pp->p_prev->p_next = pp;
1235 	}
1236 
1237 	/* LINTED */
1238 	PLCNT_INCR(pp, mnode, mtype, 0, flags);
1239 
1240 	/*
1241 	 * Update cage freemem counter
1242 	 */
1243 	atomic_add_long(&kcage_freemem, 1);
1244 }
1245 #else	/* __sparc */
1246 
1247 /* ARGSUSED */
1248 void
1249 page_list_noreloc_startup(page_t *pp)
1250 {
1251 	panic("page_list_noreloc_startup: should be here only for sparc");
1252 }
1253 #endif
1254 
1255 void
1256 page_list_add_pages(page_t *pp, int flags)
1257 {
1258 	kmutex_t *pcm;
1259 	pgcnt_t	pgcnt;
1260 	uint_t	bin, mtype, i;
1261 	int	mnode;
1262 
1263 	/* default to freelist/head */
1264 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1265 
1266 	CHK_LPG(pp, pp->p_szc);
1267 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1268 
1269 	bin = PP_2_BIN(pp);
1270 	mnode = PP_2_MEM_NODE(pp);
1271 	mtype = PP_2_MTYPE(pp);
1272 
1273 	if (flags & PG_LIST_ISINIT) {
1274 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1275 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1276 		ASSERT(!PP_ISNORELOC(pp));
1277 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1278 	} else {
1279 
1280 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1281 
1282 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1283 
1284 		mutex_enter(pcm);
1285 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1286 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1287 		mutex_exit(pcm);
1288 
1289 		pgcnt = page_get_pagecnt(pp->p_szc);
1290 #if defined(__sparc)
1291 		if (PP_ISNORELOC(pp))
1292 			kcage_freemem_add(pgcnt);
1293 #endif
1294 		for (i = 0; i < pgcnt; i++, pp++)
1295 			page_unlock_noretire(pp);
1296 	}
1297 }
1298 
1299 /*
1300  * During boot, need to demote a large page to base
1301  * pagesize pages for seg_kmem for use in boot_alloc()
1302  */
1303 void
1304 page_boot_demote(page_t *pp)
1305 {
1306 	ASSERT(pp->p_szc != 0);
1307 	ASSERT(PP_ISFREE(pp));
1308 	ASSERT(PP_ISAGED(pp));
1309 
1310 	(void) page_demote(PP_2_MEM_NODE(pp),
1311 	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
1312 	    PC_FREE);
1313 
1314 	ASSERT(PP_ISFREE(pp));
1315 	ASSERT(PP_ISAGED(pp));
1316 	ASSERT(pp->p_szc == 0);
1317 }
1318 
1319 /*
1320  * Take a particular page off of whatever freelist the page
1321  * is claimed to be on.
1322  *
1323  * NOTE: Only used for PAGESIZE pages.
1324  */
1325 void
1326 page_list_sub(page_t *pp, int flags)
1327 {
1328 	int		bin;
1329 	uint_t		mtype;
1330 	int		mnode;
1331 	kmutex_t	*pcm;
1332 	page_t		**ppp;
1333 
1334 	ASSERT(PAGE_EXCL(pp));
1335 	ASSERT(PP_ISFREE(pp));
1336 
1337 	/*
1338 	 * The p_szc field can only be changed by page_promote()
1339 	 * and page_demote(). Only free pages can be promoted and
1340 	 * demoted and the free list MUST be locked during these
1341 	 * operations. So to prevent a race in page_list_sub()
1342 	 * between computing which bin of the freelist lock to
1343 	 * grab and actually grabbing the lock, we check again that
1344 	 * the bin we locked is still the correct one. Notice that
1345 	 * the p_szc field could have actually changed on us but
1346 	 * if the bin happens to still be the same we are safe.
1347 	 */
1348 try_again:
1349 	bin = PP_2_BIN(pp);
1350 	mnode = PP_2_MEM_NODE(pp);
1351 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1352 	mutex_enter(pcm);
1353 	if (PP_2_BIN(pp) != bin) {
1354 		mutex_exit(pcm);
1355 		goto try_again;
1356 	}
1357 	mtype = PP_2_MTYPE(pp);
1358 
1359 	if (flags & PG_FREE_LIST) {
1360 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1361 		ASSERT(PP_ISAGED(pp));
1362 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1363 	} else {
1364 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1365 		ASSERT(!PP_ISAGED(pp));
1366 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1367 	}
1368 
1369 	/*
1370 	 * Common PAGESIZE case.
1371 	 *
1372 	 * Note that we locked the freelist. This prevents
1373 	 * any page promotion/demotion operations. Therefore
1374 	 * the p_szc will not change until we drop pcm mutex.
1375 	 */
1376 	if (pp->p_szc == 0) {
1377 		page_sub(ppp, pp);
1378 		/*
1379 		 * Subtract counters before releasing pcm mutex
1380 		 * to avoid race with page_freelist_coalesce.
1381 		 */
1382 		page_ctr_sub(mnode, mtype, pp, flags);
1383 		mutex_exit(pcm);
1384 
1385 #if defined(__sparc)
1386 		if (PP_ISNORELOC(pp)) {
1387 			kcage_freemem_sub(1);
1388 		}
1389 #endif
1390 		return;
1391 	}
1392 
1393 	/*
1394 	 * Large pages on the cache list are not supported.
1395 	 */
1396 	if (flags & PG_CACHE_LIST)
1397 		panic("page_list_sub: large page on cachelist");
1398 
1399 	/*
1400 	 * Slow but rare.
1401 	 *
1402 	 * Somebody wants this particular page which is part
1403 	 * of a large page. In this case we just demote the page
1404 	 * if it's on the freelist.
1405 	 *
1406 	 * We have to drop pcm before locking the entire freelist.
1407 	 * Once we have re-locked the freelist check to make sure
1408 	 * the page hasn't already been demoted or completely
1409 	 * freed.
1410 	 */
1411 	mutex_exit(pcm);
1412 	page_freelist_lock(mnode);
1413 	if (pp->p_szc != 0) {
1414 		/*
1415 		 * Large page is on freelist.
1416 		 */
1417 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1418 		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1419 	}
1420 	ASSERT(PP_ISFREE(pp));
1421 	ASSERT(PP_ISAGED(pp));
1422 	ASSERT(pp->p_szc == 0);
1423 
1424 	/*
1425 	 * Subtract counters before releasing pcm mutex
1426 	 * to avoid race with page_freelist_coalesce.
1427 	 */
1428 	bin = PP_2_BIN(pp);
1429 	mtype = PP_2_MTYPE(pp);
1430 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1431 
1432 	page_sub(ppp, pp);
1433 	page_ctr_sub(mnode, mtype, pp, flags);
1434 	page_freelist_unlock(mnode);
1435 
1436 #if defined(__sparc)
1437 	if (PP_ISNORELOC(pp)) {
1438 		kcage_freemem_sub(1);
1439 	}
1440 #endif
1441 }
1442 
1443 void
1444 page_list_sub_pages(page_t *pp, uint_t szc)
1445 {
1446 	kmutex_t *pcm;
1447 	uint_t	bin, mtype;
1448 	int	mnode;
1449 
1450 	ASSERT(PAGE_EXCL(pp));
1451 	ASSERT(PP_ISFREE(pp));
1452 	ASSERT(PP_ISAGED(pp));
1453 
1454 	/*
1455 	 * See comment in page_list_sub().
1456 	 */
1457 try_again:
1458 	bin = PP_2_BIN(pp);
1459 	mnode = PP_2_MEM_NODE(pp);
1460 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1461 	mutex_enter(pcm);
1462 	if (PP_2_BIN(pp) != bin) {
1463 		mutex_exit(pcm);
1464 		goto	try_again;
1465 	}
1466 
1467 	/*
1468 	 * If we're called with a page larger than szc or it got
1469 	 * promoted above szc before we locked the freelist then
1470 	 * drop pcm and re-lock entire freelist. If page still larger
1471 	 * than szc then demote it.
1472 	 */
1473 	if (pp->p_szc > szc) {
1474 		mutex_exit(pcm);
1475 		pcm = NULL;
1476 		page_freelist_lock(mnode);
1477 		if (pp->p_szc > szc) {
1478 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1479 			(void) page_demote(mnode,
1480 			    PFN_BASE(pp->p_pagenum, pp->p_szc),
1481 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1482 		}
1483 		bin = PP_2_BIN(pp);
1484 	}
1485 	ASSERT(PP_ISFREE(pp));
1486 	ASSERT(PP_ISAGED(pp));
1487 	ASSERT(pp->p_szc <= szc);
1488 	ASSERT(pp == PP_PAGEROOT(pp));
1489 
1490 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1491 
1492 	mtype = PP_2_MTYPE(pp);
1493 	if (pp->p_szc != 0) {
1494 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1495 		CHK_LPG(pp, pp->p_szc);
1496 	} else {
1497 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1498 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1499 	}
1500 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1501 
1502 	if (pcm != NULL) {
1503 		mutex_exit(pcm);
1504 	} else {
1505 		page_freelist_unlock(mnode);
1506 	}
1507 
1508 #if defined(__sparc)
1509 	if (PP_ISNORELOC(pp)) {
1510 		pgcnt_t	pgcnt;
1511 
1512 		pgcnt = page_get_pagecnt(pp->p_szc);
1513 		kcage_freemem_sub(pgcnt);
1514 	}
1515 #endif
1516 }
1517 
1518 /*
1519  * Add the page to the front of a linked list of pages
1520  * using the p_next & p_prev pointers for the list.
1521  * The caller is responsible for protecting the list pointers.
1522  */
1523 void
1524 mach_page_add(page_t **ppp, page_t *pp)
1525 {
1526 	if (*ppp == NULL) {
1527 		pp->p_next = pp->p_prev = pp;
1528 	} else {
1529 		pp->p_next = *ppp;
1530 		pp->p_prev = (*ppp)->p_prev;
1531 		(*ppp)->p_prev = pp;
1532 		pp->p_prev->p_next = pp;
1533 	}
1534 	*ppp = pp;
1535 }
1536 
1537 /*
1538  * Remove this page from a linked list of pages
1539  * using the p_next & p_prev pointers for the list.
1540  *
1541  * The caller is responsible for protecting the list pointers.
1542  */
1543 void
1544 mach_page_sub(page_t **ppp, page_t *pp)
1545 {
1546 	ASSERT(PP_ISFREE(pp));
1547 
1548 	if (*ppp == NULL || pp == NULL)
1549 		panic("mach_page_sub");
1550 
1551 	if (*ppp == pp)
1552 		*ppp = pp->p_next;		/* go to next page */
1553 
1554 	if (*ppp == pp)
1555 		*ppp = NULL;			/* page list is gone */
1556 	else {
1557 		pp->p_prev->p_next = pp->p_next;
1558 		pp->p_next->p_prev = pp->p_prev;
1559 	}
1560 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1561 }
1562 
1563 /*
1564  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1565  */
1566 void
1567 page_promote_size(page_t *pp, uint_t cur_szc)
1568 {
1569 	pfn_t pfn;
1570 	int mnode;
1571 	int idx;
1572 	int new_szc = cur_szc + 1;
1573 	int full = FULL_REGION_CNT(new_szc);
1574 
1575 	pfn = page_pptonum(pp);
1576 	mnode = PFN_2_MEM_NODE(pfn);
1577 
1578 	page_freelist_lock(mnode);
1579 
1580 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1581 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1582 		(void) page_promote(mnode, pfn, new_szc, PC_FREE);
1583 
1584 	page_freelist_unlock(mnode);
1585 }
1586 
1587 static uint_t page_promote_err;
1588 static uint_t page_promote_noreloc_err;
1589 
1590 /*
1591  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1592  * for the given mnode starting at pfnum. Pages involved are on the freelist
1593  * before the call and may be returned to the caller if requested, otherwise
1594  * they will be placed back on the freelist.
1595  * If flags is PC_ALLOC, then the large page will be returned to the user in
1596  * a state which is consistent with a page being taken off the freelist.  If
1597  * we failed to lock the new large page, then we will return NULL to the
1598  * caller and put the large page on the freelist instead.
1599  * If flags is PC_FREE, then the large page will be placed on the freelist,
1600  * and NULL will be returned.
1601  * The caller is responsible for locking the freelist as well as any other
1602  * accounting which needs to be done for a returned page.
1603  *
1604  * RFE: For performance pass in pp instead of pfnum so
1605  * 	we can avoid excessive calls to page_numtopp_nolock().
1606  *	This would depend on an assumption that all contiguous
1607  *	pages are in the same memseg so we can just add/dec
1608  *	our pp.
1609  *
1610  * Lock ordering:
1611  *
1612  *	There is a potential but rare deadlock situation
1613  *	for page promotion and demotion operations. The problem
1614  *	is there are two paths into the freelist manager and
1615  *	they have different lock orders:
1616  *
1617  *	page_create()
1618  *		lock freelist
1619  *		page_lock(EXCL)
1620  *		unlock freelist
1621  *		return
1622  *		caller drops page_lock
1623  *
1624  *	page_free() and page_reclaim()
1625  *		caller grabs page_lock(EXCL)
1626  *
1627  *		lock freelist
1628  *		unlock freelist
1629  *		drop page_lock
1630  *
1631  *	What prevents a thread in page_create() from deadlocking
1632  *	with a thread freeing or reclaiming the same page is the
1633  *	page_trylock() in page_get_freelist(). If the trylock fails
1634  *	it skips the page.
1635  *
1636  *	The lock ordering for promotion and demotion is the same as
1637  *	for page_create(). Since the same deadlock could occur during
1638  *	page promotion and freeing or reclaiming of a page on the
1639  *	cache list, we might have to fail the operation and undo what we
1640  *	have done so far. Again this is rare.
1641  */
1642 page_t *
1643 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
1644 {
1645 	page_t		*pp, *pplist, *tpp, *start_pp;
1646 	pgcnt_t		new_npgs, npgs;
1647 	uint_t		bin;
1648 	pgcnt_t		tmpnpgs, pages_left;
1649 	uint_t		mtype;
1650 	uint_t		noreloc;
1651 	uint_t 		i;
1652 	int 		which_list;
1653 	ulong_t		index;
1654 	kmutex_t	*phm;
1655 
1656 	/*
1657 	 * General algorithm:
1658 	 * Find the starting page
1659 	 * Walk each page struct removing it from the freelist,
1660 	 * and linking it to all the other pages removed.
1661 	 * Once all pages are off the freelist,
1662 	 * walk the list, modifying p_szc to new_szc and doing whatever
1663 	 * else needs to be done to create a large free page.
1664 	 * According to the flags, either return the page or put it
1665 	 * on the freelist.
1666 	 */
1667 
1668 	start_pp = page_numtopp_nolock(pfnum);
1669 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1670 	new_npgs = page_get_pagecnt(new_szc);
1671 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1672 
1673 	/*
1674 	 * Loop through smaller pages to confirm that all pages
1675 	 * give the same result for PP_ISNORELOC().
1676 	 * We can check this reliably here as the protocol for setting
1677 	 * P_NORELOC requires pages to be taken off the free list first.
1678 	 */
1679 	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
1680 		if (pp == start_pp) {
1681 			/* First page, set requirement. */
1682 			noreloc = PP_ISNORELOC(pp);
1683 		} else if (noreloc != PP_ISNORELOC(pp)) {
1684 			page_promote_noreloc_err++;
1685 			page_promote_err++;
1686 			return (NULL);
1687 		}
1688 	}
1689 
1690 	pages_left = new_npgs;
1691 	pplist = NULL;
1692 	pp = start_pp;
1693 
1694 	/* Loop around coalescing the smaller pages into a big page. */
1695 	while (pages_left) {
1696 		/*
1697 		 * Remove from the freelist.
1698 		 */
1699 		ASSERT(PP_ISFREE(pp));
1700 		bin = PP_2_BIN(pp);
1701 		ASSERT(mnode == PP_2_MEM_NODE(pp));
1702 		mtype = PP_2_MTYPE(pp);
1703 		if (PP_ISAGED(pp)) {
1704 
1705 			/*
1706 			 * PG_FREE_LIST
1707 			 */
1708 			if (pp->p_szc) {
1709 				page_vpsub(&PAGE_FREELISTS(mnode,
1710 				    pp->p_szc, bin, mtype), pp);
1711 			} else {
1712 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1713 				    bin, mtype), pp);
1714 			}
1715 			which_list = PG_FREE_LIST;
1716 		} else {
1717 			ASSERT(pp->p_szc == 0);
1718 
1719 			/*
1720 			 * PG_CACHE_LIST
1721 			 *
1722 			 * Since this page comes from the
1723 			 * cachelist, we must destroy the
1724 			 * vnode association.
1725 			 */
1726 			if (!page_trylock(pp, SE_EXCL)) {
1727 				goto fail_promote;
1728 			}
1729 
1730 			/*
1731 			 * We need to be careful not to deadlock
1732 			 * with another thread in page_lookup().
1733 			 * The page_lookup() thread could be holding
1734 			 * the same phm that we need if the two
1735 			 * pages happen to hash to the same phm lock.
1736 			 * At this point we have locked the entire
1737 			 * freelist and page_lookup() could be trying
1738 			 * to grab a freelist lock.
1739 			 */
1740 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
1741 			phm = PAGE_HASH_MUTEX(index);
1742 			if (!mutex_tryenter(phm)) {
1743 				page_unlock_noretire(pp);
1744 				goto fail_promote;
1745 			}
1746 
1747 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
1748 			page_hashout(pp, phm);
1749 			mutex_exit(phm);
1750 			PP_SETAGED(pp);
1751 			page_unlock_noretire(pp);
1752 			which_list = PG_CACHE_LIST;
1753 		}
1754 		page_ctr_sub(mnode, mtype, pp, which_list);
1755 
1756 		/*
1757 		 * Concatenate the smaller page(s) onto
1758 		 * the large page list.
1759 		 */
1760 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
1761 		pages_left -= npgs;
1762 		tpp = pp;
1763 		while (npgs--) {
1764 			tpp->p_szc = new_szc;
1765 			tpp = tpp->p_next;
1766 		}
1767 		page_list_concat(&pplist, &pp);
1768 		pp += tmpnpgs;
1769 	}
1770 	CHK_LPG(pplist, new_szc);
1771 
1772 	/*
1773 	 * return the page to the user if requested
1774 	 * in the properly locked state.
1775 	 */
1776 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
1777 		return (pplist);
1778 	}
1779 
1780 	/*
1781 	 * Otherwise place the new large page on the freelist
1782 	 */
1783 	bin = PP_2_BIN(pplist);
1784 	mnode = PP_2_MEM_NODE(pplist);
1785 	mtype = PP_2_MTYPE(pplist);
1786 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
1787 
1788 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
1789 	return (NULL);
1790 
1791 fail_promote:
1792 	/*
1793 	 * A thread must have still been freeing or
1794 	 * reclaiming the page on the cachelist.
1795 	 * To prevent a deadlock, undo what we have
1796 	 * done so far and return failure. This
1797 	 * situation can only happen while promoting
1798 	 * PAGESIZE pages.
1799 	 */
1800 	page_promote_err++;
1801 	while (pplist) {
1802 		pp = pplist;
1803 		mach_page_sub(&pplist, pp);
1804 		pp->p_szc = 0;
1805 		bin = PP_2_BIN(pp);
1806 		mtype = PP_2_MTYPE(pp);
1807 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
1808 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1809 	}
1810 	return (NULL);
1811 
1812 }
1813 
1814 /*
1815  * Break up a large page into smaller size pages.
1816  * Pages involved are on the freelist before the call and may
1817  * be returned to the caller if requested, otherwise they will
1818  * be placed back on the freelist.
1819  * The caller is responsible for locking the freelist as well as any other
1820  * accounting which needs to be done for a returned page.
1821  * If flags is not PC_ALLOC, the color argument is ignored; technically,
1822  * any value may be passed in, but PC_NO_COLOR is the convention that
1823  * should be followed for clarity's sake.
1824  */
1825 page_t *
1826 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
1827     int color, int flags)
1828 {
1829 	page_t	*pp, *pplist, *npplist;
1830 	pgcnt_t	npgs, n;
1831 	uint_t	bin;
1832 	uint_t	mtype;
1833 	page_t	*ret_pp = NULL;
1834 
1835 	ASSERT(cur_szc != 0);
1836 	ASSERT(new_szc < cur_szc);
1837 
1838 	pplist = page_numtopp_nolock(pfnum);
1839 	ASSERT(pplist != NULL);
1840 
1841 	ASSERT(pplist->p_szc == cur_szc);
1842 
1843 	bin = PP_2_BIN(pplist);
1844 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
1845 	mtype = PP_2_MTYPE(pplist);
1846 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
1847 
1848 	CHK_LPG(pplist, cur_szc);
1849 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
1850 
1851 	/*
1852 	 * Number of PAGESIZE pages for smaller new_szc
1853 	 * page.
1854 	 */
1855 	npgs = page_get_pagecnt(new_szc);
1856 
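	/*
	 * An illustrative walk-through with assumed sizes (not tied to any
	 * particular platform): if cur_szc spans 512 PAGESIZE pages and
	 * new_szc spans 64, the loop below peels the original list apart
	 * into eight 64-page lists, each retagged with p_szc == new_szc.
	 */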
1857 	while (pplist) {
1858 		pp = pplist;
1859 
1860 		ASSERT(pp->p_szc == cur_szc);
1861 
1862 		/*
1863 		 * We either break it up into PAGESIZE pages or larger.
1864 		 */
1865 		if (npgs == 1) {	/* PAGESIZE case */
1866 			mach_page_sub(&pplist, pp);
1867 			ASSERT(pp->p_szc == cur_szc);
1868 			ASSERT(new_szc == 0);
1869 			ASSERT(mnode == PP_2_MEM_NODE(pp));
1870 			pp->p_szc = new_szc;
1871 			bin = PP_2_BIN(pp);
1872 			if ((bin == color) && (flags == PC_ALLOC) &&
1873 			    (ret_pp == NULL) &&
1874 			    page_trylock_cons(pp, SE_EXCL)) {
1875 				ret_pp = pp;
1876 			} else {
1877 				mtype = PP_2_MTYPE(pp);
1878 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
1879 				    mtype), pp);
1880 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1881 			}
1882 		} else {
1883 
1884 			/*
1885 			 * Break down into smaller lists of pages.
1886 			 */
1887 			page_list_break(&pplist, &npplist, npgs);
1888 
1889 			pp = pplist;
1890 			n = npgs;
1891 			while (n--) {
1892 				ASSERT(pp->p_szc == cur_szc);
1893 				pp->p_szc = new_szc;
1894 				pp = pp->p_next;
1895 			}
1896 
1897 			CHK_LPG(pplist, new_szc);
1898 
1899 			bin = PP_2_BIN(pplist);
1900 			ASSERT(mnode == PP_2_MEM_NODE(pp));
1901 			if ((bin == color) && (flags == PC_ALLOC) &&
1902 			    (ret_pp == NULL) &&
1903 			    page_trylock_cons(pp, SE_EXCL)) {
1904 				ret_pp = pp;
1905 			} else {
1906 				mtype = PP_2_MTYPE(pp);
1907 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
1908 				    bin, mtype), pplist);
1909 
1910 				page_ctr_add(mnode, mtype, pplist,
1911 				    PG_FREE_LIST);
1912 			}
1913 			pplist = npplist;
1914 		}
1915 	}
1916 	return (ret_pp);
1917 }
1918 
1919 int mpss_coalesce_disable = 0;
1920 
1921 /*
1922  * Coalesce free pages into a page of the given szc and color if possible.
1923  * Return the pointer to the page created, otherwise, return NULL.
1924  */
1925 static page_t *
1926 page_freelist_coalesce(int mnode, uchar_t szc, int color)
1927 {
1928 	int 	r;		/* region size */
1929 	int 	idx, full, i;
1930 	pfn_t	pfnum;
1931 	size_t	len;
1932 	size_t	buckets_to_check;
1933 	pgcnt_t	cands;
1934 	page_t	*ret_pp;
1935 	int	color_stride;
1936 
1937 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);
1938 
1939 	if (mpss_coalesce_disable) {
1940 		return (NULL);
1941 	}
1942 
1943 	r = szc;
1944 	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
1945 	if (cands == 0) {
1946 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
1947 		return (NULL);
1948 	}
1949 	full = FULL_REGION_CNT(r);
1950 	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
1951 	    page_colors;
1952 
1953 	/* Prevent page_counters dynamic memory from being freed */
1954 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
1955 	len  = PAGE_COUNTERS_ENTRIES(mnode, r);
1956 	buckets_to_check = len / color_stride;
1957 	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
1958 	ASSERT((idx % color_stride) == color);
1959 	idx += color_stride;
1960 	if (idx >= len)
1961 		idx = color;
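	/*
	 * A sketch of the walk below, using assumed values: with len == 64,
	 * color_stride == 8 and color == 3, idx steps through 3, 11, 19, ...,
	 * 59 and then wraps back to 3, so each of the buckets_to_check == 8
	 * buckets that can hold this color is examined once.
	 */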
1962 	for (i = 0; i < buckets_to_check; i++) {
1963 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
1964 			pfnum = IDX_TO_PNUM(mnode, r, idx);
1965 			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
1966 			    pfnum < mem_node_config[mnode].physmax);
1967 			/*
1968 			 * RFE: For performance maybe we can do something less
1969 			 *	brutal than locking the entire freelist. So far
1970 			 * 	this doesn't seem to be a performance problem?
1971 			 */
1972 			page_freelist_lock(mnode);
1973 			if (PAGE_COUNTERS(mnode, r, idx) != full) {
1974 				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
1975 				goto skip_this_one;
1976 			}
1977 			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
1978 			if (ret_pp != NULL) {
1979 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
1980 				    idx;
1981 				page_freelist_unlock(mnode);
1982 				rw_exit(&page_ctrs_rwlock[mnode]);
1983 #if defined(__sparc)
1984 				if (PP_ISNORELOC(ret_pp)) {
1985 					pgcnt_t npgs;
1986 
1987 					npgs = page_get_pagecnt(ret_pp->p_szc);
1988 					kcage_freemem_sub(npgs);
1989 				}
1990 #endif
1991 				return (ret_pp);
1992 			}
1993 skip_this_one:
1994 			page_freelist_unlock(mnode);
1995 			/*
1996 			 * No point looking for another page if we've
1997 			 * already tried all of the ones that
1998 			 * page_ctr_cands indicated.  Stash off where we left
1999 			 * off.
2000 			 * Note: this is not exact since we don't hold the
2001 			 * page_freelist_locks before we initially get the
2002 			 * value of cands for performance reasons, but should
2003 			 * be a decent approximation.
2004 			 */
2005 			if (--cands == 0) {
2006 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
2007 				    idx;
2008 				break;
2009 			}
2010 		}
2011 		idx += color_stride;
2012 		if (idx >= len)
2013 			idx = color;
2014 	}
2015 	rw_exit(&page_ctrs_rwlock[mnode]);
2016 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
2017 	return (NULL);
2018 }
2019 
2020 /*
2021  * For the given mnode, promote as many small pages to large pages as possible.
2022  */
2023 void
2024 page_freelist_coalesce_all(int mnode)
2025 {
2026 	int 	r;		/* region size */
2027 	int 	idx, full;
2028 	pfn_t	pfnum;
2029 	size_t	len;
2030 
2031 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2032 
2033 	if (mpss_coalesce_disable) {
2034 		return;
2035 	}
2036 
2037 	/*
2038 	 * Lock the entire freelist and coalesce what we can.
2039 	 *
2040 	 * Always promote to the largest page possible
2041 	 * first to reduce the number of page promotions.
2042 	 */
2043 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2044 	page_freelist_lock(mnode);
2045 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2046 		pgcnt_t cands;
2047 
2048 		PGCTRS_CANDS_GETVALUE(mnode, r, cands);
2049 		if (cands == 0) {
2050 			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
2051 			continue;
2052 		}
2053 
2054 		full = FULL_REGION_CNT(r);
2055 		len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2056 
2057 		for (idx = 0; idx < len; idx++) {
2058 			if (PAGE_COUNTERS(mnode, r, idx) == full) {
2059 				pfnum = IDX_TO_PNUM(mnode, r, idx);
2060 				ASSERT(pfnum >=
2061 				    mem_node_config[mnode].physbase &&
2062 				    pfnum <
2063 				    mem_node_config[mnode].physmax);
2064 				(void) page_promote(mnode, pfnum, r, PC_FREE);
2065 			}
2066 		}
2067 	}
2068 	page_freelist_unlock(mnode);
2069 	rw_exit(&page_ctrs_rwlock[mnode]);
2070 }
2071 
2072 /*
2073  * This is where all policies for moving pages around
2074  * to different page size free lists are implemented.
2075  * Returns a pointer to the page on success, NULL on failure.
2076  *
2077  * So far these are the priorities for this algorithm in descending
2078  * order:
2079  *
2080  *	1) When servicing a request try to do so with a free page
2081  *	   from next size up. Helps defer fragmentation as long
2082  *	   as possible.
2083  *
2084  *	2) Page coalesce on demand. Only when a freelist
2085  *	   larger than PAGESIZE is empty and step 1
2086  *	   will not work since all larger size lists are
2087  *	   also empty.
2088  *
2089  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2090  */
2091 page_t *
2092 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
2093 {
2094 	uchar_t nszc = szc + 1;
2095 	int 	bin;
2096 	page_t	*pp, *firstpp;
2097 	page_t	*ret_pp = NULL;
2098 
2099 	ASSERT(szc < mmu_page_sizes);
2100 
2101 	VM_STAT_ADD(vmm_vmstats.pff_req[szc]);
2102 	/*
2103 	 * First try to break up a larger page to fill
2104 	 * current size freelist.
2105 	 */
2106 	while (nszc < mmu_page_sizes) {
2107 		/*
2108 		 * If page found then demote it.
2109 		 */
2110 		bin = page_convert_color(szc, nszc, color);
2111 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2112 			page_freelist_lock(mnode);
2113 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2114 
2115 			/*
2116 			 * If pfnhi is not PFNNULL, look for large page below
2117 			 * pfnhi. PFNNULL signifies no pfn requirement.
2118 			 */
2119 			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
2120 				do {
2121 					pp = pp->p_vpnext;
2122 					if (pp == firstpp) {
2123 						pp = NULL;
2124 						break;
2125 					}
2126 				} while (pp->p_pagenum >= pfnhi);
2127 			}
2128 			if (pp) {
2129 				ASSERT(pp->p_szc == nszc);
2130 				VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]);
2131 				ret_pp = page_demote(mnode, pp->p_pagenum,
2132 				    pp->p_szc, szc, color, PC_ALLOC);
2133 				if (ret_pp) {
2134 					page_freelist_unlock(mnode);
2135 #if defined(__sparc)
2136 					if (PP_ISNORELOC(ret_pp)) {
2137 						pgcnt_t npgs;
2138 
2139 						npgs = page_get_pagecnt(
2140 						    ret_pp->p_szc);
2141 						kcage_freemem_sub(npgs);
2142 					}
2143 #endif
2144 					return (ret_pp);
2145 				}
2146 			}
2147 			page_freelist_unlock(mnode);
2148 		}
2149 		nszc++;
2150 	}
2151 
2152 	/*
2153 	 * Ok that didn't work. Time to coalesce.
2154 	 */
2155 	if (szc != 0) {
2156 		ret_pp = page_freelist_coalesce(mnode, szc, color);
2157 		VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]);
2158 	}
2159 
2160 	return (ret_pp);
2161 }
2162 
2163 /*
2164  * Helper routine used only by the freelist code to lock
2165  * a page. If the page is a large page then it succeeds in
2166  * locking all the constituent pages or none at all.
2167  * Returns 1 on success, 0 on failure.
2168  */
2169 static int
2170 page_trylock_cons(page_t *pp, se_t se)
2171 {
2172 	page_t	*tpp, *first_pp = pp;
2173 
2174 	/*
2175 	 * Fail if can't lock first or only page.
2176 	 */
2177 	if (!page_trylock(pp, se)) {
2178 		return (0);
2179 	}
2180 
2181 	/*
2182 	 * PAGESIZE: common case.
2183 	 */
2184 	if (pp->p_szc == 0) {
2185 		return (1);
2186 	}
2187 
2188 	/*
2189 	 * Large page case.
2190 	 */
2191 	tpp = pp->p_next;
2192 	while (tpp != pp) {
2193 		if (!page_trylock(tpp, se)) {
2194 			/*
2195 			 * On failure unlock what we
2196 			 * have locked so far.
2197 			 */
2198 			while (first_pp != tpp) {
2199 				page_unlock_noretire(first_pp);
2200 				first_pp = first_pp->p_next;
2201 			}
2202 			return (0);
2203 		}
2204 		tpp = tpp->p_next;
2205 	}
2206 	return (1);
2207 }
2208 
2209 page_t *
2210 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2211     uint_t flags)
2212 {
2213 	kmutex_t	*pcm;
2214 	int		i, fill_tried, fill_marker;
2215 	page_t		*pp, *first_pp;
2216 	uint_t		bin_marker;
2217 	int		colors, cpucolors;
2218 	uchar_t		nszc;
2219 	uint_t		nszc_color_shift;
2220 	int		nwaybins = 0, nwaycnt;
2221 
2222 	ASSERT(szc < mmu_page_sizes);
2223 
2224 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2225 
2226 	MTYPE_START(mnode, mtype, flags);
2227 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2228 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2229 		return (NULL);
2230 	}
2231 
2232 	/*
2233 	 * Set how many physical colors for this page size.
2234 	 */
2235 	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
2236 	    page_colors;
2237 
2238 	nszc = MIN(szc + 1, mmu_page_sizes - 1);
2239 	nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);
2240 
2241 	/* cpu_page_colors is non-zero if a page color may be in > 1 bin */
2242 	cpucolors = cpu_page_colors;
2243 
2244 	/*
2245 	 * adjust cpucolors to possibly check additional 'equivalent' bins
2246 	 * to try to minimize fragmentation of large pages by delaying calls
2247 	 * to page_freelist_fill.
2248 	 */
2249 	if (colorequiv > 1) {
2250 		int equivcolors = colors / colorequiv;
2251 
2252 		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
2253 			cpucolors = equivcolors;
2254 	}
2255 
2256 	ASSERT(colors <= page_colors);
2257 	ASSERT(colors);
2258 	ASSERT((colors & (colors - 1)) == 0);
2259 
2260 	ASSERT(bin < colors);
2261 
2262 	/*
2263 	 * Only hold one freelist lock at a time, that way we
2264 	 * can start anywhere and not have to worry about lock
2265 	 * ordering.
2266 	 */
2267 big_try_again:
2268 	fill_tried = 0;
2269 	nwaycnt = 0;
2270 	for (i = 0; i <= colors; i++) {
2271 try_again:
2272 		ASSERT(bin < colors);
2273 		if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
2274 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2275 			mutex_enter(pcm);
2276 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2277 			if (pp != NULL) {
2278 				/*
2279 				 * These were set before the page
2280 				 * was put on the free list;
2281 				 * they must still be set.
2282 				 */
2283 				ASSERT(PP_ISFREE(pp));
2284 				ASSERT(PP_ISAGED(pp));
2285 				ASSERT(pp->p_vnode == NULL);
2286 				ASSERT(pp->p_hash == NULL);
2287 				ASSERT(pp->p_offset == (u_offset_t)-1);
2288 				ASSERT(pp->p_szc == szc);
2289 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2290 
2291 				/*
2292 				 * Walk down the freelist.
2293 				 * 8k pages are linked on p_next
2294 				 * and p_prev fields. Large pages
2295 				 * are a contiguous group of
2296 				 * constituent pages linked together
2297 				 * on their p_next and p_prev fields.
2298 				 * The large pages are linked together
2299 				 * on the freelist using p_vpnext and
2300 				 * p_vpprev of the base constituent
2301 				 * page of each large page.
2302 				 */
2303 				first_pp = pp;
2304 				while (!page_trylock_cons(pp, SE_EXCL)) {
2305 					if (szc == 0) {
2306 						pp = pp->p_next;
2307 					} else {
2308 						pp = pp->p_vpnext;
2309 					}
2310 
2311 					ASSERT(PP_ISFREE(pp));
2312 					ASSERT(PP_ISAGED(pp));
2313 					ASSERT(pp->p_vnode == NULL);
2314 					ASSERT(pp->p_hash == NULL);
2315 					ASSERT(pp->p_offset == (u_offset_t)-1);
2316 					ASSERT(pp->p_szc == szc);
2317 					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
2318 							mnode);
2319 
2320 					if (pp == first_pp) {
2321 						pp = NULL;
2322 						break;
2323 					}
2324 				}
2325 
2326 				if (pp) {
2327 					ASSERT(mtype == PP_2_MTYPE(pp));
2328 					ASSERT(pp->p_szc == szc);
2329 					if (szc == 0) {
2330 						page_sub(&PAGE_FREELISTS(mnode,
2331 						    szc, bin, mtype), pp);
2332 					} else {
2333 						page_vpsub(&PAGE_FREELISTS(
2334 						    mnode, szc, bin, mtype),
2335 						    pp);
2336 						CHK_LPG(pp, szc);
2337 					}
2338 					page_ctr_sub(mnode, mtype, pp,
2339 					    PG_FREE_LIST);
2340 
2341 					if ((PP_ISFREE(pp) == 0) ||
2342 					    (PP_ISAGED(pp) == 0))
2343 						panic("free page is not. pp %p",
2344 						    (void *)pp);
2345 					mutex_exit(pcm);
2346 
2347 #if defined(__sparc)
2348 					ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2349 					    (flags & PG_NORELOC) == 0);
2350 
2351 					if (PP_ISNORELOC(pp)) {
2352 						pgcnt_t	npgs;
2353 
2354 						npgs = page_get_pagecnt(szc);
2355 						kcage_freemem_sub(npgs);
2356 					}
2357 #endif
2358 					VM_STAT_ADD(vmm_vmstats.
2359 					    pgmf_allocok[szc]);
2360 					return (pp);
2361 				}
2362 			}
2363 			mutex_exit(pcm);
2364 		}
2365 
2366 		/*
2367 		 * Wow! The initial bin is empty.
2368 		 * If a specific color is needed, check if the page color may be
2369 		 * in other bins. cpucolors is:
2370 		 *   0	if the colors for this cpu are equal to page_colors.
2371 		 *	This means that pages with a particular color are in a
2372 		 *	single bin.
2373 		 *  -1	if colors of cpus (cheetah+) are heterogeneous. Need to
2374 		 *	first determine the colors for the current cpu.
2375 		 *  >0	colors of all cpus are homogeneous and < page_colors
2376 		 */
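		/*
		 * For instance (assumed values): with colors == 32 and
		 * cpucolors == 8, nwaybins below becomes 4, so bins
		 * bin + 8, bin + 16 and bin + 24 are retried before
		 * falling through with the original bin restored.
		 */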
2377 
2378 		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
2379 			if (!nwaybins) {
2380 				/*
2381 				 * cpucolors is negative if ecache setsizes
2382 				 * are heterogeneous. Determine colors for this
2383 				 * particular cpu.
2384 				 */
2385 				if (cpucolors < 0) {
2386 					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
2387 					ASSERT(cpucolors > 0);
2388 					nwaybins = colors / cpucolors;
2389 				} else {
2390 					nwaybins = colors / cpucolors;
2391 					ASSERT(szc > 0 || nwaybins > 1);
2392 				}
2393 				if (nwaybins < 2)
2394 					cpucolors = 0;
2395 			}
2396 
2397 			if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
2398 				nwaycnt++;
2399 				bin = (bin + (colors / nwaybins)) &
2400 				    (colors - 1);
2401 				if (nwaycnt < nwaybins) {
2402 					goto try_again;
2403 				}
2404 			}
2405 			/* back to initial color if fall-thru */
2406 		}
2407 
2408 		/*
2409 		 * All bins that may hold this color are empty. Try to satisfy
2410 		 * the request by breaking up or coalescing pages from
2411 		 * a different size freelist of the correct color that
2412 		 * satisfies the ORIGINAL color requested. If that
2413 		 * fails then try pages of the same size but different
2414 		 * colors assuming we are not called with
2415 		 * PG_MATCH_COLOR.
2416 		 */
2417 		if (!fill_tried) {
2418 			fill_tried = 1;
2419 			fill_marker = bin >> nszc_color_shift;
2420 			pp = page_freelist_fill(szc, bin, mnode, mtype,
2421 			    PFNNULL);
2422 			if (pp != NULL) {
2423 				return (pp);
2424 			}
2425 		}
2426 
2427 		if (flags & PG_MATCH_COLOR)
2428 			break;
2429 
2430 		/*
2431 		 * Select next color bin to try.
2432 		 */
2433 		if (szc == 0) {
2434 			/*
2435 			 * PAGESIZE page case.
2436 			 */
2437 			if (i == 0) {
2438 				bin = (bin + BIN_STEP) & page_colors_mask;
2439 				bin_marker = bin;
2440 			} else {
2441 				bin = (bin + vac_colors) & page_colors_mask;
2442 				if (bin == bin_marker) {
2443 					bin = (bin + 1) & page_colors_mask;
2444 					bin_marker = bin;
2445 				}
2446 			}
2447 		} else {
2448 			/*
2449 			 * Large page case.
2450 			 */
2451 			bin = (bin + 1) & (colors - 1);
2452 		}
2453 		/*
2454 		 * If bin advanced to the next color bin of the
2455 		 * next larger pagesize, there is a chance the fill
2456 		 * could succeed.
2457 		 */
2458 		if (fill_marker != (bin >> nszc_color_shift))
2459 			fill_tried = 0;
2460 	}
2461 
2462 	/* if allowed, cycle through additional mtypes */
2463 	MTYPE_NEXT(mnode, mtype, flags);
2464 	if (mtype >= 0)
2465 		goto big_try_again;
2466 
2467 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2468 
2469 	return (NULL);
2470 }
2471 
2472 
2473 /*
2474  * Returns the count of free pages for 'pp' with size code 'szc'.
2475  * Note: This function does not return an exact value as the page freelist
2476  * locks are not held and thus the values in the page_counters may be
2477  * changing as we walk through the data.
2478  */
2479 static int
2480 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2481 {
2482 	pgcnt_t	pgfree;
2483 	pgcnt_t cnt;
2484 	ssize_t	r = szc;	/* region size */
2485 	ssize_t	idx;
2486 	int	i;
2487 	int	full, range;
2488 
2489 	/* Make sure pagenum passed in is aligned properly */
2490 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
2491 	ASSERT(szc > 0);
2492 
2493 	/* Prevent page_counters dynamic memory from being freed */
2494 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2495 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2496 	cnt = PAGE_COUNTERS(mnode, r, idx);
2497 	pgfree = cnt << PNUM_SHIFT(r - 1);
2498 	range = FULL_REGION_CNT(szc);
2499 
2500 	/* Check for completely full region */
2501 	if (cnt == range) {
2502 		rw_exit(&page_ctrs_rwlock[mnode]);
2503 		return (pgfree);
2504 	}
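	/*
	 * Worked example with assumed geometry: say szc == 2,
	 * FULL_REGION_CNT(2) == 8, FULL_REGION_CNT(1) == 16 and
	 * PNUM_SHIFT(1) == 4.  A level-2 counter of 3 means three fully
	 * free level-1 regions, contributing 3 << 4 == 48 pages above;
	 * the loop below then adds the level-1 counters of the remaining,
	 * partially free sub-regions (say 5 and 7 more pages) for an
	 * approximate total of 60 free pages.
	 */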
2505 
2506 	while (--r > 0) {
2507 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2508 		full = FULL_REGION_CNT(r);
2509 		for (i = 0; i < range; i++, idx++) {
2510 			cnt = PAGE_COUNTERS(mnode, r, idx);
2511 			/*
2512 			 * If cnt here is full, that means we have already
2513 			 * accounted for these pages earlier.
2514 			 */
2515 			if (cnt != full) {
2516 				pgfree += (cnt << PNUM_SHIFT(r - 1));
2517 			}
2518 		}
2519 		range *= full;
2520 	}
2521 	rw_exit(&page_ctrs_rwlock[mnode]);
2522 	return (pgfree);
2523 }
2524 
2525 /*
2526  * Called from page_geti_contig_pages to exclusively lock constituent pages
2527  * starting from 'spp' for page size code 'szc'.
2528  *
2529  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
2530  * region needs to be greater than or equal to the threshold.
2531  */
2532 static int
2533 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
2534 {
2535 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
2536 	pgcnt_t pgfree, i;
2537 	page_t *pp;
2538 
2539 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
2540 
2541 
2542 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
2543 		goto skipptcpcheck;
2544 	/*
2545 	 * check if there are sufficient free pages available before attempting
2546 	 * to trylock. Count is approximate as page counters can change.
2547 	 */
2548 	pgfree = page_freecnt(mnode, spp, szc);
2549 
2550 	/* attempt to trylock if there are sufficient already free pages */
2551 	if (pgfree < pgcnt/ptcpthreshold) {
2552 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
2553 		return (0);
2554 	}
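	/*
	 * As an example with a hypothetical ptcpthreshold of 2, the check
	 * above requires at least pgcnt / 2 of the constituent pages to be
	 * counted free before the trylock pass below is attempted.
	 */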
2555 
2556 skipptcpcheck:
2557 
2558 	for (i = 0; i < pgcnt; i++) {
2559 		pp = &spp[i];
2560 		if (!page_trylock(pp, SE_EXCL)) {
2561 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
2562 			while (--i != (pgcnt_t)-1) {
2563 				pp = &spp[i];
2564 				ASSERT(PAGE_EXCL(pp));
2565 				page_unlock_noretire(pp);
2566 			}
2567 			return (0);
2568 		}
2569 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
2570 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
2571 		    !PP_ISFREE(pp)) {
2572 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
2573 			ASSERT(i == 0);
2574 			page_unlock_noretire(pp);
2575 			return (0);
2576 		}
2577 		if (PP_ISNORELOC(pp)) {
2578 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
2579 			while (i != (pgcnt_t)-1) {
2580 				pp = &spp[i];
2581 				ASSERT(PAGE_EXCL(pp));
2582 				page_unlock_noretire(pp);
2583 				i--;
2584 			}
2585 			return (0);
2586 		}
2587 	}
2588 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
2589 	return (1);
2590 }
2591 
2592 /*
2593  * Claim large page pointed to by 'pp'. 'pp' is the starting set
2594  * of 'szc' constituent pages that had been locked exclusively previously.
2595  * Will attempt to relocate constituent pages in use.
2596  */
2597 static page_t *
2598 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
2599 {
2600 	spgcnt_t pgcnt, npgs, i;
2601 	page_t *targpp, *rpp, *hpp;
2602 	page_t *replpp = NULL;
2603 	page_t *pplist = NULL;
2604 
2605 	ASSERT(pp != NULL);
2606 
2607 	pgcnt = page_get_pagecnt(szc);
2608 	while (pgcnt) {
2609 		ASSERT(PAGE_EXCL(pp));
2610 		ASSERT(!PP_ISNORELOC(pp));
2611 		if (PP_ISFREE(pp)) {
2612 			/*
2613 			 * If this is a PG_FREE_LIST page then its
2614 			 * size code can change underneath us due to
2615 			 * page promotion or demotion. As an optimization
2616 			 * use page_list_sub_pages() instead of
2617 			 * page_list_sub().
2618 			 */
2619 			if (PP_ISAGED(pp)) {
2620 				page_list_sub_pages(pp, szc);
2621 				if (pp->p_szc == szc) {
2622 					return (pp);
2623 				}
2624 				ASSERT(pp->p_szc < szc);
2625 				npgs = page_get_pagecnt(pp->p_szc);
2626 				hpp = pp;
2627 				for (i = 0; i < npgs; i++, pp++) {
2628 					pp->p_szc = szc;
2629 				}
2630 				page_list_concat(&pplist, &hpp);
2631 				pgcnt -= npgs;
2632 				continue;
2633 			}
2634 			ASSERT(!PP_ISAGED(pp));
2635 			ASSERT(pp->p_szc == 0);
2636 			page_list_sub(pp, PG_CACHE_LIST);
2637 			page_hashout(pp, NULL);
2638 			PP_SETAGED(pp);
2639 			pp->p_szc = szc;
2640 			page_list_concat(&pplist, &pp);
2641 			pp++;
2642 			pgcnt--;
2643 			continue;
2644 		}
2645 		npgs = page_get_pagecnt(pp->p_szc);
2646 
2647 		/*
2648 		 * page_create_wait freemem accounting is done by the caller of
2649 		 * page_get_freelist, so it is not necessary to call it prior to
2650 		 * calling page_get_replacement_page.
2651 		 *
2652 		 * page_get_replacement_page can call page_get_contig_pages
2653 		 * to acquire a large page (szc > 0); the replacement must be
2654 		 * smaller than the contig page size to avoid looping or
2655 		 * szc == 0 and PGI_PGCPSZC0 is set.
2656 		 */
2657 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
2658 			replpp = page_get_replacement_page(pp, NULL, 0);
2659 			if (replpp) {
2660 				npgs = page_get_pagecnt(pp->p_szc);
2661 				ASSERT(npgs <= pgcnt);
2662 				targpp = pp;
2663 			}
2664 		}
2665 
2666 		/*
2667 		 * If replacement is NULL or do_page_relocate fails, fail
2668 		 * coalescing of pages.
2669 		 */
2670 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
2671 		    &npgs, NULL) != 0)) {
2672 			/*
2673 			 * Unlock un-processed target list
2674 			 */
2675 			while (pgcnt--) {
2676 				ASSERT(PAGE_EXCL(pp));
2677 				page_unlock_noretire(pp);
2678 				pp++;
2679 			}
2680 			/*
2681 			 * Free the processed target list.
2682 			 */
2683 			while (pplist) {
2684 				pp = pplist;
2685 				page_sub(&pplist, pp);
2686 				ASSERT(PAGE_EXCL(pp));
2687 				ASSERT(pp->p_szc == szc);
2688 				ASSERT(PP_ISFREE(pp));
2689 				ASSERT(PP_ISAGED(pp));
2690 				pp->p_szc = 0;
2691 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2692 				page_unlock_noretire(pp);
2693 			}
2694 
2695 			if (replpp != NULL)
2696 				page_free_replacement_page(replpp);
2697 
2698 			return (NULL);
2699 		}
2700 		ASSERT(pp == targpp);
2701 
2702 		/* LINTED */
2703 		ASSERT(hpp = pp); /* That's right, it's an assignment */
2704 
2705 		pp += npgs;
2706 		pgcnt -= npgs;
2707 
2708 		while (npgs--) {
2709 			ASSERT(PAGE_EXCL(targpp));
2710 			ASSERT(!PP_ISFREE(targpp));
2711 			ASSERT(!PP_ISNORELOC(targpp));
2712 			PP_SETFREE(targpp);
2713 			ASSERT(PP_ISAGED(targpp));
2714 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
2715 			    (flags & PGI_PGCPSZC0)));
2716 			targpp->p_szc = szc;
2717 			targpp = targpp->p_next;
2718 
2719 			rpp = replpp;
2720 			ASSERT(rpp != NULL);
2721 			page_sub(&replpp, rpp);
2722 			ASSERT(PAGE_EXCL(rpp));
2723 			ASSERT(!PP_ISFREE(rpp));
2724 			page_unlock_noretire(rpp);
2725 		}
2726 		ASSERT(targpp == hpp);
2727 		ASSERT(replpp == NULL);
2728 		page_list_concat(&pplist, &targpp);
2729 	}
2730 	CHK_LPG(pplist, szc);
2731 	return (pplist);
2732 }
2733 
2734 /*
2735  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
2736  * of 0 means nothing left after trim.
2737  */
2738 
2739 int
2740 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
2741 {
2742 	pfn_t	kcagepfn;
2743 	int	decr;
2744 	int	rc = 0;
2745 
2746 	if (PP_ISNORELOC(mseg->pages)) {
2747 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
2748 
2749 			/* lower part of this mseg inside kernel cage */
2750 			decr = kcage_current_pfn(&kcagepfn);
2751 
2752 			/* kernel cage may have transitioned past mseg */
2753 			if (kcagepfn >= mseg->pages_base &&
2754 			    kcagepfn < mseg->pages_end) {
2755 				ASSERT(decr == 0);
2756 				*lo = kcagepfn;
2757 				*hi = MIN(pfnhi,
2758 				    (mseg->pages_end - 1));
2759 				rc = 1;
2760 			}
2761 		}
2762 		/* else entire mseg in the cage */
2763 	} else {
2764 		if (PP_ISNORELOC(mseg->epages - 1)) {
2765 
2766 			/* upper part of this mseg inside kernel cage */
2767 			decr = kcage_current_pfn(&kcagepfn);
2768 
2769 			/* kernel cage may have transitioned past mseg */
2770 			if (kcagepfn >= mseg->pages_base &&
2771 			    kcagepfn < mseg->pages_end) {
2772 				ASSERT(decr);
2773 				*hi = kcagepfn;
2774 				*lo = MAX(pfnlo, mseg->pages_base);
2775 				rc = 1;
2776 			}
2777 		} else {
2778 			/* entire mseg outside of kernel cage */
2779 			*lo = MAX(pfnlo, mseg->pages_base);
2780 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
2781 			rc = 1;
2782 		}
2783 	}
2784 	return (rc);
2785 }
2786 
2787 /*
2788  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a
2789  * page with size code 'szc'. Claiming such a page requires acquiring
2790  * exclusive locks on all constituent pages (page_trylock_contig_pages),
2791  * relocating pages in use and concatenating these constituent pages into a
2792  * large page.
2793  *
2794  * The page lists do not have such a large page and page_freelist_fill has
2795  * already failed to demote larger pages and/or coalesce smaller free pages.
2796  *
2797  * 'flags' may specify PG_MATCH_COLOR which would limit the search to large
2798  * pages with the same color as 'bin'.
2799  *
2800  * 'pfnflag' specifies the subset of the pfn range to search.
2801  */
2802 
2803 
2804 static page_t *
2805 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
2806     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
2807 {
2808 	struct memseg *mseg;
2809 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
2810 	pgcnt_t szcpgmask = szcpgcnt - 1;
2811 	pfn_t	randpfn;
2812 	page_t *pp, *randpp, *endpp;
2813 	uint_t colors;
2814 	pfn_t hi, lo;
2815 	uint_t skip;
2816 
2817 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
2818 
2819 	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
2820 		return (NULL);
2821 
2822 	ASSERT(szc < mmu_page_sizes);
2823 
2824 	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
2825 	    page_colors;
2826 
2827 	ASSERT(bin < colors);
2828 
2829 	/*
2830 	 * trim the pfn range to search based on pfnflag. pfnflag is set
2831 	 * when there have been previous page_get_contig_pages failures to
2832 	 * limit the search.
2833 	 *
2834 	 * The high bit in pfnflag specifies the number of 'slots' in the
2835 	 * pfn range and the remainder of pfnflag specifies which slot.
2836 	 * For example, a value of 1010b would mean the second slot of
2837 	 * the pfn range that has been divided into 8 slots.
2838 	 */
2839 	if (pfnflag > 1) {
2840 		int	slots = 1 << (highbit(pfnflag) - 1);
2841 		int	slotid = pfnflag & (slots - 1);
2842 		pgcnt_t	szcpages;
2843 		int	slotlen;
2844 
2845 		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
2846 		pfnhi = pfnhi & ~(szcpgcnt - 1);
2847 
2848 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
2849 		slotlen = howmany(szcpages, slots);
2850 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
2851 		ASSERT(pfnlo < pfnhi);
2852 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
2853 			pfnhi = pfnlo + (slotlen * szcpgcnt);
2854 	}
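	/*
	 * A worked example of the trimming above, with assumed values:
	 * a pfnflag of 1010b yields slots == 8 and slotid == 2.  With
	 * szcpgcnt == 8, pfnlo == 0 and pfnhi == 511, the range is first
	 * aligned to [0, 504], giving szcpages == 63 and slotlen == 8,
	 * so the search is narrowed to pfns 128 through 192.
	 */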
2855 
2856 	memsegs_lock(0);
2857 
2858 	/*
2859 	 * loop through memsegs to look for contig page candidates
2860 	 */
2861 
2862 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
2863 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
2864 			/* no overlap */
2865 			continue;
2866 		}
2867 
2868 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
2869 			/* mseg too small */
2870 			continue;
2871 
2872 		/* trim off kernel cage pages from pfn range */
2873 		if (kcage_on) {
2874 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
2875 				continue;
2876 		} else {
2877 			lo = MAX(pfnlo, mseg->pages_base);
2878 			hi = MIN(pfnhi, (mseg->pages_end - 1));
2879 		}
2880 
2881 		/* round to szcpgcnt boundaries */
2882 		lo = P2ROUNDUP(lo, szcpgcnt);
2883 		hi = hi & ~(szcpgcnt - 1);
2884 
2885 		if (hi <= lo)
2886 			continue;
2887 
2888 		/*
2889 		 * set lo to point to the pfn for the desired bin. Large
2890 		 * page sizes may only have a single page color
2891 		 */
2892 		if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
2893 			uint_t	lobin;
2894 
2895 			/*
2896 			 * factor in colorequiv to check additional
2897 			 * 'equivalent' bins.
2898 			 */
2899 			if (colorequiv > 1 && colors > colorequiv)
2900 				colors = colors / colorequiv;
2901 
2902 			/* determine bin that lo currently points to */
2903 			lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt;
2904 
2905 			/*
2906 			 * set lo to point at appropriate color and set skip
2907 			 * to arrive at the next szc page of the same color.
2908 			 */
2909 			lo += ((bin - lobin) & (colors - 1)) * szcpgcnt;
2910 
2911 			skip = colors * szcpgcnt;
2912 		} else {
2913 			/* check all pages starting from lo */
2914 			skip = szcpgcnt;
2915 		}
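		/*
		 * As an illustration of the color-match case above (assumed
		 * values): with szcpgcnt == 8 and colors == 4 the large-page
		 * bins repeat every 32 pfns, so if lo currently sits in bin 1
		 * and the caller asked for bin 3, lo is advanced by
		 * 2 * szcpgcnt pfns and skip becomes 32.
		 */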
2916 		if (hi <= lo)
2917 			/* mseg cannot satisfy color request */
2918 			continue;
2919 
2920 		/* randomly choose a point between lo and hi to begin search */
2921 
2922 		randpfn = (pfn_t)GETTICK();
2923 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
2924 		randpp = mseg->pages + (randpfn - mseg->pages_base);
2925 
2926 		ASSERT(randpp->p_pagenum == randpfn);
2927 
2928 		pp = randpp;
2929 		endpp =  mseg->pages + (hi - mseg->pages_base);
2930 
2931 		ASSERT(randpp + szcpgcnt <= endpp);
2932 
2933 		do {
2934 			ASSERT(!(pp->p_pagenum & szcpgmask));
2935 			ASSERT((flags & PG_MATCH_COLOR) == 0 ||
2936 			    colorequiv > 1 ||
2937 			    PP_2_BIN(pp) == bin);
2938 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
2939 				/* pages unlocked by page_claim on failure */
2940 				if (page_claim_contig_pages(pp, szc, flags)) {
2941 					memsegs_unlock(0);
2942 					return (pp);
2943 				}
2944 			}
2945 
2946 			pp += skip;
2947 			if (pp >= endpp) {
2948 				/* start from the beginning */
2949 				pp = mseg->pages + (lo - mseg->pages_base);
2950 				ASSERT(pp->p_pagenum == lo);
2951 				ASSERT(pp + szcpgcnt <= endpp);
2952 			}
2953 		} while (pp != randpp);
2954 	}
2955 	memsegs_unlock(0);
2956 	return (NULL);
2957 }
2958 
2959 
2960 /*
2961  * controlling routine that searches through physical memory in an attempt
2962  * to claim a large page, based on the input parameters, when such a page
2963  * is not available on the page free lists.
2964  *
2965  * calls page_geti_contig_pages with an initial pfn range from the mnode
2966  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
2967  * that overlaps with the kernel cage or does not match the requested page
2968  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
2969  * page_geti_contig_pages may further limit the search range based on
2970  * previous failure counts (pgcpfailcnt[]).
2971  *
2972  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
2973  * pagesize page that satisfies mtype.
2974  */
2975 page_t *
2976 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
2977     uint_t flags)
2978 {
2979 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
2980 	page_t		*pp;
2981 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
2982 
2983 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
2984 
2985 	/* no allocations from cage */
2986 	flags |= PGI_NOCAGE;
2987 
2988 	/* LINTED */
2989 	MTYPE_START(mnode, mtype, flags);
2990 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2991 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
2992 		return (NULL);
2993 	}
2994 
2995 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
2996 
2997 	/* do not limit search and ignore color if hi pri */
2998 
2999 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3000 		pfnflag = pgcpfailcnt[szc];
3001 
3002 	/* remove color match to improve chances */
3003 
3004 	if (flags & PGI_PGCPHIPRI || pfnflag)
3005 		flags &= ~PG_MATCH_COLOR;
3006 
3007 	do {
3008 		/* get pfn range based on mnode and mtype */
3009 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3010 
3011 		ASSERT(pfnhi >= pfnlo);
3012 
3013 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3014 		    pfnlo, pfnhi, pfnflag);
3015 
3016 		if (pp != NULL) {
3017 			pfnflag = pgcpfailcnt[szc];
3018 			if (pfnflag) {
3019 				/* double the search size */
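				/*
				 * Since the high bit of pfnflag encodes the
				 * number of search slots (see
				 * page_geti_contig_pages), shifting right by
				 * one halves the slot count, i.e. each slot of
				 * the next limited search spans twice the
				 * pfn range.
				 */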
3020 				pgcpfailcnt[szc] = pfnflag >> 1;
3021 			}
3022 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3023 			return (pp);
3024 		}
3025 		MTYPE_NEXT(mnode, mtype, flags);
3026 	} while (mtype >= 0);
3027 
3028 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3029 	return (NULL);
3030 }
3031 
3032 
3033 /*
3034  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3035  *
3036  * Does its own locking and accounting.
3037  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3038  * pages of the proper color even if there are pages of a different color.
3039  *
3040  * Finds a page, removes it, THEN locks it.
3041  */
3042 
3043 /*ARGSUSED*/
3044 page_t *
3045 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3046 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3047 {
3048 	struct as	*as = seg->s_as;
3049 	page_t		*pp = NULL;
3050 	ulong_t		bin;
3051 	uchar_t		szc;
3052 	int		mnode;
3053 	int		mtype;
3054 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3055 	lgrp_mnode_cookie_t	lgrp_cookie;
3056 
3057 	page_get_func = page_get_mnode_freelist;
3058 
3059 	/*
3060 	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
3061 	 * assume we wish to allocate near to the current thread's home.
3062 	 */
3063 	if (!LGRP_EXISTS(lgrp))
3064 		lgrp = lgrp_home_lgrp();
3065 
3066 	if (kcage_on) {
3067 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3068 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3069 		    curthread != kcage_cageout_thread) {
3070 			/*
3071 			 * Set a "reserve" of kcage_throttlefree pages for
3072 			 * PG_PANIC and cageout thread allocations.
3073 			 *
3074 			 * Everybody else has to serialize in
3075 			 * page_create_get_something() to get a cage page, so
3076 			 * that we don't deadlock cageout!
3077 			 */
3078 			return (NULL);
3079 		}
3080 	} else {
3081 		flags &= ~PG_NORELOC;
3082 		flags |= PGI_NOCAGE;
3083 	}
3084 
3085 	/* LINTED */
3086 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3087 
3088 	/*
3089 	 * Convert size to page size code.
3090 	 */
3091 	if ((szc = page_szc(size)) == (uchar_t)-1)
3092 		panic("page_get_freelist: illegal page size request");
3093 	ASSERT(szc < mmu_page_sizes);
3094 
3095 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3096 
3097 	/* LINTED */
3098 	AS_2_BIN(as, seg, vp, vaddr, bin);
3099 
3100 	/* bin is for base pagesize color - convert if larger pagesize. */
3101 	if (szc)
3102 		bin = page_convert_color(0, szc, bin);
3103 
3104 	/*
3105 	 * Try to get a local page first, but try remote if we can't
3106 	 * get a page of the right color.
3107 	 */
3108 pgretry:
3109 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3110 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3111 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3112 		if (pp != NULL) {
3113 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3114 			DTRACE_PROBE4(page__get,
3115 			    lgrp_t *, lgrp,
3116 			    int, mnode,
3117 			    ulong_t, bin,
3118 			    uint_t, flags);
3119 			return (pp);
3120 		}
3121 	}
3122 	ASSERT(pp == NULL);
3123 
3124 	/*
3125 	 * For PAGESIZE requests without PGI_PGCPSZC0 set, check the cachelist
3126 	 * before checking remote free lists.  The caller is expected to call
3127 	 * page_get_cachelist, which checks local cache lists and remote free lists.
3128 	 */
3129 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3130 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3131 		return (NULL);
3132 	}
3133 
3134 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3135 
3136 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3137 
3138 	/*
3139 	 * Try to get a non-local freelist page.
3140 	 */
3141 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3142 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3143 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3144 		if (pp != NULL) {
3145 			DTRACE_PROBE4(page__get,
3146 			    lgrp_t *, lgrp,
3147 			    int, mnode,
3148 			    ulong_t, bin,
3149 			    uint_t, flags);
3150 			VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3151 			return (pp);
3152 		}
3153 	}
3154 
3155 	ASSERT(pp == NULL);
3156 
3157 	/*
3158 	 * When the cage is off, chances are page_get_contig_pages() will fail
3159 	 * to lock a large page chunk, so it is not called by default.  This
3160 	 * can be changed via /etc/system.
3161 	 *
3162 	 * page_get_contig_pages() also called to acquire a base pagesize page
3163 	 * for page_create_get_something().
3164 	 */
3165 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3166 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3167 	    (page_get_func != page_get_contig_pages)) {
3168 
3169 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3170 		page_get_func = page_get_contig_pages;
3171 		goto pgretry;
3172 	}
3173 
3174 	if (pgcplimitsearch && page_get_func == page_get_contig_pages)
3175 		SETPGCPFAILCNT(szc);
3176 
3177 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3178 	return (NULL);
3179 }
3180 
3181 /*
3182  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3183  *
3184  * Does its own locking.
3185  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3186  * pages of the proper color even if there are pages of a different color.
3187  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3188  * try to lock one of them.  If no page can be locked, try the
3189  * next bin.  Return NULL if a page cannot be found and locked.
3190  *
3191  * Finds a page, tries to lock it, then removes it.
3192  */
3193 
3194 /*ARGSUSED*/
3195 page_t *
3196 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3197     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3198 {
3199 	page_t		*pp;
3200 	struct as	*as = seg->s_as;
3201 	ulong_t		bin;
3202 	/*LINTED*/
3203 	int		mnode;
3204 	int		mtype;
3205 	lgrp_mnode_cookie_t	lgrp_cookie;
3206 
3207 	/*
3208 	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
3209 	 * assume we wish to allocate near to the current thread's home.
3210 	 */
3211 	if (!LGRP_EXISTS(lgrp))
3212 		lgrp = lgrp_home_lgrp();
3213 
3214 	if (!kcage_on) {
3215 		flags &= ~PG_NORELOC;
3216 		flags |= PGI_NOCAGE;
3217 	}
3218 
3219 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3220 	    kcage_freemem <= kcage_throttlefree) {
3221 		/*
3222 		 * Reserve kcage_throttlefree pages for critical kernel
3223 		 * threads.
3224 		 *
3225 		 * Everybody else has to go to page_create_get_something()
3226 		 * to get a cage page, so we don't deadlock cageout.
3227 		 */
3228 		return (NULL);
3229 	}
3230 
3231 	/* LINTED */
3232 	AS_2_BIN(as, seg, vp, vaddr, bin);
3233 
3234 	ASSERT(bin <= page_colors_mask);
3235 
3236 	/* LINTED */
3237 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3238 
3239 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3240 
3241 	/*
3242 	 * Try local cachelists first
3243 	 */
3244 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3245 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3246 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3247 		if (pp != NULL) {
3248 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3249 			DTRACE_PROBE4(page__get,
3250 			    lgrp_t *, lgrp,
3251 			    int, mnode,
3252 			    ulong_t, bin,
3253 			    uint_t, flags);
3254 			return (pp);
3255 		}
3256 	}
3257 
3258 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3259 
3260 	/*
3261 	 * Try freelists/cachelists that are farther away
3262 	 * This is our only chance to allocate remote pages for PAGESIZE
3263 	 * requests.
3264 	 */
3265 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3266 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3267 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3268 		    0, flags);
3269 		if (pp != NULL) {
3270 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3271 			DTRACE_PROBE4(page__get,
3272 			    lgrp_t *, lgrp,
3273 			    int, mnode,
3274 			    ulong_t, bin,
3275 			    uint_t, flags);
3276 			return (pp);
3277 		}
3278 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3279 		if (pp != NULL) {
3280 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3281 			DTRACE_PROBE4(page__get,
3282 			    lgrp_t *, lgrp,
3283 			    int, mnode,
3284 			    ulong_t, bin,
3285 			    uint_t, flags);
3286 			return (pp);
3287 		}
3288 	}
3289 
3290 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3291 	return (NULL);
3292 }
3293 
3294 page_t *
3295 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3296 {
3297 	kmutex_t	*pcm;
3298 	int		i;
3299 	page_t		*pp;
3300 	page_t		*first_pp;
3301 	uint_t		bin_marker;
3302 	int		nwaybins, nwaycnt;
3303 	int		cpucolors;
3304 
3305 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3306 
3307 	/* LINTED */
3308 	MTYPE_START(mnode, mtype, flags);
3309 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3310 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3311 		return (NULL);
3312 	}
3313 
3314 	nwaybins = 0;
3315 	cpucolors = cpu_page_colors;
3316 	/*
3317 	 * adjust cpucolors to possibly check additional 'equivalent' bins
3318 	 * to try to minimize fragmentation of large pages by delaying calls
3319 	 * to page_freelist_fill.
3320 	 */
3321 	if (colorequiv > 1) {
3322 		int equivcolors = page_colors / colorequiv;
3323 
3324 		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
3325 			cpucolors = equivcolors;
3326 	}
3327 
3328 	/*
3329 	 * Only hold one cachelist lock at a time, that way we
3330 	 * can start anywhere and not have to worry about lock
3331 	 * ordering.
3332 	 */
3333 
3334 big_try_again:
3335 	nwaycnt = 0;
3336 	for (i = 0; i <= page_colors; i++) {
3337 		if (PAGE_CACHELISTS(mnode, bin, mtype)) {
3338 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3339 			mutex_enter(pcm);
3340 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3341 			if (pp != NULL) {
3342 				first_pp = pp;
3343 				ASSERT(pp->p_vnode);
3344 				ASSERT(PP_ISAGED(pp) == 0);
3345 				ASSERT(pp->p_szc == 0);
3346 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3347 				while (!page_trylock(pp, SE_EXCL)) {
3348 					pp = pp->p_next;
3349 					ASSERT(pp->p_szc == 0);
3350 					if (pp == first_pp) {
3351 						/*
3352 						 * We have searched the
3353 						 * complete list!
3354 						 * And all of them (might
3355 						 * only be one) are locked.
3356 						 * This can happen since
3357 						 * these pages can also be
3358 						 * found via the hash list.
3359 						 * When found via the hash
3360 						 * list, they are locked
3361 						 * first, then removed.
3362 						 * We give up to let the
3363 						 * other thread run.
3364 						 */
3365 						pp = NULL;
3366 						break;
3367 					}
3368 					ASSERT(pp->p_vnode);
3369 					ASSERT(PP_ISFREE(pp));
3370 					ASSERT(PP_ISAGED(pp) == 0);
3371 					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3372 							mnode);
3373 				}
3374 
3375 				if (pp) {
3376 					page_t	**ppp;
3377 					/*
3378 					 * Found and locked a page.
3379 					 * Pull it off the list.
3380 					 */
3381 					ASSERT(mtype == PP_2_MTYPE(pp));
3382 					ppp = &PAGE_CACHELISTS(mnode, bin,
3383 					    mtype);
3384 					page_sub(ppp, pp);
3385 					/*
3386 					 * Subtract counters before releasing
3387 					 * pcm mutex to avoid a race with
3388 					 * page_freelist_coalesce and
3389 					 * page_freelist_fill.
3390 					 */
3391 					page_ctr_sub(mnode, mtype, pp,
3392 					    PG_CACHE_LIST);
3393 					mutex_exit(pcm);
3394 					ASSERT(pp->p_vnode);
3395 					ASSERT(PP_ISAGED(pp) == 0);
3396 #if defined(__sparc)
3397 					ASSERT(!kcage_on ||
3398 					    (flags & PG_NORELOC) == 0 ||
3399 					    PP_ISNORELOC(pp));
3400 					if (PP_ISNORELOC(pp)) {
3401 						kcage_freemem_sub(1);
3402 					}
3403 #endif
3404 					VM_STAT_ADD(vmm_vmstats.
3405 					    pgmc_allocok);
3406 					return (pp);
3407 				}
3408 			}
3409 			mutex_exit(pcm);
3410 		}
3411 
3412 		/*
3413 		 * Wow! The initial bin is empty or no page in the bin could
3414 		 * be locked.
3415 		 *
3416 		 * If specific color is needed, check if page color may be in
3417 		 * other bins.
3418 		 */
3419 		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
3420 			if (!nwaybins) {
3421 				if (cpucolors < 0) {
3422 					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
3423 					ASSERT(cpucolors > 0);
3424 					nwaybins = page_colors / cpucolors;
3425 					if (nwaybins < 2)
3426 						cpucolors = 0;
3427 				} else {
3428 					nwaybins = page_colors / cpucolors;
3429 					ASSERT(nwaybins > 1);
3430 				}
3431 			}
3432 
3433 			if (++nwaycnt >= nwaybins) {
3434 				break;
3435 			}
3436 			bin = (bin + (page_colors / nwaybins)) &
3437 			    page_colors_mask;
3438 			continue;
3439 		}
3440 
3441 		if (i == 0) {
3442 			bin = (bin + BIN_STEP) & page_colors_mask;
3443 			bin_marker = bin;
3444 		} else {
3445 			bin = (bin + vac_colors) & page_colors_mask;
3446 			if (bin == bin_marker) {
3447 				bin = (bin + 1) & page_colors_mask;
3448 				bin_marker = bin;
3449 			}
3450 		}
3451 	}
3452 
3453 	MTYPE_NEXT(mnode, mtype, flags);
3454 	if (mtype >= 0)
3455 		goto big_try_again;
3456 
3457 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3458 	return (NULL);
3459 }
3460 
3461 #ifdef DEBUG
3462 #define	REPL_PAGE_STATS
3463 #endif /* DEBUG */
3464 
3465 #ifdef REPL_PAGE_STATS
3466 struct repl_page_stats {
3467 	uint_t	ngets;
3468 	uint_t	ngets_noreloc;
3469 	uint_t	npgr_noreloc;
3470 	uint_t	nnopage_first;
3471 	uint_t	nnopage;
3472 	uint_t	nhashout;
3473 	uint_t	nnofree;
3474 	uint_t	nnext_pp;
3475 } repl_page_stats;
3476 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
3477 #else /* REPL_PAGE_STATS */
3478 #define	REPL_STAT_INCR(v)
3479 #endif /* REPL_PAGE_STATS */
3480 
3481 int	pgrppgcp;
3482 
3483 /*
3484  * The freemem accounting must be done by the caller.
3485  * First we try to get a replacement page of the same size as like_pp;
3486  * if that is not possible, then we just get a set of discontiguous
3487  * PAGESIZE pages.
3488  */
3489 page_t *
3490 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3491     uint_t pgrflags)
3492 {
3493 	page_t		*like_pp;
3494 	page_t		*pp, *pplist;
3495 	page_t		*pl = NULL;
3496 	ulong_t		bin;
3497 	int		mnode, page_mnode;
3498 	int		szc;
3499 	spgcnt_t	npgs, pg_cnt;
3500 	pfn_t		pfnum;
3501 	int		mtype;
3502 	int		flags = 0;
3503 	lgrp_mnode_cookie_t	lgrp_cookie;
3504 	lgrp_t		*lgrp;
3505 
3506 	REPL_STAT_INCR(ngets);
3507 	like_pp = orig_like_pp;
3508 	ASSERT(PAGE_EXCL(like_pp));
3509 
3510 	szc = like_pp->p_szc;
3511 	npgs = page_get_pagecnt(szc);
3512 	/*
3513 	 * Now we reset like_pp to the base page_t.
3514 	 * That way, we won't walk past the end of this 'szc' page.
3515 	 */
3516 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
3517 	like_pp = page_numtopp_nolock(pfnum);
3518 	ASSERT(like_pp->p_szc == szc);
3519 
3520 	if (PP_ISNORELOC(like_pp)) {
3521 		ASSERT(kcage_on);
3522 		REPL_STAT_INCR(ngets_noreloc);
3523 		flags = PGI_RELOCONLY;
3524 	} else if (pgrflags & PGR_NORELOC) {
3525 		ASSERT(kcage_on);
3526 		REPL_STAT_INCR(npgr_noreloc);
3527 		flags = PG_NORELOC;
3528 	}
3529 
3530 	/*
3531 	 * Kernel pages must always be replaced with the same size
3532 	 * pages, since we cannot properly handle demotion of kernel
3533 	 * pages.
3534 	 */
3535 	if (like_pp->p_vnode == &kvp)
3536 		pgrflags |= PGR_SAMESZC;
3537 
3538 	/* LINTED */
3539 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
3540 
3541 	while (npgs) {
3542 		pplist = NULL;
3543 		for (;;) {
3544 			pg_cnt = page_get_pagecnt(szc);
3545 			bin = PP_2_BIN(like_pp);
3546 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
3547 			ASSERT(pg_cnt <= npgs);
3548 
3549 			/*
3550 			 * If an lgroup was specified, try to get the
3551 			 * page from that lgroup.
3552 			 * NOTE: Must be careful with code below because
3553 			 *	 lgroup may disappear and reappear since there
3554 			 *	 is no locking for lgroup here.
3555 			 */
3556 			if (LGRP_EXISTS(lgrp_target)) {
3557 				/*
3558 				 * Keep local variable for lgroup separate
3559 				 * from lgroup argument since this code should
3560 				 * only be exercised when lgroup argument
3561 				 * exists....
3562 				 */
3563 				lgrp = lgrp_target;
3564 
3565 				/* Try the lgroup's freelists first */
3566 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3567 				    LGRP_SRCH_LOCAL);
3568 				while ((pplist == NULL) &&
3569 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3570 				    != -1) {
3571 					pplist = page_get_mnode_freelist(
3572 						mnode, bin, mtype, szc,
3573 						    flags);
3574 				}
3575 
3576 				/*
3577 				 * Now try its cachelists if this is a
3578 				 * small page. Don't need to do it for
3579 				 * larger ones since page_freelist_coalesce()
3580 				 * already failed.
3581 				 */
3582 				if (pplist != NULL || szc != 0)
3583 					break;
3584 
3585 				/* Now try its cachelists */
3586 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3587 				    LGRP_SRCH_LOCAL);
3588 
3589 				while ((pplist == NULL) &&
3590 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3591 				    != -1) {
3592 					pplist = page_get_mnode_cachelist(
3593 						bin, flags, mnode, mtype);
3594 				}
3595 				if (pplist != NULL) {
3596 					page_hashout(pplist, NULL);
3597 					PP_SETAGED(pplist);
3598 					REPL_STAT_INCR(nhashout);
3599 					break;
3600 				}
3601 				/* Done looking in this lgroup. Bail out. */
3602 				break;
3603 			}
3604 
3605 			/*
3606 			 * No lgroup was specified (or lgroup was removed by
3607 			 * DR), so just try to get the page as close to
3608 			 * like_pp's mnode as possible.
3609 			 * First try the local freelist...
3610 			 */
3611 			mnode = PP_2_MEM_NODE(like_pp);
3612 			pplist = page_get_mnode_freelist(mnode, bin,
3613 			    mtype, szc, flags);
3614 			if (pplist != NULL)
3615 				break;
3616 
3617 			REPL_STAT_INCR(nnofree);
3618 
3619 			/*
3620 			 * ...then the local cachelist. Don't need to do it for
3621 			 * larger pages because page_freelist_coalesce() already
3622 			 * failed there anyway.
3623 			 */
3624 			if (szc == 0) {
3625 				pplist = page_get_mnode_cachelist(bin, flags,
3626 				    mnode, mtype);
3627 				if (pplist != NULL) {
3628 					page_hashout(pplist, NULL);
3629 					PP_SETAGED(pplist);
3630 					REPL_STAT_INCR(nhashout);
3631 					break;
3632 				}
3633 			}
3634 
3635 			/* Now try remote freelists */
3636 			page_mnode = mnode;
3637 			lgrp =
3638 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
3639 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3640 			    LGRP_SRCH_HIER);
3641 			while (pplist == NULL &&
3642 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3643 			    != -1) {
3644 				/*
3645 				 * Skip local mnode.
3646 				 */
3647 				if ((mnode == page_mnode) ||
3648 				    (mem_node_config[mnode].exists == 0))
3649 					continue;
3650 
3651 				pplist = page_get_mnode_freelist(mnode,
3652 				    bin, mtype, szc, flags);
3653 			}
3654 
3655 			if (pplist != NULL)
3656 				break;
3657 
3658 
3659 			/* Now try remote cachelists */
3660 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3661 			    LGRP_SRCH_HIER);
3662 			while (pplist == NULL && szc == 0) {
3663 				mnode = lgrp_memnode_choose(&lgrp_cookie);
3664 				if (mnode == -1)
3665 					break;
3666 				/*
3667 				 * Skip local mnode.
3668 				 */
3669 				if ((mnode == page_mnode) ||
3670 				    (mem_node_config[mnode].exists == 0))
3671 					continue;
3672 
3673 				pplist = page_get_mnode_cachelist(bin,
3674 				    flags, mnode, mtype);
3675 
3676 				if (pplist != NULL) {
3677 					page_hashout(pplist, NULL);
3678 					PP_SETAGED(pplist);
3679 					REPL_STAT_INCR(nhashout);
3680 					break;
3681 				}
3682 			}
3683 
3684 			/*
3685 			 * Break out of while loop under the following cases:
3686 			 * - If we successfully got a page.
3687 			 * - If pgrflags specified only returning a specific
3688 			 *   page size and we could not find that page size.
3689 			 * - If we could not satisfy the request with PAGESIZE
3690 			 *   or larger pages.
3691 			 */
3692 			if (pplist != NULL || szc == 0)
3693 				break;
3694 
3695 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
3696 				/* try to find contig page */
3697 
3698 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3699 				    LGRP_SRCH_HIER);
3700 
3701 				while ((pplist == NULL) &&
3702 				    (mnode =
3703 					lgrp_memnode_choose(&lgrp_cookie))
3704 				    != -1) {
3705 					pplist = page_get_contig_pages(
3706 						mnode, bin, mtype, szc,
3707 						    flags | PGI_PGCPHIPRI);
3708 				}
3709 				break;
3710 			}
3711 
3712 			/*
3713 			 * The correct thing to do here is try the next
3714 			 * page size down using szc--. Due to a bug
3715 			 * with the processing of HAT_RELOAD_SHARE
3716 			 * where the sfmmu_ttecnt arrays of all
3717 			 * hats sharing an ISM segment don't get updated,
3718 			 * using intermediate size pages for relocation
3719 			 * can lead to continuous page faults.
3720 			 */
3721 			szc = 0;
3722 		}
3723 
3724 		if (pplist != NULL) {
3725 			DTRACE_PROBE4(page__get,
3726 			    lgrp_t *, lgrp,
3727 			    int, mnode,
3728 			    ulong_t, bin,
3729 			    uint_t, flags);
3730 
3731 			while (pplist != NULL && pg_cnt--) {
3732 				ASSERT(pplist != NULL);
3733 				pp = pplist;
3734 				page_sub(&pplist, pp);
3735 				PP_CLRFREE(pp);
3736 				PP_CLRAGED(pp);
3737 				page_list_concat(&pl, &pp);
3738 				npgs--;
3739 				like_pp = like_pp + 1;
3740 				REPL_STAT_INCR(nnext_pp);
3741 			}
3742 			ASSERT(pg_cnt == 0);
3743 		} else {
3744 			break;
3745 		}
3746 	}
3747 
3748 	if (npgs) {
3749 		/*
3750 		 * We were unable to allocate the necessary number
3751 		 * of pages.
3752 		 * We need to free up any pl.
3753 		 */
3754 		REPL_STAT_INCR(nnopage);
3755 		page_free_replacement_page(pl);
3756 		return (NULL);
3757 	} else {
3758 		return (pl);
3759 	}
3760 }
3761 
3762 /*
3763  * demote a free large page to its constituent pages
3764  */
3765 void
3766 page_demote_free_pages(page_t *pp)
3767 {
3768 
3769 	int mnode;
3770 
3771 	ASSERT(pp != NULL);
3772 	ASSERT(PAGE_LOCKED(pp));
3773 	ASSERT(PP_ISFREE(pp));
3774 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
3775 
3776 	mnode = PP_2_MEM_NODE(pp);
3777 	page_freelist_lock(mnode);
3778 	if (pp->p_szc != 0) {
3779 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
3780 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
3781 	}
3782 	page_freelist_unlock(mnode);
3783 	ASSERT(pp->p_szc == 0);
3784 }
3785