xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision e753f464d28e02e23aa93bd7d51d39fc56f79897)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 
35 /*
36  * This file contains common functions to access and manage the page lists.
37  * Many of these routines originated from platform dependent modules
38  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
39  * a platform independent manner.
40  *
41  * vm/vm_dep.h provides for platform specific support.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/debug.h>
46 #include <sys/cmn_err.h>
47 #include <sys/systm.h>
48 #include <sys/atomic.h>
49 #include <sys/sysmacros.h>
50 #include <vm/as.h>
51 #include <vm/page.h>
52 #include <vm/seg_kmem.h>
53 #include <vm/seg_vn.h>
54 #include <sys/vmsystm.h>
55 #include <sys/memnode.h>
56 #include <vm/vm_dep.h>
57 #include <sys/lgrp.h>
58 #include <sys/mem_config.h>
59 #include <sys/callb.h>
60 #include <sys/mem_cage.h>
61 #include <sys/sdt.h>
62 #include <sys/dumphdr.h>
63 
64 extern uint_t	vac_colors;
65 
66 #define	MAX_PRAGMA_ALIGN	128	/* cap on the #pragma align value used below */
67 
68 /*
    * vm_cpu_data0 for the boot cpu before kmem is initialized.
    * cpu_vm_data_init() hands this buffer to CPU0.  It is aligned to
    * L2CACHE_ALIGN_MAX unless that exceeds MAX_PRAGMA_ALIGN, in which case
    * MAX_PRAGMA_ALIGN is used instead.
    */
69 
70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
71 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
72 #else
73 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
74 #endif
75 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
76 
77 /*
78  * number of page colors equivalent to requested color in page_get routines.
79  * If set, keeps large pages intact longer and keeps MPO allocation
80  * from the local mnode in favor of acquiring the 'correct' page color from
81  * a demoted large page or from a remote mnode.
82  */
83 uint_t	colorequiv;
84 
85 /*
86  * color equivalency mask for each page size.
87  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
88  * High 4 bits determine the number of high order bits of the color to ignore.
89  * Low 4 bits determine the number of low order bits of color to ignore (it's
90  * only relevant for hashed index based page coloring).
91  */
92 uchar_t colorequivszc[MMU_PAGE_SIZES];
93 
94 /*
95  * if set, specifies the percentage of large pages that are free from within
96  * a large page region before attempting to lock those pages for
97  * page_get_contig_pages processing.
98  *
99  * Should be turned on when kpr is available when page_trylock_contig_pages
100  * can be more selective.
101  */
102 
103 int	ptcpthreshold;
104 
105 /*
106  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
107  * Enabled by default via pgcplimitsearch.
108  *
109  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
110  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
111  * bound. This upper bound range guarantees:
112  *    - all large page 'slots' will be searched over time
113  *    - the minimum (1) large page candidates considered on each pgcp call
114  *    - count doesn't wrap around to 0
115  */
116 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];	/* one failure count per page size code */
117 int	pgcplimitsearch = 1;		/* non-zero: limiting enabled (default) */
118 
/*
 * PGCPFAILMAX is the upper bound for pgcpfailcnt[]: a single bit positioned
 * from highbit(physinstalled).  The shift is performed in pgcnt_t rather
 * than int so that it remains well defined when the bit position is 31 or
 * higher (very large physinstalled), and so that the bound has the same type
 * as the counters it is compared with and assigned to.
 *
 * SETPGCPFAILCNT(szc) bumps the failure count for size code szc and, once it
 * reaches PGCPFAILMAX, resets it to half of that bound.  It expands to an
 * unbraced if statement, so do not use it as the body of an if that has an
 * else.
 */
#define	PGCPFAILMAX		((pgcnt_t)1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
123 
124 #ifdef VM_STATS
125 struct vmm_vmstats_str  vmm_vmstats;	/* only defined when VM_STATS is set */
126 
127 #endif /* VM_STATS */
128 
129 #if defined(__sparc)
130 #define	LPGCREATE	0
131 #else
132 /* enable page_get_contig_pages */
133 #define	LPGCREATE	1
134 #endif
135 
136 int pg_contig_disable;
137 int pg_lpgcreate_nocage = LPGCREATE;	/* 0 on sparc, 1 elsewhere */
138 
139 /*
140  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
141  */
142 #define	PFNNULL		0
143 
144 /* Flags involved in promotion and demotion routines */
145 #define	PC_FREE		0x1	/* put page on freelist */
146 #define	PC_ALLOC	0x2	/* return page for allocation */
147 
148 /*
149  * Flag for page_demote to be used with PC_FREE to denote that we don't care
150  * what the color is as the color parameter to the function is ignored.
151  */
152 #define	PC_NO_COLOR	(-1)
153 
154 /* mtype value for page_promote to use when mtype does not matter */
155 #define	PC_MTYPE_ANY	(-1)
156 
157 /*
158  * page counters candidates info
159  * See page_ctrs_cands comment below for more details.
160  * fields are as follows:
161  *	pcc_pages_free:		# pages which freelist coalesce can create
162  *	pcc_color_free:		pointer to page free counts per color
163  *	pad:			unused filler.  NOTE(review): presumably sizes
    *				the structure to limit cache line sharing
    *				between entries - confirm.
164  */
165 typedef struct pcc_info {
166 	pgcnt_t	pcc_pages_free;
167 	pgcnt_t	*pcc_color_free;
168 	uint_t	pad[12];
169 } pcc_info_t;
170 
171 /*
172  * On big machines it can take a long time to check page_counters
173  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
174  * updated sum of all elements of the corresponding page_counters arrays.
175  * page_freelist_coalesce() searches page_counters only if an appropriate
176  * element of page_ctrs_cands array is greater than 0.
177  *
178  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
179  */
180 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
181 
182 /*
183  * Return in val the total number of free pages which can be created
184  * for the given mnode (m), mrange (g), and region size (r).
    * The total is the sum over all NPC_MUTEX slices.  The macro declares its
    * own block-scoped 'i', so none of the arguments may refer to a caller
    * variable named 'i'.
185  */
186 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
187 	int i;								\
188 	val = 0;							\
189 	for (i = 0; i < NPC_MUTEX; i++) {				\
190 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
191 	}								\
192 }
193 
194 /*
195  * Return in val the total number of free pages which can be created
196  * for the given mnode (m), mrange (g), region size (r), and color (c).
    * As above, the sum runs over all NPC_MUTEX slices and the arguments must
    * not refer to a caller variable named 'i'.
197  */
198 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
199 	int i;								\
200 	val = 0;							\
201 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
202 	for (i = 0; i < NPC_MUTEX; i++) {				\
203 	    val +=							\
204 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
205 	}								\
206 }
206 
207 /*
208  * We can only allow a single thread to update a counter within the physical
209  * range of the largest supported page size. That is the finest granularity
210  * possible since the counter values are dependent on each other
211  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
212  * ctr_mutex lock index for a particular physical range.
    * ctr_mutex[] holds one array of mutexes per lock index; each array is
    * indexed by mnode (see page_ctr_add() and page_ctr_sub()).
213  */
214 static kmutex_t	*ctr_mutex[NPC_MUTEX];
215 
    /*
     * Lock index for page pp: its page number shifted down by the shift of the
     * largest page size, masked with (NPC_MUTEX - 1).
     */
216 #define	PP_CTR_LOCK_INDX(pp)						\
217 	(((pp)->p_pagenum >>						\
218 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
219 
220 #define	INVALID_COLOR 0xffffffff
221 #define	INVALID_MASK  0xffffffff
222 
223 /*
224  * Local function prototypes.
    * Only page_trylock_cons() is declared static here; the remaining
    * routines are declared with external linkage.
225  */
226 
227 void page_ctr_add(int, int, page_t *, int);
228 void page_ctr_add_internal(int, int, page_t *, int);
229 void page_ctr_sub(int, int, page_t *, int);
230 void page_ctr_sub_internal(int, int, page_t *, int);
231 void page_freelist_lock(int);
232 void page_freelist_unlock(int);
233 page_t *page_promote(int, pfn_t, uchar_t, int, int);
234 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
235 page_t *page_freelist_split(uchar_t,
236     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
238 static int page_trylock_cons(page_t *pp, se_t se);
239 
240 /*
241  * The page_counters array below is used to keep track of free contiguous
242  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
243  * This contains an array of counters, the size of the array, a shift value
244  * used to convert a pagenum into a counter array index or vice versa, as
245  * well as a cache of the last successful index to be promoted to a larger
246  * page size.  As an optimization, we keep track of the last successful index
247  * to be promoted per page color for the given size region, and this is
248  * allocated dynamically based upon the number of colors for a given
249  * region size.
250  *
251  * Conceptually, the page counters are represented as:
252  *
253  *	page_counters[region_size][mnode]
254  *
255  *	region_size:	size code of a candidate larger page made up
256  *			of contiguous free smaller pages.
257  *
258  *	page_counters[region_size][mnode].hpm_counters[index]:
259  *		represents how many (region_size - 1) pages either
260  *		exist or can be created within the given index range.
261  *
262  * Let's look at a sparc example:
263  *	If we want to create a free 512k page, we look at region_size 2
264  *	for the mnode we want.  We calculate the index and look at a specific
265  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
266  *	this location, it means that 8 64k pages either exist or can be created
267  *	from 8K pages in order to make a single free 512k page at the given
268  *	index.  Note that when a region is full, it will contribute to the
269  *	counts in the region above it.  Thus we will not know what page
270  *	size the free pages will be which can be promoted to this new free
271  *	page unless we look at all regions below the current region.
272  */
273 
274 /*
275  * Note: hpmctr_t is defined in platform vm_dep.h
276  * hw_page_map_t contains all the information needed for the page_counters
277  * logic. The fields are as follows:
278  *
279  *	hpm_counters:	dynamically allocated array to hold counter data
280  *	hpm_entries:	entries in hpm_counters
281  *	hpm_shift:	shift for pnum/array index conv
282  *	hpm_base:	PFN mapped to counter index 0
283  *	hpm_color_current:	last index in counter array for this color at
284  *				which we successfully created a large page;
    *				one array per mrange, indexed by color
    *	pad:		sparc only, unused filler.  NOTE(review): purpose
    *			not visible here, presumably structure sizing - confirm
285  */
286 typedef struct hw_page_map {
287 	hpmctr_t	*hpm_counters;
288 	size_t		hpm_entries;
289 	int		hpm_shift;
290 	pfn_t		hpm_base;
291 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
292 #if defined(__sparc)
293 	uint_t		pad[4];
294 #endif
295 } hw_page_map_t;
296 
297 /*
298  * Element zero is not used, but is allocated for convenience.
    * Each used element points at an array of hw_page_map_t indexed by mnode.
299  */
300 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
301 
302 /*
303  * Cached value of MNODE_RANGE_CNT(mnode).
304  * This is a function call in x86.
    * mnode_maxmrange caches MNODE_MAX_MRANGE(mnode) in the same way; both
    * arrays are filled in by page_ctrs_sz().
305  */
306 static int mnode_nranges[MAX_MEM_NODES];
307 static int mnode_maxmrange[MAX_MEM_NODES];
308 
309 /*
310  * The following macros are convenient ways to get access to the individual
311  * elements of the page_counters arrays.  They can be used on both
312  * the left side and right side of equations.
313  */
314 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
315 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
316 
317 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
318 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
319 
320 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
321 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
322 
323 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
324 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
325 
326 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
327 	(page_counters[(rg_szc)][(mnode)].hpm_base)
328 
329 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
330 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
331 
332 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
333 	(page_counters[(rg_szc)][(mnode)].				\
334 	hpm_color_current[(mrange)][(color)])
335 
    /* convert a page number to its counter index: (pnum - base) >> shift */
336 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
337 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
338 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
339 
    /* convert a counter index to its first page number: base + (index << shift) */
340 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
341 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
342 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
343 
344 /*
345  * Protects the hpm_counters and hpm_color_current memory from changing while
346  * looking at page counters information.
347  * Grab the write lock to modify what these fields point at.
348  * Grab the read lock to prevent any pointers from changing.
349  * The write lock can not be held during memory allocation due to a possible
350  * recursion deadlock with trying to grab the read lock while the
351  * write lock is already held.
    * There is one lock per memory node; page_ctrs_alloc() initializes them.
352  */
353 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
354 
355 
356 /*
357  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
358  */
359 void
360 cpu_vm_data_init(struct cpu *cp)
361 {
362 	if (cp == CPU0) {
363 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
364 	} else {
365 		void	*kmptr;
366 		int	align;
367 		size_t	sz;
368 
369 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
370 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
371 		kmptr = kmem_zalloc(sz, KM_SLEEP);
372 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
373 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
374 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
375 	}
376 }
377 
378 /*
379  * free cpu_vm_data
380  */
381 void
382 cpu_vm_data_destroy(struct cpu *cp)
383 {
384 	if (cp->cpu_seqid && cp->cpu_vm_data) {
385 		ASSERT(cp != CPU0);
386 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
387 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
388 	}
389 	cp->cpu_vm_data = NULL;
390 }
391 
392 
393 /*
394  * page size to page size code
395  */
396 int
397 page_szc(size_t pagesize)
398 {
399 	int	i = 0;
400 
401 	while (hw_page_array[i].hp_size) {
402 		if (pagesize == hw_page_array[i].hp_size)
403 			return (i);
404 		i++;
405 	}
406 	return (-1);
407 }
408 
409 /*
410  * page size to page size code with the restriction that it be a supported
411  * user page size.  If it's not a supported user page size, -1 will be returned.
412  */
413 int
414 page_szc_user_filtered(size_t pagesize)
415 {
416 	int szc = page_szc(pagesize);
417 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
418 		return (szc);
419 	}
420 	return (-1);
421 }
422 
423 /*
424  * Return how many page sizes are available for the user to use.  This is
425  * what the hardware supports and not based upon how the OS implements the
426  * support of different page sizes.
427  *
428  * If legacy is non-zero, return the number of pagesizes available to legacy
429  * applications. The number of legacy page sizes might be less than the
430  * exported user page sizes. This is to prevent legacy applications that
431  * use the largest page size returned from getpagesizes(3c) from inadvertantly
432  * using the 'new' large pagesizes.
433  */
434 uint_t
435 page_num_user_pagesizes(int legacy)
436 {
437 	if (legacy)
438 		return (mmu_legacy_page_sizes);
439 	return (mmu_exported_page_sizes);
440 }
441 
442 uint_t
443 page_num_pagesizes(void)
444 {
445 	return (mmu_page_sizes);
446 }
447 
448 /*
449  * returns the count of the number of base pagesize pages associated with szc
450  */
451 pgcnt_t
452 page_get_pagecnt(uint_t szc)
453 {
454 	if (szc >= mmu_page_sizes)
455 		panic("page_get_pagecnt: out of range %d", szc);
456 	return (hw_page_array[szc].hp_pgcnt);
457 }
458 
459 size_t
460 page_get_pagesize(uint_t szc)
461 {
462 	if (szc >= mmu_page_sizes)
463 		panic("page_get_pagesize: out of range %d", szc);
464 	return (hw_page_array[szc].hp_size);
465 }
466 
467 /*
468  * Return the size of a page based upon the index passed in.  An index of
469  * zero refers to the smallest page size in the system, and as index increases
470  * it refers to the next larger supported page size in the system.
471  * Note that szc and userszc may not be the same due to unsupported szc's on
472  * some systems.
473  */
474 size_t
475 page_get_user_pagesize(uint_t userszc)
476 {
477 	uint_t szc = USERSZC_2_SZC(userszc);
478 
479 	if (szc >= mmu_page_sizes)
480 		panic("page_get_user_pagesize: out of range %d", szc);
481 	return (hw_page_array[szc].hp_size);
482 }
483 
484 uint_t
485 page_get_shift(uint_t szc)
486 {
487 	if (szc >= mmu_page_sizes)
488 		panic("page_get_shift: out of range %d", szc);
489 	return (PAGE_GET_SHIFT(szc));
490 }
491 
492 uint_t
493 page_get_pagecolors(uint_t szc)
494 {
495 	if (szc >= mmu_page_sizes)
496 		panic("page_get_pagecolors: out of range %d", szc);
497 	return (PAGE_GET_PAGECOLORS(szc));
498 }
499 
500 /*
501  * this assigns the desired equivalent color after a split
502  */
503 uint_t
504 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
505     uint_t ncolor, uint_t ceq_mask)
506 {
507 	ASSERT(nszc > szc);
508 	ASSERT(szc < mmu_page_sizes);
509 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
510 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
511 
512 	color &= ceq_mask;
513 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
514 	return (color | (ncolor & ~ceq_mask));
515 }
516 
517 /*
518  * The interleaved_mnodes flag is set when mnodes overlap in
519  * the physbase..physmax range, but have disjoint slices.
520  * In this case hpm_counters is shared by all mnodes.
521  * This flag is set dynamically by the platform.
    * It defaults to 0 (no interleaving).
522  */
523 int interleaved_mnodes = 0;
524 
525 /*
526  * Called by startup().
527  * Size up the per page size free list counters based on physmax
528  * of each node and max_mem_nodes.
529  *
530  * If interleaved_mnodes is set we need to find the first mnode that
531  * exists. hpm_counters for the first mnode will then be shared by
532  * all other mnodes. If interleaved_mnodes is not set, just set
533  * first=mnode each time. That means there will be no sharing.
534  */
535 size_t
536 page_ctrs_sz(void)
537 {
538 	int	r;		/* region size */
539 	int	mnode;
540 	int	firstmn;	/* first mnode that exists */
541 	int	nranges;
542 	pfn_t	physbase;
543 	pfn_t	physmax;
544 	uint_t	ctrs_sz = 0;
545 	int 	i;
546 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
547 
548 	/*
549 	 * We need to determine how many page colors there are for each
550 	 * page size in order to allocate memory for any color specific
551 	 * arrays.
552 	 */
553 	for (i = 0; i < mmu_page_sizes; i++) {
554 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
555 	}
556 
557 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
558 
559 		pgcnt_t r_pgcnt;
560 		pfn_t   r_base;
561 		pgcnt_t r_align;
562 
563 		if (mem_node_config[mnode].exists == 0)
564 			continue;
565 
566 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
567 		nranges = MNODE_RANGE_CNT(mnode);
568 		mnode_nranges[mnode] = nranges;
569 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
570 
571 		/*
572 		 * determine size needed for page counter arrays with
573 		 * base aligned to large page size.
574 		 */
575 		for (r = 1; r < mmu_page_sizes; r++) {
576 			/* add in space for hpm_color_current */
577 			ctrs_sz += sizeof (size_t) *
578 			    colors_per_szc[r] * nranges;
579 
580 			if (firstmn != mnode)
581 				continue;
582 
583 			/* add in space for hpm_counters */
584 			r_align = page_get_pagecnt(r);
585 			r_base = physbase;
586 			r_base &= ~(r_align - 1);
587 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
588 
589 			/*
590 			 * Round up to always allocate on pointer sized
591 			 * boundaries.
592 			 */
593 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
594 			    sizeof (hpmctr_t *));
595 		}
596 	}
597 
598 	for (r = 1; r < mmu_page_sizes; r++) {
599 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
600 	}
601 
602 	/* add in space for page_ctrs_cands and pcc_color_free */
603 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
604 	    mmu_page_sizes * NPC_MUTEX;
605 
606 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
607 
608 		if (mem_node_config[mnode].exists == 0)
609 			continue;
610 
611 		nranges = mnode_nranges[mnode];
612 		ctrs_sz += sizeof (pcc_info_t) * nranges *
613 		    mmu_page_sizes * NPC_MUTEX;
614 		for (r = 1; r < mmu_page_sizes; r++) {
615 			ctrs_sz += sizeof (pgcnt_t) * nranges *
616 			    colors_per_szc[r] * NPC_MUTEX;
617 		}
618 	}
619 
620 	/* ctr_mutex */
621 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
622 
623 	/* size for page list counts */
624 	PLCNT_SZ(ctrs_sz);
625 
626 	/*
627 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
628 	 * address of the counters to ecache_alignsize boundary for every
629 	 * memory node.
630 	 */
631 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
632 }
633 
634 caddr_t
635 page_ctrs_alloc(caddr_t alloc_base)
636 {
637 	int	mnode;
638 	int	mrange, nranges;
639 	int	r;		/* region size */
640 	int	i;
641 	int	firstmn;	/* first mnode that exists */
642 	pfn_t	physbase;
643 	pfn_t	physmax;
644 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
645 
646 	/*
647 	 * We need to determine how many page colors there are for each
648 	 * page size in order to allocate memory for any color specific
649 	 * arrays.
650 	 */
651 	for (i = 0; i < mmu_page_sizes; i++) {
652 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
653 	}
654 
655 	for (r = 1; r < mmu_page_sizes; r++) {
656 		page_counters[r] = (hw_page_map_t *)alloc_base;
657 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
658 	}
659 
660 	/* page_ctrs_cands and pcc_color_free array */
661 	for (i = 0; i < NPC_MUTEX; i++) {
662 		for (r = 1; r < mmu_page_sizes; r++) {
663 
664 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
665 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
666 
667 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
668 				pcc_info_t *pi;
669 
670 				if (mem_node_config[mnode].exists == 0)
671 					continue;
672 
673 				nranges = mnode_nranges[mnode];
674 
675 				pi = (pcc_info_t *)alloc_base;
676 				alloc_base += sizeof (pcc_info_t) * nranges;
677 				page_ctrs_cands[i][r][mnode] = pi;
678 
679 				for (mrange = 0; mrange < nranges; mrange++) {
680 					pi->pcc_color_free =
681 					    (pgcnt_t *)alloc_base;
682 					alloc_base += sizeof (pgcnt_t) *
683 					    colors_per_szc[r];
684 					pi++;
685 				}
686 			}
687 		}
688 	}
689 
690 	/* ctr_mutex */
691 	for (i = 0; i < NPC_MUTEX; i++) {
692 		ctr_mutex[i] = (kmutex_t *)alloc_base;
693 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
694 	}
695 
696 	/* initialize page list counts */
697 	PLCNT_INIT(alloc_base);
698 
699 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
700 
701 		pgcnt_t r_pgcnt;
702 		pfn_t	r_base;
703 		pgcnt_t r_align;
704 		int	r_shift;
705 		int	nranges = mnode_nranges[mnode];
706 
707 		if (mem_node_config[mnode].exists == 0)
708 			continue;
709 
710 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
711 
712 		for (r = 1; r < mmu_page_sizes; r++) {
713 			/*
714 			 * the page_counters base has to be aligned to the
715 			 * page count of page size code r otherwise the counts
716 			 * will cross large page boundaries.
717 			 */
718 			r_align = page_get_pagecnt(r);
719 			r_base = physbase;
720 			/* base needs to be aligned - lower to aligned value */
721 			r_base &= ~(r_align - 1);
722 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
723 			r_shift = PAGE_BSZS_SHIFT(r);
724 
725 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
726 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
727 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
728 			for (mrange = 0; mrange < nranges; mrange++) {
729 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
730 				    r, mrange) = (size_t *)alloc_base;
731 				alloc_base += sizeof (size_t) *
732 				    colors_per_szc[r];
733 			}
734 			for (i = 0; i < colors_per_szc[r]; i++) {
735 				uint_t color_mask = colors_per_szc[r] - 1;
736 				pfn_t  pfnum = r_base;
737 				size_t idx;
738 				int mrange;
739 				MEM_NODE_ITERATOR_DECL(it);
740 
741 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
742 				if (pfnum == (pfn_t)-1) {
743 					idx = 0;
744 				} else {
745 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
746 					    color_mask, color_mask, &it);
747 					idx = PNUM_TO_IDX(mnode, r, pfnum);
748 					idx = (idx >= r_pgcnt) ? 0 : idx;
749 				}
750 				for (mrange = 0; mrange < nranges; mrange++) {
751 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
752 					    r, i, mrange) = idx;
753 				}
754 			}
755 
756 			/* hpm_counters may be shared by all mnodes */
757 			if (firstmn == mnode) {
758 				PAGE_COUNTERS_COUNTERS(mnode, r) =
759 				    (hpmctr_t *)alloc_base;
760 				alloc_base +=
761 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
762 				    sizeof (hpmctr_t *));
763 			} else {
764 				PAGE_COUNTERS_COUNTERS(mnode, r) =
765 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
766 			}
767 
768 			/*
769 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
770 			 * satisfy the identity requirement.
771 			 * We should be able to go from one to the other
772 			 * and get consistent values.
773 			 */
774 			ASSERT(PNUM_TO_IDX(mnode, r,
775 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
776 			ASSERT(IDX_TO_PNUM(mnode, r,
777 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
778 		}
779 		/*
780 		 * Roundup the start address of the page_counters to
781 		 * cache aligned boundary for every memory node.
782 		 * page_ctrs_sz() has added some slop for these roundups.
783 		 */
784 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
785 		    L2CACHE_ALIGN);
786 	}
787 
788 	/* Initialize other page counter specific data structures. */
789 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
790 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
791 	}
792 
793 	return (alloc_base);
794 }
795 
796 /*
797  * Functions to adjust region counters for each size free list.
798  * Caller is responsible to acquire the ctr_mutex lock if necessary and
799  * thus can be called during startup without locks.
800  */
801 /* ARGSUSED */
802 void
803 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
804 {
805 	ssize_t		r;	/* region size */
806 	ssize_t		idx;
807 	pfn_t		pfnum;
808 	int		lckidx;
809 
810 	ASSERT(mnode == PP_2_MEM_NODE(pp));
811 	ASSERT(mtype == PP_2_MTYPE(pp));
812 
813 	ASSERT(pp->p_szc < mmu_page_sizes);
814 
815 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
816 
817 	/* no counter update needed for largest page size */
818 	if (pp->p_szc >= mmu_page_sizes - 1) {
819 		return;
820 	}
821 
822 	r = pp->p_szc + 1;
823 	pfnum = pp->p_pagenum;
824 	lckidx = PP_CTR_LOCK_INDX(pp);
825 
826 	/*
827 	 * Increment the count of free pages for the current
828 	 * region. Continue looping up in region size incrementing
829 	 * count if the preceeding region is full.
830 	 */
831 	while (r < mmu_page_sizes) {
832 		idx = PNUM_TO_IDX(mnode, r, pfnum);
833 
834 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
835 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
836 
837 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
838 			break;
839 		} else {
840 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
841 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
842 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
843 
844 			cand->pcc_pages_free++;
845 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
846 		}
847 		r++;
848 	}
849 }
850 
851 void
852 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
853 {
854 	int		lckidx = PP_CTR_LOCK_INDX(pp);
855 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
856 
857 	mutex_enter(lock);
858 	page_ctr_add_internal(mnode, mtype, pp, flags);
859 	mutex_exit(lock);
860 }
861 
862 void
863 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
864 {
865 	int		lckidx;
866 	ssize_t		r;	/* region size */
867 	ssize_t		idx;
868 	pfn_t		pfnum;
869 
870 	ASSERT(mnode == PP_2_MEM_NODE(pp));
871 	ASSERT(mtype == PP_2_MTYPE(pp));
872 
873 	ASSERT(pp->p_szc < mmu_page_sizes);
874 
875 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
876 
877 	/* no counter update needed for largest page size */
878 	if (pp->p_szc >= mmu_page_sizes - 1) {
879 		return;
880 	}
881 
882 	r = pp->p_szc + 1;
883 	pfnum = pp->p_pagenum;
884 	lckidx = PP_CTR_LOCK_INDX(pp);
885 
886 	/*
887 	 * Decrement the count of free pages for the current
888 	 * region. Continue looping up in region size decrementing
889 	 * count if the preceeding region was full.
890 	 */
891 	while (r < mmu_page_sizes) {
892 		idx = PNUM_TO_IDX(mnode, r, pfnum);
893 
894 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
895 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
896 
897 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
898 			break;
899 		} else {
900 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
901 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
902 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
903 
904 			ASSERT(cand->pcc_pages_free != 0);
905 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
906 
907 			cand->pcc_pages_free--;
908 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
909 		}
910 		r++;
911 	}
912 }
913 
914 void
915 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
916 {
917 	int		lckidx = PP_CTR_LOCK_INDX(pp);
918 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
919 
920 	mutex_enter(lock);
921 	page_ctr_sub_internal(mnode, mtype, pp, flags);
922 	mutex_exit(lock);
923 }
924 
925 /*
926  * Adjust page counters following a memory attach, since typically the
927  * size of the array needs to change, and the PFN to counter index
928  * mapping needs to change.
929  *
930  * It is possible this mnode did not exist at startup. In that case
931  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
932  * to change (a theoretical possibility on x86), which means pcc_color_free
933  * arrays must be extended.
934  */
uint_t
page_ctrs_adjust(int mnode)
{
	/*
	 * Overall flow:
	 *   1. Preallocate (KM_NOSLEEP) every new array this mnode will need.
	 *   2. Take the page counters write lock, install the new arrays,
	 *	carry over whatever old contents still apply, and stash the
	 *	old pointers in the same cache arrays.
	 *   3. Drop the lock and free whatever is left in the cache arrays.
	 *
	 * Returns 0 on success, or ENOMEM if any preallocation fails (in
	 * which case nothing has been installed and everything allocated so
	 * far is freed via the cleanup label).
	 */
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	/*
	 * The *_cache arrays do double duty: before the swap they hold the
	 * newly allocated arrays, after the swap they hold the old arrays
	 * that are to be freed once the lock is dropped.
	 */
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	/* flat [NPC_MUTEX][MMU_PAGE_SIZES] table, indexed i * MMU_PAGE_SIZES + r */
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	/* number of pcc_info_t entries in each array held in cands_cache */
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;
	int oldmnode;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	/*
	 * NOTE(review): HPM_COUNTERS_LIMITS is defined elsewhere; it is
	 * presumed to fill in physbase/physmax for this mnode, with i used
	 * as scratch/iterator state -- confirm against its definition.
	 */
	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		/* remember the element count so cleanup can free this array */
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			/*
			 * Publish the array in cands_cache right away so that
			 * cleanup finds it (and any pcc_color_free arrays
			 * already hung off it) if a later allocation fails.
			 */
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	/*
	 * For interleaved mnodes, find the first mnode
	 * with valid page counters since the current
	 * mnode may have just been added and not have
	 * valid page counters.
	 */
	if (interleaved_mnodes) {
		for (i = 0; i < max_mem_nodes; i++)
			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
				break;
		ASSERT(i < max_mem_nodes);
		oldmnode = i;
	} else
		oldmnode = mnode;

	old_nranges = mnode_nranges[mnode];
	/*
	 * From here on cands_cache ends up holding the OLD pcc_info_t arrays
	 * (swapped in below), so cleanup must size them by the old range
	 * count.
	 */
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		/* take ownership of the preallocated counter array */
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		/* copy only if the old and new pfn ranges actually overlap */
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if ((i == mnode) ||
				    (mem_node_config[i].exists == 0))
					continue;
				ASSERT(
				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		/*
		 * Install the new current-color arrays; entries at or beyond
		 * nranges were never allocated and are installed as NULL.
		 */
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				pfnum = newbase;
				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(m, r, pfnum);
					/* fall back to 0 if out of range */
					idx = (idx < pcsz) ? idx : 0;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
					    r, mrange) != NULL)
						PAGE_COUNTERS_CURRENT_COLOR(m,
						    r, i, mrange) = idx;
				}
			}
		}

		/*
		 * cache info for freeing out of the critical path
		 *
		 * NOTE(review): only pointers inside [kernelheap,
		 * ekernelheap) are queued for kmem_free(); presumably arrays
		 * outside that range were set up at boot and did not come
		 * from kmem -- confirm.
		 */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			/* install the new array, park the old one for cleanup */
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			/*
			 * Swap corresponding entries: the live array receives
			 * the old contents, while the old array (about to be
			 * freed) receives the freshly allocated, unused
			 * pcc_color_free pointers so cleanup releases them.
			 */
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come thru here to free memory when pre-alloc fails, and also to
	 * free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			nr = cands_cache_nranges;
			/* first release each range's pcc_color_free array */
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			/* then the pcc_info_t array itself (pi was advanced) */
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}
1278 
1279 /*
1280  * Cleanup the hpm_counters field in the page counters
1281  * array.
1282  */
1283 void
1284 page_ctrs_cleanup(void)
1285 {
1286 	int r;	/* region size */
1287 	int i;	/* mnode index */
1288 
1289 	/*
1290 	 * Get the page counters write lock while we are
1291 	 * setting the page hpm_counters field to NULL
1292 	 * for non-existent mnodes.
1293 	 */
1294 	for (i = 0; i < max_mem_nodes; i++) {
1295 		PAGE_CTRS_WRITE_LOCK(i);
1296 		if (mem_node_config[i].exists) {
1297 			PAGE_CTRS_WRITE_UNLOCK(i);
1298 			continue;
1299 		}
1300 		for (r = 1; r < mmu_page_sizes; r++) {
1301 			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1302 		}
1303 		PAGE_CTRS_WRITE_UNLOCK(i);
1304 	}
1305 }
1306 
1307 #ifdef DEBUG
1308 
1309 /*
1310  * confirm pp is a large page corresponding to szc
1311  */
1312 void
1313 chk_lpg(page_t *pp, uchar_t szc)
1314 {
1315 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1316 	uint_t noreloc;
1317 
1318 	if (npgs == 1) {
1319 		ASSERT(pp->p_szc == 0);
1320 		ASSERT(pp->p_next == pp);
1321 		ASSERT(pp->p_prev == pp);
1322 		return;
1323 	}
1324 
1325 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1326 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1327 
1328 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1329 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1330 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1331 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1332 
1333 	/*
1334 	 * Check list of pages.
1335 	 */
1336 	noreloc = PP_ISNORELOC(pp);
1337 	while (npgs--) {
1338 		if (npgs != 0) {
1339 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1340 			ASSERT(pp->p_next == (pp + 1));
1341 		}
1342 		ASSERT(pp->p_szc == szc);
1343 		ASSERT(PP_ISFREE(pp));
1344 		ASSERT(PP_ISAGED(pp));
1345 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1346 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1347 		ASSERT(pp->p_vnode  == NULL);
1348 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1349 
1350 		pp = pp->p_next;
1351 	}
1352 }
1353 #endif /* DEBUG */
1354 
1355 void
1356 page_freelist_lock(int mnode)
1357 {
1358 	int i;
1359 	for (i = 0; i < NPC_MUTEX; i++) {
1360 		mutex_enter(FPC_MUTEX(mnode, i));
1361 		mutex_enter(CPC_MUTEX(mnode, i));
1362 	}
1363 }
1364 
1365 void
1366 page_freelist_unlock(int mnode)
1367 {
1368 	int i;
1369 	for (i = 0; i < NPC_MUTEX; i++) {
1370 		mutex_exit(FPC_MUTEX(mnode, i));
1371 		mutex_exit(CPC_MUTEX(mnode, i));
1372 	}
1373 }
1374 
1375 /*
1376  * add pp to the specified page list. Defaults to head of the page list
1377  * unless PG_LIST_TAIL is specified.
1378  */
1379 void
1380 page_list_add(page_t *pp, int flags)
1381 {
1382 	page_t		**ppp;
1383 	kmutex_t	*pcm;
1384 	uint_t		bin, mtype;
1385 	int		mnode;
1386 
1387 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1388 	ASSERT(PP_ISFREE(pp));
1389 	ASSERT(!hat_page_is_mapped(pp));
1390 	ASSERT(hat_page_getshare(pp) == 0);
1391 
1392 	/*
1393 	 * Large pages should be freed via page_list_add_pages().
1394 	 */
1395 	ASSERT(pp->p_szc == 0);
1396 
1397 	/*
1398 	 * Don't need to lock the freelist first here
1399 	 * because the page isn't on the freelist yet.
1400 	 * This means p_szc can't change on us.
1401 	 */
1402 
1403 	bin = PP_2_BIN(pp);
1404 	mnode = PP_2_MEM_NODE(pp);
1405 	mtype = PP_2_MTYPE(pp);
1406 
1407 	if (flags & PG_LIST_ISINIT) {
1408 		/*
1409 		 * PG_LIST_ISINIT is set during system startup (ie. single
1410 		 * threaded), add a page to the free list and add to the
1411 		 * the free region counters w/o any locking
1412 		 */
1413 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1414 
1415 		/* inline version of page_add() */
1416 		if (*ppp != NULL) {
1417 			pp->p_next = *ppp;
1418 			pp->p_prev = (*ppp)->p_prev;
1419 			(*ppp)->p_prev = pp;
1420 			pp->p_prev->p_next = pp;
1421 		} else
1422 			*ppp = pp;
1423 
1424 		page_ctr_add_internal(mnode, mtype, pp, flags);
1425 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1426 	} else {
1427 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1428 
1429 		if (flags & PG_FREE_LIST) {
1430 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1431 			ASSERT(PP_ISAGED(pp));
1432 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1433 
1434 		} else {
1435 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1436 			ASSERT(pp->p_vnode);
1437 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1438 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1439 		}
1440 		mutex_enter(pcm);
1441 		page_add(ppp, pp);
1442 
1443 		if (flags & PG_LIST_TAIL)
1444 			*ppp = (*ppp)->p_next;
1445 		/*
1446 		 * Add counters before releasing pcm mutex to avoid a race with
1447 		 * page_freelist_coalesce and page_freelist_split.
1448 		 */
1449 		page_ctr_add(mnode, mtype, pp, flags);
1450 		mutex_exit(pcm);
1451 	}
1452 
1453 
1454 #if defined(__sparc)
1455 	if (PP_ISNORELOC(pp)) {
1456 		kcage_freemem_add(1);
1457 	}
1458 #endif
1459 	/*
1460 	 * It is up to the caller to unlock the page!
1461 	 */
1462 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1463 }
1464 
1465 
1466 #ifdef __sparc
1467 /*
1468  * This routine is only used by kcage_init during system startup.
1469  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1470  * without the overhead of taking locks and updating counters.
1471  */
1472 void
1473 page_list_noreloc_startup(page_t *pp)
1474 {
1475 	page_t		**ppp;
1476 	uint_t		bin;
1477 	int		mnode;
1478 	int		mtype;
1479 	int		flags = 0;
1480 
1481 	/*
1482 	 * If this is a large page on the freelist then
1483 	 * break it up into smaller pages.
1484 	 */
1485 	if (pp->p_szc != 0)
1486 		page_boot_demote(pp);
1487 
1488 	/*
1489 	 * Get list page is currently on.
1490 	 */
1491 	bin = PP_2_BIN(pp);
1492 	mnode = PP_2_MEM_NODE(pp);
1493 	mtype = PP_2_MTYPE(pp);
1494 	ASSERT(mtype == MTYPE_RELOC);
1495 	ASSERT(pp->p_szc == 0);
1496 
1497 	if (PP_ISAGED(pp)) {
1498 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1499 		flags |= PG_FREE_LIST;
1500 	} else {
1501 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1502 		flags |= PG_CACHE_LIST;
1503 	}
1504 
1505 	ASSERT(*ppp != NULL);
1506 
1507 	/*
1508 	 * Delete page from current list.
1509 	 */
1510 	if (*ppp == pp)
1511 		*ppp = pp->p_next;		/* go to next page */
1512 	if (*ppp == pp) {
1513 		*ppp = NULL;			/* page list is gone */
1514 	} else {
1515 		pp->p_prev->p_next = pp->p_next;
1516 		pp->p_next->p_prev = pp->p_prev;
1517 	}
1518 
1519 	/*
1520 	 * Decrement page counters
1521 	 */
1522 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1523 
1524 	/*
1525 	 * Set no reloc for cage initted pages.
1526 	 */
1527 	PP_SETNORELOC(pp);
1528 
1529 	mtype = PP_2_MTYPE(pp);
1530 	ASSERT(mtype == MTYPE_NORELOC);
1531 
1532 	/*
1533 	 * Get new list for page.
1534 	 */
1535 	if (PP_ISAGED(pp)) {
1536 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1537 	} else {
1538 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1539 	}
1540 
1541 	/*
1542 	 * Insert page on new list.
1543 	 */
1544 	if (*ppp == NULL) {
1545 		*ppp = pp;
1546 		pp->p_next = pp->p_prev = pp;
1547 	} else {
1548 		pp->p_next = *ppp;
1549 		pp->p_prev = (*ppp)->p_prev;
1550 		(*ppp)->p_prev = pp;
1551 		pp->p_prev->p_next = pp;
1552 	}
1553 
1554 	/*
1555 	 * Increment page counters
1556 	 */
1557 	page_ctr_add_internal(mnode, mtype, pp, flags);
1558 
1559 	/*
1560 	 * Update cage freemem counter
1561 	 */
1562 	atomic_add_long(&kcage_freemem, 1);
1563 }
1564 #else	/* __sparc */
1565 
1566 /* ARGSUSED */
1567 void
1568 page_list_noreloc_startup(page_t *pp)
1569 {
1570 	panic("page_list_noreloc_startup: should be here only for sparc");
1571 }
1572 #endif
1573 
1574 void
1575 page_list_add_pages(page_t *pp, int flags)
1576 {
1577 	kmutex_t *pcm;
1578 	pgcnt_t	pgcnt;
1579 	uint_t	bin, mtype, i;
1580 	int	mnode;
1581 
1582 	/* default to freelist/head */
1583 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1584 
1585 	CHK_LPG(pp, pp->p_szc);
1586 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1587 
1588 	bin = PP_2_BIN(pp);
1589 	mnode = PP_2_MEM_NODE(pp);
1590 	mtype = PP_2_MTYPE(pp);
1591 
1592 	if (flags & PG_LIST_ISINIT) {
1593 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1594 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1595 		ASSERT(!PP_ISNORELOC(pp));
1596 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1597 	} else {
1598 
1599 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1600 
1601 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1602 
1603 		mutex_enter(pcm);
1604 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1605 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1606 		mutex_exit(pcm);
1607 
1608 		pgcnt = page_get_pagecnt(pp->p_szc);
1609 #if defined(__sparc)
1610 		if (PP_ISNORELOC(pp))
1611 			kcage_freemem_add(pgcnt);
1612 #endif
1613 		for (i = 0; i < pgcnt; i++, pp++)
1614 			page_unlock_nocapture(pp);
1615 	}
1616 }
1617 
1618 /*
1619  * During boot, need to demote a large page to base
1620  * pagesize pages for seg_kmem for use in boot_alloc()
1621  */
1622 void
1623 page_boot_demote(page_t *pp)
1624 {
1625 	ASSERT(pp->p_szc != 0);
1626 	ASSERT(PP_ISFREE(pp));
1627 	ASSERT(PP_ISAGED(pp));
1628 
1629 	(void) page_demote(PP_2_MEM_NODE(pp),
1630 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1631 	    PC_FREE);
1632 
1633 	ASSERT(PP_ISFREE(pp));
1634 	ASSERT(PP_ISAGED(pp));
1635 	ASSERT(pp->p_szc == 0);
1636 }
1637 
/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * pp must be free and held SE_EXCL by the caller.  flags selects the
 * list: PG_FREE_LIST for the free list, otherwise the cache list
 * (PG_CACHE_LIST).  If pp turns out to be part of a large free page,
 * the large page is demoted first and pp alone is removed.  Panics if a
 * large page is found on the cache list.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		/* bin changed before we got the lock; retry with new bin */
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Recompute bin, mtype and list head now that pp is a PAGESIZE
	 * page; the values computed above described the large page.
	 *
	 * Subtract counters before releasing the freelist locks
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}
1761 
1762 void
1763 page_list_sub_pages(page_t *pp, uint_t szc)
1764 {
1765 	kmutex_t *pcm;
1766 	uint_t	bin, mtype;
1767 	int	mnode;
1768 
1769 	ASSERT(PAGE_EXCL(pp));
1770 	ASSERT(PP_ISFREE(pp));
1771 	ASSERT(PP_ISAGED(pp));
1772 
1773 	/*
1774 	 * See comment in page_list_sub().
1775 	 */
1776 try_again:
1777 	bin = PP_2_BIN(pp);
1778 	mnode = PP_2_MEM_NODE(pp);
1779 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1780 	mutex_enter(pcm);
1781 	if (PP_2_BIN(pp) != bin) {
1782 		mutex_exit(pcm);
1783 		goto	try_again;
1784 	}
1785 
1786 	/*
1787 	 * If we're called with a page larger than szc or it got
1788 	 * promoted above szc before we locked the freelist then
1789 	 * drop pcm and re-lock entire freelist. If page still larger
1790 	 * than szc then demote it.
1791 	 */
1792 	if (pp->p_szc > szc) {
1793 		mutex_exit(pcm);
1794 		pcm = NULL;
1795 		page_freelist_lock(mnode);
1796 		if (pp->p_szc > szc) {
1797 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1798 			(void) page_demote(mnode,
1799 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1800 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1801 		}
1802 		bin = PP_2_BIN(pp);
1803 	}
1804 	ASSERT(PP_ISFREE(pp));
1805 	ASSERT(PP_ISAGED(pp));
1806 	ASSERT(pp->p_szc <= szc);
1807 	ASSERT(pp == PP_PAGEROOT(pp));
1808 
1809 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1810 
1811 	mtype = PP_2_MTYPE(pp);
1812 	if (pp->p_szc != 0) {
1813 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1814 		CHK_LPG(pp, pp->p_szc);
1815 	} else {
1816 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1817 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818 	}
1819 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1820 
1821 	if (pcm != NULL) {
1822 		mutex_exit(pcm);
1823 	} else {
1824 		page_freelist_unlock(mnode);
1825 	}
1826 
1827 #if defined(__sparc)
1828 	if (PP_ISNORELOC(pp)) {
1829 		pgcnt_t	pgcnt;
1830 
1831 		pgcnt = page_get_pagecnt(pp->p_szc);
1832 		kcage_freemem_sub(pgcnt);
1833 	}
1834 #endif
1835 }
1836 
1837 /*
1838  * Add the page to the front of a linked list of pages
1839  * using the p_next & p_prev pointers for the list.
1840  * The caller is responsible for protecting the list pointers.
1841  */
1842 void
1843 mach_page_add(page_t **ppp, page_t *pp)
1844 {
1845 	if (*ppp == NULL) {
1846 		pp->p_next = pp->p_prev = pp;
1847 	} else {
1848 		pp->p_next = *ppp;
1849 		pp->p_prev = (*ppp)->p_prev;
1850 		(*ppp)->p_prev = pp;
1851 		pp->p_prev->p_next = pp;
1852 	}
1853 	*ppp = pp;
1854 }
1855 
1856 /*
1857  * Remove this page from a linked list of pages
1858  * using the p_next & p_prev pointers for the list.
1859  *
1860  * The caller is responsible for protecting the list pointers.
1861  */
1862 void
1863 mach_page_sub(page_t **ppp, page_t *pp)
1864 {
1865 	ASSERT(PP_ISFREE(pp));
1866 
1867 	if (*ppp == NULL || pp == NULL)
1868 		panic("mach_page_sub");
1869 
1870 	if (*ppp == pp)
1871 		*ppp = pp->p_next;		/* go to next page */
1872 
1873 	if (*ppp == pp)
1874 		*ppp = NULL;			/* page list is gone */
1875 	else {
1876 		pp->p_prev->p_next = pp->p_next;
1877 		pp->p_next->p_prev = pp->p_prev;
1878 	}
1879 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1880 }
1881 
1882 /*
1883  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1884  */
1885 void
1886 page_promote_size(page_t *pp, uint_t cur_szc)
1887 {
1888 	pfn_t pfn;
1889 	int mnode;
1890 	int idx;
1891 	int new_szc = cur_szc + 1;
1892 	int full = FULL_REGION_CNT(new_szc);
1893 
1894 	pfn = page_pptonum(pp);
1895 	mnode = PFN_2_MEM_NODE(pfn);
1896 
1897 	page_freelist_lock(mnode);
1898 
1899 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1900 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1901 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1902 
1903 	page_freelist_unlock(mnode);
1904 }
1905 
/*
 * Failure counters maintained by page_promote().
 * page_promote_err is bumped on every failed promotion;
 * page_promote_noreloc_err is additionally bumped when the failure is due
 * to the constituent pages disagreeing on PP_ISNORELOC().
 */
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
1908 
1909 /*
1910  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1911  * for the given mnode starting at pfnum. Pages involved are on the freelist
1912  * before the call and may be returned to the caller if requested, otherwise
1913  * they will be placed back on the freelist.
1914  * If flags is PC_ALLOC, then the large page will be returned to the user in
1915  * a state which is consistent with a page being taken off the freelist.  If
1916  * we failed to lock the new large page, then we will return NULL to the
1917  * caller and put the large page on the freelist instead.
1918  * If flags is PC_FREE, then the large page will be placed on the freelist,
1919  * and NULL will be returned.
1920  * The caller is responsible for locking the freelist as well as any other
1921  * accounting which needs to be done for a returned page.
1922  *
1923  * RFE: For performance pass in pp instead of pfnum so
1924  * 	we can avoid excessive calls to page_numtopp_nolock().
1925  *	This would depend on an assumption that all contiguous
1926  *	pages are in the same memseg so we can just add/dec
1927  *	our pp.
1928  *
1929  * Lock ordering:
1930  *
1931  *	There is a potential but rare deadlock situation
1932  *	for page promotion and demotion operations. The problem
1933  *	is there are two paths into the freelist manager and
1934  *	they have different lock orders:
1935  *
1936  *	page_create()
1937  *		lock freelist
1938  *		page_lock(EXCL)
1939  *		unlock freelist
1940  *		return
1941  *		caller drops page_lock
1942  *
1943  *	page_free() and page_reclaim()
1944  *		caller grabs page_lock(EXCL)
1945  *
1946  *		lock freelist
1947  *		unlock freelist
1948  *		drop page_lock
1949  *
1950  *	What prevents a thread in page_create() from deadlocking
1951  *	with a thread freeing or reclaiming the same page is the
1952  *	page_trylock() in page_get_freelist(). If the trylock fails
1953  *	it skips the page.
1954  *
1955  *	The lock ordering for promotion and demotion is the same as
1956  *	for page_create(). Since the same deadlock could occur during
1957  *	page promotion and freeing or reclaiming of a page on the
1958  *	cache list we might have to fail the operation and undo what
1959  *	have done so far. Again this is rare.
1960  */
1961 page_t *
1962 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1963 {
1964 	page_t		*pp, *pplist, *tpp, *start_pp;
1965 	pgcnt_t		new_npgs, npgs;
1966 	uint_t		bin;
1967 	pgcnt_t		tmpnpgs, pages_left;
1968 	uint_t		noreloc;
1969 	int 		which_list;
1970 	ulong_t		index;
1971 	kmutex_t	*phm;
1972 
1973 	/*
1974 	 * General algorithm:
1975 	 * Find the starting page
1976 	 * Walk each page struct removing it from the freelist,
1977 	 * and linking it to all the other pages removed.
1978 	 * Once all pages are off the freelist,
1979 	 * walk the list, modifying p_szc to new_szc and what
1980 	 * ever other info needs to be done to create a large free page.
1981 	 * According to the flags, either return the page or put it
1982 	 * on the freelist.
1983 	 */
1984 
1985 	start_pp = page_numtopp_nolock(pfnum);
1986 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1987 	new_npgs = page_get_pagecnt(new_szc);
1988 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1989 
1990 	/* don't return page of the wrong mtype */
1991 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1992 			return (NULL);
1993 
1994 	/*
1995 	 * Loop through smaller pages to confirm that all pages
1996 	 * give the same result for PP_ISNORELOC().
1997 	 * We can check this reliably here as the protocol for setting
1998 	 * P_NORELOC requires pages to be taken off the free list first.
1999 	 */
2000 	noreloc = PP_ISNORELOC(start_pp);
2001 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2002 		if (noreloc != PP_ISNORELOC(pp)) {
2003 			page_promote_noreloc_err++;
2004 			page_promote_err++;
2005 			return (NULL);
2006 		}
2007 	}
2008 
2009 	pages_left = new_npgs;
2010 	pplist = NULL;
2011 	pp = start_pp;
2012 
2013 	/* Loop around coalescing the smaller pages into a big page. */
2014 	while (pages_left) {
2015 		/*
2016 		 * Remove from the freelist.
2017 		 */
2018 		ASSERT(PP_ISFREE(pp));
2019 		bin = PP_2_BIN(pp);
2020 		ASSERT(mnode == PP_2_MEM_NODE(pp));
2021 		mtype = PP_2_MTYPE(pp);
2022 		if (PP_ISAGED(pp)) {
2023 
2024 			/*
2025 			 * PG_FREE_LIST
2026 			 */
2027 			if (pp->p_szc) {
2028 				page_vpsub(&PAGE_FREELISTS(mnode,
2029 				    pp->p_szc, bin, mtype), pp);
2030 			} else {
2031 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2032 				    bin, mtype), pp);
2033 			}
2034 			which_list = PG_FREE_LIST;
2035 		} else {
2036 			ASSERT(pp->p_szc == 0);
2037 
2038 			/*
2039 			 * PG_CACHE_LIST
2040 			 *
2041 			 * Since this page comes from the
2042 			 * cachelist, we must destroy the
2043 			 * vnode association.
2044 			 */
2045 			if (!page_trylock(pp, SE_EXCL)) {
2046 				goto fail_promote;
2047 			}
2048 
2049 			/*
2050 			 * We need to be careful not to deadlock
2051 			 * with another thread in page_lookup().
2052 			 * The page_lookup() thread could be holding
2053 			 * the same phm that we need if the two
2054 			 * pages happen to hash to the same phm lock.
2055 			 * At this point we have locked the entire
2056 			 * freelist and page_lookup() could be trying
2057 			 * to grab a freelist lock.
2058 			 */
2059 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2060 			phm = PAGE_HASH_MUTEX(index);
2061 			if (!mutex_tryenter(phm)) {
2062 				page_unlock_nocapture(pp);
2063 				goto fail_promote;
2064 			}
2065 
2066 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2067 			page_hashout(pp, phm);
2068 			mutex_exit(phm);
2069 			PP_SETAGED(pp);
2070 			page_unlock_nocapture(pp);
2071 			which_list = PG_CACHE_LIST;
2072 		}
2073 		page_ctr_sub(mnode, mtype, pp, which_list);
2074 
2075 		/*
2076 		 * Concatenate the smaller page(s) onto
2077 		 * the large page list.
2078 		 */
2079 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2080 		pages_left -= npgs;
2081 		tpp = pp;
2082 		while (npgs--) {
2083 			tpp->p_szc = new_szc;
2084 			tpp = tpp->p_next;
2085 		}
2086 		page_list_concat(&pplist, &pp);
2087 		pp += tmpnpgs;
2088 	}
2089 	CHK_LPG(pplist, new_szc);
2090 
2091 	/*
2092 	 * return the page to the user if requested
2093 	 * in the properly locked state.
2094 	 */
2095 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2096 		return (pplist);
2097 	}
2098 
2099 	/*
2100 	 * Otherwise place the new large page on the freelist
2101 	 */
2102 	bin = PP_2_BIN(pplist);
2103 	mnode = PP_2_MEM_NODE(pplist);
2104 	mtype = PP_2_MTYPE(pplist);
2105 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2106 
2107 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2108 	return (NULL);
2109 
2110 fail_promote:
2111 	/*
2112 	 * A thread must have still been freeing or
2113 	 * reclaiming the page on the cachelist.
2114 	 * To prevent a deadlock undo what we have
2115 	 * done so far and return failure. This
2116 	 * situation can only happen while promoting
2117 	 * PAGESIZE pages.
2118 	 */
2119 	page_promote_err++;
2120 	while (pplist) {
2121 		pp = pplist;
2122 		mach_page_sub(&pplist, pp);
2123 		pp->p_szc = 0;
2124 		bin = PP_2_BIN(pp);
2125 		mtype = PP_2_MTYPE(pp);
2126 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2127 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2128 	}
2129 	return (NULL);
2130 
2131 }
2132 
2133 /*
2134  * Break up a large page into smaller size pages.
2135  * Pages involved are on the freelist before the call and may
2136  * be returned to the caller if requested, otherwise they will
2137  * be placed back on the freelist.
2138  * The caller is responsible for locking the freelist as well as any other
2139  * accounting which needs to be done for a returned page.
2140  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2141  * technically, any value may be passed in but PC_NO_COLOR is the standard
2142  * which should be followed for clarity's sake.
2143  * Returns a page whose pfn is < pfnmax
2144  */
2145 page_t *
2146 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2147     uchar_t new_szc, int color, int flags)
2148 {
2149 	page_t	*pp, *pplist, *npplist;
2150 	pgcnt_t	npgs, n;
2151 	uint_t	bin;
2152 	uint_t	mtype;
2153 	page_t	*ret_pp = NULL;
2154 
2155 	ASSERT(cur_szc != 0);
2156 	ASSERT(new_szc < cur_szc);
2157 
2158 	pplist = page_numtopp_nolock(pfnum);
2159 	ASSERT(pplist != NULL);
2160 
2161 	ASSERT(pplist->p_szc == cur_szc);
2162 
2163 	bin = PP_2_BIN(pplist);
2164 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2165 	mtype = PP_2_MTYPE(pplist);
2166 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2167 
2168 	CHK_LPG(pplist, cur_szc);
2169 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2170 
2171 	/*
2172 	 * Number of PAGESIZE pages for smaller new_szc
2173 	 * page.
2174 	 */
2175 	npgs = page_get_pagecnt(new_szc);
2176 
2177 	while (pplist) {
2178 		pp = pplist;
2179 
2180 		ASSERT(pp->p_szc == cur_szc);
2181 
2182 		/*
2183 		 * We either break it up into PAGESIZE pages or larger.
2184 		 */
2185 		if (npgs == 1) {	/* PAGESIZE case */
2186 			mach_page_sub(&pplist, pp);
2187 			ASSERT(pp->p_szc == cur_szc);
2188 			ASSERT(new_szc == 0);
2189 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2190 			pp->p_szc = new_szc;
2191 			bin = PP_2_BIN(pp);
2192 			if ((bin == color) && (flags == PC_ALLOC) &&
2193 			    (ret_pp == NULL) && (pfnmax == 0 ||
2194 			    pp->p_pagenum < pfnmax) &&
2195 			    page_trylock_cons(pp, SE_EXCL)) {
2196 				ret_pp = pp;
2197 			} else {
2198 				mtype = PP_2_MTYPE(pp);
2199 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2200 				    mtype), pp);
2201 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2202 			}
2203 		} else {
2204 			page_t *try_to_return_this_page = NULL;
2205 			int count = 0;
2206 
2207 			/*
2208 			 * Break down into smaller lists of pages.
2209 			 */
2210 			page_list_break(&pplist, &npplist, npgs);
2211 
2212 			pp = pplist;
2213 			n = npgs;
2214 			while (n--) {
2215 				ASSERT(pp->p_szc == cur_szc);
2216 				/*
2217 				 * Check whether all the pages in this list
2218 				 * fit the request criteria.
2219 				 */
2220 				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2221 					count++;
2222 				}
2223 				pp->p_szc = new_szc;
2224 				pp = pp->p_next;
2225 			}
2226 
2227 			if (count == npgs &&
2228 			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2229 				try_to_return_this_page = pp;
2230 			}
2231 
2232 			CHK_LPG(pplist, new_szc);
2233 
2234 			bin = PP_2_BIN(pplist);
2235 			if (try_to_return_this_page)
2236 				ASSERT(mnode ==
2237 				    PP_2_MEM_NODE(try_to_return_this_page));
2238 			if ((bin == color) && (flags == PC_ALLOC) &&
2239 			    (ret_pp == NULL) && try_to_return_this_page &&
2240 			    page_trylock_cons(try_to_return_this_page,
2241 			    SE_EXCL)) {
2242 				ret_pp = try_to_return_this_page;
2243 			} else {
2244 				mtype = PP_2_MTYPE(pp);
2245 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2246 				    bin, mtype), pplist);
2247 
2248 				page_ctr_add(mnode, mtype, pplist,
2249 				    PG_FREE_LIST);
2250 			}
2251 			pplist = npplist;
2252 		}
2253 	}
2254 	return (ret_pp);
2255 }
2256 
/*
 * When non-zero, coalescing of free pages is switched off:
 * page_freelist_coalesce() returns NULL without searching and
 * page_freelist_coalesce_all() returns without promoting any pages.
 */
2257 int mpss_coalesce_disable = 0;
2258 
2259 /*
2260  * Coalesce free pages into a page of the given szc and color if possible.
2261  * Return the pointer to the page created, otherwise, return NULL.
2262  *
2263  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 *
 * The search is limited to the pfn range of 'mtype' within 'mnode'.  Only
 * the color bits selected by 'ceq_mask' have to match 'color'.  The scan
 * starts at the lowest search index saved for the matching colors that
 * have candidates, runs to the end of the range and then wraps around once.
 *
 * A region is handed to page_promote() only if its page counter reads
 * "full" both before and after the freelist of the mnode is locked.  On
 * success the saved search index for the color of the new page is updated
 * and the locks taken here are dropped before the page is returned.
 *
 * Returns NULL right away when mpss_coalesce_disable is set.
2264  */
2265 page_t *
2266 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2267     int mtype, pfn_t pfnhi)
2268 {
2269 	int 	r = szc;		/* region size */
2270 	int	mrange;
2271 	uint_t 	full, bin, color_mask, wrap = 0;
2272 	pfn_t	pfnum, lo, hi;
2273 	size_t	len, idx, idx0;
2274 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
2275 	page_t	*ret_pp;
2276 	MEM_NODE_ITERATOR_DECL(it);
2277 #if defined(__sparc)
2278 	pfn_t pfnum0, nlo, nhi;
2279 #endif
2280 
2281 	if (mpss_coalesce_disable) {
2282 		ASSERT(szc < MMU_PAGE_SIZES);
2283 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2284 		return (NULL);
2285 	}
2286 
2287 	ASSERT(szc < mmu_page_sizes);
2288 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2289 	ASSERT(ceq_mask <= color_mask);
2290 	ASSERT(color <= color_mask);
	/* from here on only the color bits that must match are kept */
2291 	color &= ceq_mask;
2292 
2293 	/* Prevent page_counters dynamic memory from being freed */
2294 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2295 
2296 	mrange = MTYPE_2_MRANGE(mnode, mtype);
2297 	ASSERT(mrange < mnode_nranges[mnode]);
2298 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2299 
2300 	/* get pfn range for mtype */
2301 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
2302 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	/* make hi an exclusive upper bound */
2303 	hi++;
2304 
2305 	/* use lower limit if given */
2306 	if (pfnhi != PFNNULL && pfnhi < hi)
2307 		hi = pfnhi;
2308 
2309 	/* round to szcpgcnt boundaries */
2310 	lo = P2ROUNDUP(lo, szcpgcnt);
2311 	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
	/* (pfn_t)-1 is treated as "no usable pfn"; give up */
2312 	if (lo == (pfn_t)-1) {
2313 		rw_exit(&page_ctrs_rwlock[mnode]);
2314 		return (NULL);
2315 	}
2316 	hi = hi & ~(szcpgcnt - 1);
2317 
2318 	/* set lo to the closest pfn of the right color */
2319 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2320 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2321 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2322 		    &it);
2323 	}
2324 
	/* nothing left to search once the range has collapsed */
2325 	if (hi <= lo) {
2326 		rw_exit(&page_ctrs_rwlock[mnode]);
2327 		return (NULL);
2328 	}
2329 
2330 	full = FULL_REGION_CNT(r);
2331 
2332 	/* calculate the number of page candidates and initial search index */
	/*
	 * Walk every bin that is equivalent to 'color' under ceq_mask,
	 * adding up the candidate counts and remembering the smallest
	 * saved search index among the bins that have candidates.
	 */
2333 	bin = color;
2334 	idx0 = (size_t)(-1);
2335 	do {
2336 		pgcnt_t acand;
2337 
2338 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2339 		if (acand) {
2340 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2341 			    r, bin, mrange);
2342 			idx0 = MIN(idx0, idx);
2343 			cands += acand;
2344 		}
2345 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2346 	} while (bin != color);
2347 
2348 	if (cands == 0) {
2349 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2350 		rw_exit(&page_ctrs_rwlock[mnode]);
2351 		return (NULL);
2352 	}
2353 
	/*
	 * Turn the saved index into a starting pfn.  Fall back to lo when
	 * that pfn is outside [lo, hi) or cannot be used, and move forward
	 * to the next pfn of an acceptable color when needed.
	 */
2354 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
2355 	if (pfnum < lo || pfnum >= hi) {
2356 		pfnum = lo;
2357 	} else {
2358 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2359 		if (pfnum == (pfn_t)-1) {
2360 			pfnum = lo;
2361 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2362 			ASSERT(pfnum != (pfn_t)-1);
2363 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2364 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2365 			/* invalid color, get the closest correct pfn */
2366 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2367 			    color_mask, &it);
2368 			if (pfnum >= hi) {
2369 				pfnum = lo;
2370 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2371 			}
2372 		}
2373 	}
2374 
2375 	/* set starting index */
2376 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2377 	ASSERT(idx0 < len);
2378 
2379 #if defined(__sparc)
2380 	pfnum0 = pfnum;		/* page corresponding to idx0 */
2381 	nhi = 0;		/* search kcage ranges */
2382 #endif
2383 
	/*
	 * Scan from idx0 towards the end of the range.  After the first
	 * wrap the scan restarts at lo and continues only while idx is
	 * still below idx0; a second wrap ends the loop.
	 */
2384 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2385 
2386 #if defined(__sparc)
2387 		/*
2388 		 * Find lowest intersection of kcage ranges and mnode.
2389 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
2390 		 */
2391 		if (nhi <= pfnum) {
2392 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2393 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2394 				goto wrapit;
2395 
2396 			/* jump to the next page in the range */
2397 			if (pfnum < nlo) {
2398 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2399 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2400 				idx = PNUM_TO_IDX(mnode, r, pfnum);
2401 				if (idx >= len || pfnum >= hi)
2402 					goto wrapit;
2403 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2404 				    ceq_mask)
2405 					goto next;
2406 				if (interleaved_mnodes &&
2407 				    PFN_2_MEM_NODE(pfnum) != mnode)
2408 					goto next;
2409 			}
2410 		}
2411 #endif
2412 
		/* unlocked peek: skip regions that are not completely free */
2413 		if (PAGE_COUNTERS(mnode, r, idx) != full)
2414 			goto next;
2415 
2416 		/*
2417 		 * RFE: For performance maybe we can do something less
2418 		 *	brutal than locking the entire freelist. So far
2419 		 * 	this doesn't seem to be a performance problem?
2420 		 */
2421 		page_freelist_lock(mnode);
		/* recheck now that the freelist is locked */
2422 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
2423 			ret_pp =
2424 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2425 			if (ret_pp != NULL) {
2426 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				/* remember where the search succeeded */
2427 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2428 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2429 				page_freelist_unlock(mnode);
2430 				rw_exit(&page_ctrs_rwlock[mnode]);
2431 #if defined(__sparc)
2432 				if (PP_ISNORELOC(ret_pp)) {
2433 					pgcnt_t npgs;
2434 
2435 					npgs = page_get_pagecnt(ret_pp->p_szc);
2436 					kcage_freemem_sub(npgs);
2437 				}
2438 #endif
2439 				return (ret_pp);
2440 			}
2441 		} else {
2442 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2443 		}
2444 
2445 		page_freelist_unlock(mnode);
2446 		/*
2447 		 * No point looking for another page if we've
2448 		 * already tried all of the ones that
2449 		 * page_ctr_cands indicated.  Stash off where we left
2450 		 * off.
2451 		 * Note: this is not exact since we don't hold the
2452 		 * page_freelist_locks before we initially get the
2453 		 * value of cands for performance reasons, but should
2454 		 * be a decent approximation.
2455 		 */
2456 		if (--cands == 0) {
2457 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2458 			    idx;
2459 			break;
2460 		}
2461 next:
		/* advance to the next pfn whose color is acceptable */
2462 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2463 		    color_mask, &it);
2464 		idx = PNUM_TO_IDX(mnode, r, pfnum);
2465 		if (idx >= len || pfnum >= hi) {
2466 wrapit:
			/*
			 * Restart at the bottom of the range.  This label is
			 * also the target of the gotos in the __sparc kcage
			 * code above.
			 */
2467 			pfnum = lo;
2468 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2469 			idx = PNUM_TO_IDX(mnode, r, pfnum);
2470 			wrap++;
2471 #if defined(__sparc)
2472 			nhi = 0;	/* search kcage ranges */
2473 #endif
2474 		}
2475 	}
2476 
	/* the search ended without producing a page */
2477 	rw_exit(&page_ctrs_rwlock[mnode]);
2478 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2479 	return (NULL);
2480 }
2481 
2482 /*
2483  * For the given mnode, promote as many small pages to large pages as possible.
2484  * mnode can be -1, which means do them all
2485  */
2486 void
2487 page_freelist_coalesce_all(int mnode)
2488 {
2489 	int 	r;		/* region size */
2490 	int 	idx, full;
2491 	size_t	len;
2492 	int doall = interleaved_mnodes || mnode < 0;
2493 	int mlo = doall ? 0 : mnode;
2494 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2495 
2496 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2497 
2498 	if (mpss_coalesce_disable) {
2499 		return;
2500 	}
2501 
2502 	/*
2503 	 * Lock the entire freelist and coalesce what we can.
2504 	 *
2505 	 * Always promote to the largest page possible
2506 	 * first to reduce the number of page promotions.
2507 	 */
2508 	for (mnode = mlo; mnode < mhi; mnode++) {
2509 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2510 		page_freelist_lock(mnode);
2511 	}
2512 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2513 		for (mnode = mlo; mnode < mhi; mnode++) {
2514 			pgcnt_t cands = 0;
2515 			int mrange, nranges = mnode_nranges[mnode];
2516 
2517 			for (mrange = 0; mrange < nranges; mrange++) {
2518 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2519 				if (cands != 0)
2520 					break;
2521 			}
2522 			if (cands == 0) {
2523 				VM_STAT_ADD(vmm_vmstats.
2524 				    page_ctrs_cands_skip_all);
2525 				continue;
2526 			}
2527 
2528 			full = FULL_REGION_CNT(r);
2529 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2530 
2531 			for (idx = 0; idx < len; idx++) {
2532 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2533 					pfn_t pfnum =
2534 					    IDX_TO_PNUM(mnode, r, idx);
2535 					int tmnode = interleaved_mnodes ?
2536 					    PFN_2_MEM_NODE(pfnum) : mnode;
2537 
2538 					ASSERT(pfnum >=
2539 					    mem_node_config[tmnode].physbase &&
2540 					    pfnum <
2541 					    mem_node_config[tmnode].physmax);
2542 
2543 					(void) page_promote(tmnode,
2544 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2545 				}
2546 			}
2547 			/* shared hpm_counters covers all mnodes, so we quit */
2548 			if (interleaved_mnodes)
2549 				break;
2550 		}
2551 	}
2552 	for (mnode = mlo; mnode < mhi; mnode++) {
2553 		page_freelist_unlock(mnode);
2554 		rw_exit(&page_ctrs_rwlock[mnode]);
2555 	}
2556 }
2557 
2558 /*
2559  * This is where all polices for moving pages around
2560  * to different page size free lists is implemented.
2561  * Returns 1 on success, 0 on failure.
2562  *
2563  * So far these are the priorities for this algorithm in descending
2564  * order:
2565  *
2566  *	1) When servicing a request try to do so with a free page
2567  *	   from next size up. Helps defer fragmentation as long
2568  *	   as possible.
2569  *
2570  *	2) Page coalesce on demand. Only when a freelist
2571  *	   larger than PAGESIZE is empty and step 1
2572  *	   will not work since all larger size lists are
2573  *	   also empty.
2574  *
2575  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2576  */
2577 
2578 page_t *
2579 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2580     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2581 {
2582 	uchar_t nszc = szc + 1;
2583 	uint_t 	bin, sbin, bin_prev;
2584 	page_t	*pp, *firstpp;
2585 	page_t	*ret_pp = NULL;
2586 	uint_t  color_mask;
2587 
2588 	if (nszc == mmu_page_sizes)
2589 		return (NULL);
2590 
2591 	ASSERT(nszc < mmu_page_sizes);
2592 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2593 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2594 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2595 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2596 
2597 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2598 	/*
2599 	 * First try to break up a larger page to fill current size freelist.
2600 	 */
2601 	while (plw->plw_bins[nszc] != 0) {
2602 
2603 		ASSERT(nszc < mmu_page_sizes);
2604 
2605 		/*
2606 		 * If page found then demote it.
2607 		 */
2608 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2609 			page_freelist_lock(mnode);
2610 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2611 
2612 			/*
2613 			 * If pfnhi is not PFNNULL, look for large page below
2614 			 * pfnhi. PFNNULL signifies no pfn requirement.
2615 			 */
2616 			if (pp &&
2617 			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2618 			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2619 				do {
2620 					pp = pp->p_vpnext;
2621 					if (pp == firstpp) {
2622 						pp = NULL;
2623 						break;
2624 					}
2625 				} while ((pfnhi != PFNNULL &&
2626 				    pp->p_pagenum >= pfnhi) ||
2627 				    (pfnlo != PFNNULL &&
2628 				    pp->p_pagenum < pfnlo));
2629 
2630 				if (pfnhi != PFNNULL && pp != NULL)
2631 					ASSERT(pp->p_pagenum < pfnhi);
2632 
2633 				if (pfnlo != PFNNULL && pp != NULL)
2634 					ASSERT(pp->p_pagenum >= pfnlo);
2635 			}
2636 			if (pp) {
2637 				uint_t ccolor = page_correct_color(szc, nszc,
2638 				    color, bin, plw->plw_ceq_mask[szc]);
2639 
2640 				ASSERT(pp->p_szc == nszc);
2641 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2642 				ret_pp = page_demote(mnode, pp->p_pagenum,
2643 				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2644 				if (ret_pp) {
2645 					page_freelist_unlock(mnode);
2646 #if defined(__sparc)
2647 					if (PP_ISNORELOC(ret_pp)) {
2648 						pgcnt_t npgs;
2649 
2650 						npgs = page_get_pagecnt(
2651 						    ret_pp->p_szc);
2652 						kcage_freemem_sub(npgs);
2653 					}
2654 #endif
2655 					return (ret_pp);
2656 				}
2657 			}
2658 			page_freelist_unlock(mnode);
2659 		}
2660 
2661 		/* loop through next size bins */
2662 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2663 		plw->plw_bins[nszc]--;
2664 
2665 		if (bin == sbin) {
2666 			uchar_t nnszc = nszc + 1;
2667 
2668 			/* we are done with this page size - check next */
2669 			if (plw->plw_bins[nnszc] == 0)
2670 				/* we have already checked next size bins */
2671 				break;
2672 
2673 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2674 			if (bin_prev != INVALID_COLOR) {
2675 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2676 				if (!((bin ^ bin_prev) &
2677 				    plw->plw_ceq_mask[nnszc]))
2678 					break;
2679 			}
2680 			ASSERT(nnszc < mmu_page_sizes);
2681 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2682 			nszc = nnszc;
2683 			ASSERT(nszc < mmu_page_sizes);
2684 		}
2685 	}
2686 
2687 	return (ret_pp);
2688 }
2689 
2690 /*
2691  * Helper routine used only by the freelist code to lock
2692  * a page. If the page is a large page then it succeeds in
2693  * locking all the constituent pages or none at all.
2694  * Returns 1 on sucess, 0 on failure.
2695  */
2696 static int
2697 page_trylock_cons(page_t *pp, se_t se)
2698 {
2699 	page_t	*tpp, *first_pp = pp;
2700 
2701 	/*
2702 	 * Fail if can't lock first or only page.
2703 	 */
2704 	if (!page_trylock(pp, se)) {
2705 		return (0);
2706 	}
2707 
2708 	/*
2709 	 * PAGESIZE: common case.
2710 	 */
2711 	if (pp->p_szc == 0) {
2712 		return (1);
2713 	}
2714 
2715 	/*
2716 	 * Large page case.
2717 	 */
2718 	tpp = pp->p_next;
2719 	while (tpp != pp) {
2720 		if (!page_trylock(tpp, se)) {
2721 			/*
2722 			 * On failure unlock what we have locked so far.
2723 			 * We want to avoid attempting to capture these
2724 			 * pages as the pcm mutex may be held which could
2725 			 * lead to a recursive mutex panic.
2726 			 */
2727 			while (first_pp != tpp) {
2728 				page_unlock_nocapture(first_pp);
2729 				first_pp = first_pp->p_next;
2730 			}
2731 			return (0);
2732 		}
2733 		tpp = tpp->p_next;
2734 	}
2735 	return (1);
2736 }
2737 
2738 /*
2739  * init context for walking page lists
2740  * Called when a page of the given szc in unavailable. Sets markers
2741  * for the beginning of the search to detect when search has
2742  * completed a full cycle. Sets flags for splitting larger pages
2743  * and coalescing smaller pages. Page walking procedes until a page
2744  * of the desired equivalent color is found.
2745  */
2746 void
2747 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2748     int use_ceq, page_list_walker_t *plw)
2749 {
2750 	uint_t  nszc, ceq_mask, colors;
2751 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2752 
2753 	ASSERT(szc < mmu_page_sizes);
2754 	colors = PAGE_GET_PAGECOLORS(szc);
2755 
2756 	plw->plw_colors = colors;
2757 	plw->plw_color_mask = colors - 1;
2758 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2759 	plw->plw_bin_split_prev = bin;
2760 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2761 
2762 	/*
2763 	 * if vac aliasing is possible make sure lower order color
2764 	 * bits are never ignored
2765 	 */
2766 	if (vac_colors > 1)
2767 		ceq &= 0xf0;
2768 
2769 	/*
2770 	 * calculate the number of non-equivalent colors and
2771 	 * color equivalency mask
2772 	 */
2773 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2774 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2775 	ASSERT(plw->plw_ceq_dif > 0);
2776 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2777 
2778 	if (flags & PG_MATCH_COLOR) {
2779 		if (cpu_page_colors <  0) {
2780 			/*
2781 			 * this is a heterogeneous machine with different CPUs
2782 			 * having different size e$ (not supported for ni2/rock
2783 			 */
2784 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2785 			cpucolors = MAX(cpucolors, 1);
2786 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2787 			plw->plw_ceq_mask[szc] =
2788 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2789 		}
2790 		plw->plw_ceq_dif = 1;
2791 	}
2792 
2793 	/* we can split pages in the freelist, but not the cachelist */
2794 	if (can_split) {
2795 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2796 
2797 		/* set next szc color masks and number of free list bins */
2798 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2799 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2800 			    plw->plw_ceq_mask[szc]);
2801 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2802 		}
2803 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2804 		plw->plw_bins[nszc] = 0;
2805 
2806 	} else {
2807 		ASSERT(szc == 0);
2808 		plw->plw_do_split = 0;
2809 		plw->plw_bins[1] = 0;
2810 		plw->plw_ceq_mask[1] = INVALID_MASK;
2811 	}
2812 }
2813 
/*
 * Set the mark that flags where the next split should occur.
 *
 * Computes the next-size color of 'bin', steps it to the next
 * non-equivalent color and stores the result in plw_split_next.  If that
 * color is equivalent (under plw_ceq_mask[nszc]) to the next-size color of
 * plw_bin0, it is stepped once more.
 *
 * Arguments are parenthesized and the body is wrapped in do/while (0) so
 * the macro can be used with any expression and as a single statement,
 * including in an unbraced if/else.  The locals carry a psm_ prefix so they
 * do not shadow variables of the caller.
 */
#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) do {		     \
	uint_t psm_bin_nsz = PAGE_GET_NSZ_COLOR((szc), (bin));		     \
	uint_t psm_bin0_nsz = PAGE_GET_NSZ_COLOR((szc), (plw)->plw_bin0);    \
	uint_t psm_neq_mask =						     \
	    ~(plw)->plw_ceq_mask[(nszc)] & (plw)->plw_color_mask;	     \
	(plw)->plw_split_next =						     \
	    INC_MASKED(psm_bin_nsz, psm_neq_mask, (plw)->plw_color_mask);    \
	if (!(((plw)->plw_split_next ^ psm_bin0_nsz) &			     \
	    (plw)->plw_ceq_mask[(nszc)])) {				     \
		(plw)->plw_split_next =					     \
		    INC_MASKED((plw)->plw_split_next,			     \
		    psm_neq_mask, (plw)->plw_color_mask);		     \
	}								     \
	/* CONSTCOND */							     \
} while (0)
2829 
2830 uint_t
2831 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2832 {
2833 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2834 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2835 	uchar_t nszc = szc + 1;
2836 
2837 	nbin = ADD_MASKED(bin,
2838 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2839 
2840 	if (plw->plw_do_split) {
2841 		plw->plw_bin_split_prev = bin;
2842 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2843 		plw->plw_do_split = 0;
2844 	}
2845 
2846 	if (szc == 0) {
2847 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2848 			if (nbin == plw->plw_bin0 &&
2849 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2850 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2851 				    neq_mask, plw->plw_color_mask);
2852 				plw->plw_bin_split_prev = plw->plw_bin0;
2853 			}
2854 
2855 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2856 				plw->plw_bin_marker =
2857 				    nbin = INC_MASKED(nbin, neq_mask,
2858 				    plw->plw_color_mask);
2859 				plw->plw_bin_split_prev = plw->plw_bin0;
2860 				/*
2861 				 * large pages all have the same vac color
2862 				 * so by now we should be done with next
2863 				 * size page splitting process
2864 				 */
2865 				ASSERT(plw->plw_bins[1] == 0);
2866 				plw->plw_do_split = 0;
2867 				return (nbin);
2868 			}
2869 
2870 		} else {
2871 			uint_t bin_jump = (vac_colors == 1) ?
2872 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2873 
2874 			bin_jump &= ~(vac_colors - 1);
2875 
2876 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2877 			    plw->plw_color_mask);
2878 
2879 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2880 
2881 				plw->plw_bin_marker = nbin = nbin0;
2882 
2883 				if (plw->plw_bins[nszc] != 0) {
2884 					/*
2885 					 * check if next page size bin is the
2886 					 * same as the next page size bin for
2887 					 * bin0
2888 					 */
2889 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2890 					    nbin);
2891 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2892 					    plw->plw_bin0);
2893 
2894 					if ((bin0_nsz ^ nbin_nsz) &
2895 					    plw->plw_ceq_mask[nszc])
2896 						plw->plw_do_split = 1;
2897 				}
2898 				return (nbin);
2899 			}
2900 		}
2901 	}
2902 
2903 	if (plw->plw_bins[nszc] != 0) {
2904 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2905 		if (!((plw->plw_split_next ^ nbin_nsz) &
2906 		    plw->plw_ceq_mask[nszc]))
2907 			plw->plw_do_split = 1;
2908 	}
2909 
2910 	return (nbin);
2911 }
2912 
/*
 * Get a free page of size code 'szc' from the freelists of memory node
 * 'mnode', starting the search at color bin 'bin'.
 *
 * For the current mtype the search order is:
 *	- the requested bin and the bins of equivalent color,
 *	- splitting a larger free page (page_freelist_split()),
 *	- for szc > 0, coalescing smaller free pages
 *	  (page_freelist_coalesce()),
 *	- the bins of other colors, chosen by page_list_walk_next_bin().
 * If that yields nothing, MTYPE_NEXT() selects the next mtype and the
 * search starts over; a negative mtype ends it.
 *
 * A page found on a freelist has been locked with
 * page_trylock_cons(pp, SE_EXCL) and removed from that list before it is
 * returned.  Returns NULL if no page could be obtained.
 */
2913 page_t *
2914 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2915     uint_t flags)
2916 {
2917 	kmutex_t		*pcm;
2918 	page_t			*pp, *first_pp;
2919 	uint_t			sbin;
2920 	int			plw_initialized;
2921 	page_list_walker_t	plw;
2922 
2923 	ASSERT(szc < mmu_page_sizes);
2924 
2925 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2926 
2927 	MTYPE_START(mnode, mtype, flags);
2928 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2929 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2930 		return (NULL);
2931 	}
2932 try_again:
2933 
	/*
	 * The walker is set up lazily, the first time a bin turns out to
	 * be empty.  Until then plw_ceq_dif is 1 so that the outer loop
	 * runs at least once.
	 */
2934 	plw_initialized = 0;
2935 	plw.plw_ceq_dif = 1;
2936 
2937 	/*
2938 	 * Only hold one freelist lock at a time, that way we
2939 	 * can start anywhere and not have to worry about lock
2940 	 * ordering.
2941 	 */
2942 	for (plw.plw_count = 0;
2943 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2944 		sbin = bin;
2945 		do {
			/* unlocked peek; checked again under the bin mutex */
2946 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2947 				goto bin_empty_1;
2948 
2949 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2950 			mutex_enter(pcm);
2951 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2952 			if (pp == NULL)
2953 				goto bin_empty_0;
2954 
2955 			/*
2956 			 * These were set before the page
2957 			 * was put on the free list,
2958 			 * they must still be set.
2959 			 */
2960 			ASSERT(PP_ISFREE(pp));
2961 			ASSERT(PP_ISAGED(pp));
2962 			ASSERT(pp->p_vnode == NULL);
2963 			ASSERT(pp->p_hash == NULL);
2964 			ASSERT(pp->p_offset == (u_offset_t)-1);
2965 			ASSERT(pp->p_szc == szc);
2966 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2967 
2968 			/*
2969 			 * Walk down the hash chain.
2970 			 * 8k pages are linked on p_next
2971 			 * and p_prev fields. Large pages
2972 			 * are a contiguous group of
2973 			 * constituent pages linked together
2974 			 * on their p_next and p_prev fields.
2975 			 * The large pages are linked together
2976 			 * on the hash chain using p_vpnext
2977 			 * p_vpprev of the base constituent
2978 			 * page of each large page.
2979 			 */
2980 			first_pp = pp;
			/*
			 * Skip pages that cannot be locked or that are
			 * dump pages.  Getting back to the first page
			 * means the bin has nothing usable.
			 */
2981 			while (!page_trylock_cons(pp, SE_EXCL) ||
2982 			    IS_DUMP_PAGE(pp)) {
2983 				if (szc == 0) {
2984 					pp = pp->p_next;
2985 				} else {
2986 					pp = pp->p_vpnext;
2987 				}
2988 
2989 				ASSERT(PP_ISFREE(pp));
2990 				ASSERT(PP_ISAGED(pp));
2991 				ASSERT(pp->p_vnode == NULL);
2992 				ASSERT(pp->p_hash == NULL);
2993 				ASSERT(pp->p_offset == (u_offset_t)-1);
2994 				ASSERT(pp->p_szc == szc);
2995 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2996 
2997 				if (pp == first_pp)
2998 					goto bin_empty_0;
2999 			}
3000 
			/* pp is locked: take it off the list and account */
3001 			ASSERT(pp != NULL);
3002 			ASSERT(mtype == PP_2_MTYPE(pp));
3003 			ASSERT(pp->p_szc == szc);
3004 			if (szc == 0) {
3005 				page_sub(&PAGE_FREELISTS(mnode,
3006 				    szc, bin, mtype), pp);
3007 			} else {
3008 				page_vpsub(&PAGE_FREELISTS(mnode,
3009 				    szc, bin, mtype), pp);
3010 				CHK_LPG(pp, szc);
3011 			}
3012 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3013 
3014 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
3015 				panic("free page is not. pp %p", (void *)pp);
3016 			mutex_exit(pcm);
3017 
3018 #if defined(__sparc)
3019 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
3020 			    (flags & PG_NORELOC) == 0);
3021 
3022 			if (PP_ISNORELOC(pp))
3023 				kcage_freemem_sub(page_get_pagecnt(szc));
3024 #endif
3025 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3026 			return (pp);
3027 
			/*
			 * bin_empty_0: reached with the bin mutex held.
			 * bin_empty_1: reached without it.
			 */
3028 bin_empty_0:
3029 			mutex_exit(pcm);
3030 bin_empty_1:
3031 			if (plw_initialized == 0) {
3032 				page_list_walk_init(szc, flags, bin, 1, 1,
3033 				    &plw);
3034 				plw_initialized = 1;
3035 				ASSERT(plw.plw_colors <=
3036 				    PAGE_GET_PAGECOLORS(szc));
3037 				ASSERT(plw.plw_colors > 0);
3038 				ASSERT((plw.plw_colors &
3039 				    (plw.plw_colors - 1)) == 0);
3040 				ASSERT(bin < plw.plw_colors);
3041 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3042 			}
3043 			/* calculate the next bin with equivalent color */
3044 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3045 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
3046 		} while (sbin != bin);
3047 
3048 		/*
3049 		 * color bins are all empty if color match. Try and
3050 		 * satisfy the request by breaking up or coalescing
3051 		 * pages from a different size freelist of the correct
3052 		 * color that satisfies the ORIGINAL color requested.
3053 		 * If that fails then try pages of the same size but
3054 		 * different colors assuming we are not called with
3055 		 * PG_MATCH_COLOR.
3056 		 */
3057 		if (plw.plw_do_split &&
3058 		    (pp = page_freelist_split(szc, bin, mnode,
3059 		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3060 			return (pp);
3061 
3062 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3063 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
3064 			return (pp);
3065 
		/* more than one non-equivalent color: try the next one */
3066 		if (plw.plw_ceq_dif > 1)
3067 			bin = page_list_walk_next_bin(szc, bin, &plw);
3068 	}
3069 
3070 	/* if allowed, cycle through additional mtypes */
3071 	MTYPE_NEXT(mnode, mtype, flags);
3072 	if (mtype >= 0)
3073 		goto try_again;
3074 
3075 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3076 
3077 	return (NULL);
3078 }
3079 
3080 /*
3081  * Returns the count of free pages for 'pp' with size code 'szc'.
3082  * Note: This function does not return an exact value as the page freelist
3083  * locks are not held and thus the values in the page_counters may be
3084  * changing as we walk through the data.
3085  */
3086 static int
3087 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3088 {
3089 	pgcnt_t	pgfree;
3090 	pgcnt_t cnt;
3091 	ssize_t	r = szc;	/* region size */
3092 	ssize_t	idx;
3093 	int	i;
3094 	int	full, range;
3095 
3096 	/* Make sure pagenum passed in is aligned properly */
3097 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3098 	ASSERT(szc > 0);
3099 
3100 	/* Prevent page_counters dynamic memory from being freed */
3101 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3102 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3103 	cnt = PAGE_COUNTERS(mnode, r, idx);
3104 	pgfree = cnt << PNUM_SHIFT(r - 1);
3105 	range = FULL_REGION_CNT(szc);
3106 
3107 	/* Check for completely full region */
3108 	if (cnt == range) {
3109 		rw_exit(&page_ctrs_rwlock[mnode]);
3110 		return (pgfree);
3111 	}
3112 
3113 	while (--r > 0) {
3114 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3115 		full = FULL_REGION_CNT(r);
3116 		for (i = 0; i < range; i++, idx++) {
3117 			cnt = PAGE_COUNTERS(mnode, r, idx);
3118 			/*
3119 			 * If cnt here is full, that means we have already
3120 			 * accounted for these pages earlier.
3121 			 */
3122 			if (cnt != full) {
3123 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3124 			}
3125 		}
3126 		range *= full;
3127 	}
3128 	rw_exit(&page_ctrs_rwlock[mnode]);
3129 	return (pgfree);
3130 }
3131 
3132 /*
3133  * Called from page_geti_contig_pages to exclusively lock constituent pages
3134  * starting from 'spp' for page size code 'szc'.
3135  *
3136  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3137  * region needs to be greater than or equal to the threshold.
3138  */
3139 static int
3140 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3141 {
3142 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3143 	pgcnt_t pgfree, i;
3144 	page_t *pp;
3145 
3146 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3147 
3148 
3149 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3150 		goto skipptcpcheck;
3151 	/*
3152 	 * check if there are sufficient free pages available before attempting
3153 	 * to trylock. Count is approximate as page counters can change.
3154 	 */
3155 	pgfree = page_freecnt(mnode, spp, szc);
3156 
3157 	/* attempt to trylock if there are sufficient already free pages */
3158 	if (pgfree < pgcnt/ptcpthreshold) {
3159 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3160 		return (0);
3161 	}
3162 
3163 skipptcpcheck:
3164 
3165 	for (i = 0; i < pgcnt; i++) {
3166 		pp = &spp[i];
3167 		if (!page_trylock(pp, SE_EXCL)) {
3168 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3169 			while (--i != (pgcnt_t)-1) {
3170 				pp = &spp[i];
3171 				ASSERT(PAGE_EXCL(pp));
3172 				page_unlock_nocapture(pp);
3173 			}
3174 			return (0);
3175 		}
3176 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3177 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3178 		    !PP_ISFREE(pp)) {
3179 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3180 			ASSERT(i == 0);
3181 			page_unlock_nocapture(pp);
3182 			return (0);
3183 		}
3184 		if (PP_ISNORELOC(pp)) {
3185 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3186 			while (i != (pgcnt_t)-1) {
3187 				pp = &spp[i];
3188 				ASSERT(PAGE_EXCL(pp));
3189 				page_unlock_nocapture(pp);
3190 				i--;
3191 			}
3192 			return (0);
3193 		}
3194 	}
3195 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3196 	return (1);
3197 }
3198 
3199 /*
3200  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3201  * of 'szc' constituent pages that had been locked exclusively previously.
3202  * Will attempt to relocate constituent pages in use.
3203  */
3204 static page_t *
3205 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3206 {
3207 	spgcnt_t pgcnt, npgs, i;
3208 	page_t *targpp, *rpp, *hpp;
3209 	page_t *replpp = NULL;
3210 	page_t *pplist = NULL;
3211 
3212 	ASSERT(pp != NULL);
3213 
3214 	pgcnt = page_get_pagecnt(szc);
3215 	while (pgcnt) {
3216 		ASSERT(PAGE_EXCL(pp));
3217 		ASSERT(!PP_ISNORELOC(pp));
3218 		if (PP_ISFREE(pp)) {
3219 			/*
3220 			 * If this is a PG_FREE_LIST page then its
3221 			 * size code can change underneath us due to
3222 			 * page promotion or demotion. As an optimzation
3223 			 * use page_list_sub_pages() instead of
3224 			 * page_list_sub().
3225 			 */
3226 			if (PP_ISAGED(pp)) {
3227 				page_list_sub_pages(pp, szc);
3228 				if (pp->p_szc == szc) {
3229 					return (pp);
3230 				}
3231 				ASSERT(pp->p_szc < szc);
3232 				npgs = page_get_pagecnt(pp->p_szc);
3233 				hpp = pp;
3234 				for (i = 0; i < npgs; i++, pp++) {
3235 					pp->p_szc = szc;
3236 				}
3237 				page_list_concat(&pplist, &hpp);
3238 				pgcnt -= npgs;
3239 				continue;
3240 			}
3241 			ASSERT(!PP_ISAGED(pp));
3242 			ASSERT(pp->p_szc == 0);
3243 			page_list_sub(pp, PG_CACHE_LIST);
3244 			page_hashout(pp, NULL);
3245 			PP_SETAGED(pp);
3246 			pp->p_szc = szc;
3247 			page_list_concat(&pplist, &pp);
3248 			pp++;
3249 			pgcnt--;
3250 			continue;
3251 		}
3252 		npgs = page_get_pagecnt(pp->p_szc);
3253 
3254 		/*
3255 		 * page_create_wait freemem accounting done by caller of
3256 		 * page_get_freelist and not necessary to call it prior to
3257 		 * calling page_get_replacement_page.
3258 		 *
3259 		 * page_get_replacement_page can call page_get_contig_pages
3260 		 * to acquire a large page (szc > 0); the replacement must be
3261 		 * smaller than the contig page size to avoid looping or
3262 		 * szc == 0 and PGI_PGCPSZC0 is set.
3263 		 */
3264 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3265 			replpp = page_get_replacement_page(pp, NULL, 0);
3266 			if (replpp) {
3267 				npgs = page_get_pagecnt(pp->p_szc);
3268 				ASSERT(npgs <= pgcnt);
3269 				targpp = pp;
3270 			}
3271 		}
3272 
3273 		/*
3274 		 * If replacement is NULL or do_page_relocate fails, fail
3275 		 * coalescing of pages.
3276 		 */
3277 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3278 		    &npgs, NULL) != 0)) {
3279 			/*
3280 			 * Unlock un-processed target list
3281 			 */
3282 			while (pgcnt--) {
3283 				ASSERT(PAGE_EXCL(pp));
3284 				page_unlock_nocapture(pp);
3285 				pp++;
3286 			}
3287 			/*
3288 			 * Free the processed target list.
3289 			 */
3290 			while (pplist) {
3291 				pp = pplist;
3292 				page_sub(&pplist, pp);
3293 				ASSERT(PAGE_EXCL(pp));
3294 				ASSERT(pp->p_szc == szc);
3295 				ASSERT(PP_ISFREE(pp));
3296 				ASSERT(PP_ISAGED(pp));
3297 				pp->p_szc = 0;
3298 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3299 				page_unlock_nocapture(pp);
3300 			}
3301 
3302 			if (replpp != NULL)
3303 				page_free_replacement_page(replpp);
3304 
3305 			return (NULL);
3306 		}
3307 		ASSERT(pp == targpp);
3308 
3309 		/* LINTED */
3310 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3311 
3312 		pp += npgs;
3313 		pgcnt -= npgs;
3314 
3315 		while (npgs--) {
3316 			ASSERT(PAGE_EXCL(targpp));
3317 			ASSERT(!PP_ISFREE(targpp));
3318 			ASSERT(!PP_ISNORELOC(targpp));
3319 			PP_SETFREE(targpp);
3320 			ASSERT(PP_ISAGED(targpp));
3321 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3322 			    (flags & PGI_PGCPSZC0)));
3323 			targpp->p_szc = szc;
3324 			targpp = targpp->p_next;
3325 
3326 			rpp = replpp;
3327 			ASSERT(rpp != NULL);
3328 			page_sub(&replpp, rpp);
3329 			ASSERT(PAGE_EXCL(rpp));
3330 			ASSERT(!PP_ISFREE(rpp));
3331 			page_unlock_nocapture(rpp);
3332 		}
3333 		ASSERT(targpp == hpp);
3334 		ASSERT(replpp == NULL);
3335 		page_list_concat(&pplist, &targpp);
3336 	}
3337 	CHK_LPG(pplist, szc);
3338 	return (pplist);
3339 }
3340 
3341 /*
3342  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3343  * of 0 means nothing left after trim.
3344  */
3345 int
3346 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3347 {
3348 	pfn_t	kcagepfn;
3349 	int	decr;
3350 	int	rc = 0;
3351 
3352 	if (PP_ISNORELOC(mseg->pages)) {
3353 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3354 
3355 			/* lower part of this mseg inside kernel cage */
3356 			decr = kcage_current_pfn(&kcagepfn);
3357 
3358 			/* kernel cage may have transitioned past mseg */
3359 			if (kcagepfn >= mseg->pages_base &&
3360 			    kcagepfn < mseg->pages_end) {
3361 				ASSERT(decr == 0);
3362 				*lo = MAX(kcagepfn, pfnlo);
3363 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
3364 				rc = 1;
3365 			}
3366 		}
3367 		/* else entire mseg in the cage */
3368 	} else {
3369 		if (PP_ISNORELOC(mseg->epages - 1)) {
3370 
3371 			/* upper part of this mseg inside kernel cage */
3372 			decr = kcage_current_pfn(&kcagepfn);
3373 
3374 			/* kernel cage may have transitioned past mseg */
3375 			if (kcagepfn >= mseg->pages_base &&
3376 			    kcagepfn < mseg->pages_end) {
3377 				ASSERT(decr);
3378 				*hi = MIN(kcagepfn, pfnhi);
3379 				*lo = MAX(pfnlo, mseg->pages_base);
3380 				rc = 1;
3381 			}
3382 		} else {
3383 			/* entire mseg outside of kernel cage */
3384 			*lo = MAX(pfnlo, mseg->pages_base);
3385 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3386 			rc = 1;
3387 		}
3388 	}
3389 	return (rc);
3390 }
3391 
3392 /*
3393  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3394  * page with size code 'szc'. Claiming such a page requires acquiring
3395  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3396  * relocating pages in use and concatenating these constituent pages into a
3397  * large page.
3398  *
3399  * The page lists do not have such a large page and page_freelist_split has
3400  * already failed to demote larger pages and/or coalesce smaller free pages.
3401  *
3402  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3403  * pages with the same color as 'bin'.
3404  *
3405  * 'pfnflag' specifies the subset of the pfn range to search.
3406  */
3407 
3408 static page_t *
3409 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3410     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3411 {
3412 	struct memseg *mseg;
3413 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3414 	pgcnt_t szcpgmask = szcpgcnt - 1;
3415 	pfn_t	randpfn;
3416 	page_t *pp, *randpp, *endpp;
3417 	uint_t colors, ceq_mask;
3418 	/* LINTED : set but not used in function */
3419 	uint_t color_mask;
3420 	pfn_t hi, lo;
3421 	uint_t skip;
3422 	MEM_NODE_ITERATOR_DECL(it);
3423 
3424 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3425 
3426 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3427 
3428 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3429 		return (NULL);
3430 
3431 	ASSERT(szc < mmu_page_sizes);
3432 
3433 	colors = PAGE_GET_PAGECOLORS(szc);
3434 	color_mask = colors - 1;
3435 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3436 		uchar_t ceq = colorequivszc[szc];
3437 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3438 
3439 		ASSERT(ceq_dif > 0);
3440 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3441 	} else {
3442 		ceq_mask = 0;
3443 	}
3444 
3445 	ASSERT(bin < colors);
3446 
3447 	/* clear "non-significant" color bits */
3448 	bin &= ceq_mask;
3449 
3450 	/*
3451 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3452 	 * when there have been previous page_get_contig_page failures to
3453 	 * limit the search.
3454 	 *
3455 	 * The high bit in pfnflag specifies the number of 'slots' in the
3456 	 * pfn range and the remainder of pfnflag specifies which slot.
3457 	 * For example, a value of 1010b would mean the second slot of
3458 	 * the pfn range that has been divided into 8 slots.
3459 	 */
3460 	if (pfnflag > 1) {
3461 		int	slots = 1 << (highbit(pfnflag) - 1);
3462 		int	slotid = pfnflag & (slots - 1);
3463 		pgcnt_t	szcpages;
3464 		int	slotlen;
3465 
3466 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3467 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3468 		slotlen = howmany(szcpages, slots);
3469 		/* skip if 'slotid' slot is empty */
3470 		if (slotid * slotlen >= szcpages)
3471 			return (NULL);
3472 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3473 		ASSERT(pfnlo < pfnhi);
3474 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3475 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3476 	}
3477 
3478 	/*
3479 	 * This routine is can be called recursively so we shouldn't
3480 	 * acquire a reader lock if a write request is pending. This
3481 	 * could lead to a deadlock with the DR thread.
3482 	 *
3483 	 * Returning NULL informs the caller that we could not get
3484 	 * a contig page with the required characteristics.
3485 	 */
3486 
3487 	if (!memsegs_trylock(0))
3488 		return (NULL);
3489 
3490 	/*
3491 	 * loop through memsegs to look for contig page candidates
3492 	 */
3493 
3494 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3495 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3496 			/* no overlap */
3497 			continue;
3498 		}
3499 
3500 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3501 			/* mseg too small */
3502 			continue;
3503 
3504 		/*
3505 		 * trim off kernel cage pages from pfn range and check for
3506 		 * a trimmed pfn range returned that does not span the
3507 		 * desired large page size.
3508 		 */
3509 		if (kcage_on) {
3510 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3511 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3512 				continue;
3513 		} else {
3514 			lo = MAX(pfnlo, mseg->pages_base);
3515 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3516 		}
3517 
3518 		/* round to szcpgcnt boundaries */
3519 		lo = P2ROUNDUP(lo, szcpgcnt);
3520 
3521 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3522 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3523 
3524 		if (hi <= lo)
3525 			continue;
3526 
3527 		/*
3528 		 * set lo to point to the pfn for the desired bin. Large
3529 		 * page sizes may only have a single page color
3530 		 */
3531 		skip = szcpgcnt;
3532 		if (ceq_mask > 0 || interleaved_mnodes) {
3533 			/* set lo to point at appropriate color */
3534 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3535 			    (interleaved_mnodes &&
3536 			    PFN_2_MEM_NODE(lo) != mnode)) {
3537 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3538 				    color_mask, &it);
3539 			}
3540 			if (hi <= lo)
3541 				/* mseg cannot satisfy color request */
3542 				continue;
3543 		}
3544 
3545 		/* randomly choose a point between lo and hi to begin search */
3546 
3547 		randpfn = (pfn_t)GETTICK();
3548 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3549 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3550 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3551 			if (randpfn != (pfn_t)-1) {
3552 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3553 				    ceq_mask, color_mask, &it);
3554 			}
3555 			if (randpfn >= hi) {
3556 				randpfn = lo;
3557 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3558 				    &it);
3559 			}
3560 		}
3561 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3562 
3563 		ASSERT(randpp->p_pagenum == randpfn);
3564 
3565 		pp = randpp;
3566 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3567 
3568 		ASSERT(randpp + szcpgcnt <= endpp);
3569 
3570 		do {
3571 			ASSERT(!(pp->p_pagenum & szcpgmask));
3572 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3573 
3574 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3575 				/* pages unlocked by page_claim on failure */
3576 				if (page_claim_contig_pages(pp, szc, flags)) {
3577 					memsegs_unlock(0);
3578 					return (pp);
3579 				}
3580 			}
3581 
3582 			if (ceq_mask == 0 && !interleaved_mnodes) {
3583 				pp += skip;
3584 			} else {
3585 				pfn_t pfn = pp->p_pagenum;
3586 
3587 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3588 				    ceq_mask, color_mask, &it);
3589 				if (pfn == (pfn_t)-1) {
3590 					pp = endpp;
3591 				} else {
3592 					pp = mseg->pages +
3593 					    (pfn - mseg->pages_base);
3594 				}
3595 			}
3596 			if (pp >= endpp) {
3597 				/* start from the beginning */
3598 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3599 				pp = mseg->pages + (lo - mseg->pages_base);
3600 				ASSERT(pp->p_pagenum == lo);
3601 				ASSERT(pp + szcpgcnt <= endpp);
3602 			}
3603 		} while (pp != randpp);
3604 	}
3605 	memsegs_unlock(0);
3606 	return (NULL);
3607 }
3608 
3609 
3610 /*
3611  * controlling routine that searches through physical memory in an attempt to
3612  * claim a large page based on the input parameters.
3613  * on the page free lists.
3614  *
3615  * calls page_geti_contig_pages with an initial pfn range from the mnode
3616  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3617  * that overlaps with the kernel cage or does not match the requested page
3618  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3619  * page_geti_contig_pages may further limit the search range based on
3620  * previous failure counts (pgcpfailcnt[]).
3621  *
3622  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3623  * pagesize page that satisfies mtype.
3624  */
3625 page_t *
3626 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3627     uint_t flags)
3628 {
3629 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3630 	page_t		*pp;
3631 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3632 
3633 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3634 
3635 	/* no allocations from cage */
3636 	flags |= PGI_NOCAGE;
3637 
3638 	/* LINTED */
3639 	MTYPE_START(mnode, mtype, flags);
3640 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3641 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3642 		return (NULL);
3643 	}
3644 
3645 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3646 
3647 	/* do not limit search and ignore color if hi pri */
3648 
3649 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3650 		pfnflag = pgcpfailcnt[szc];
3651 
3652 	/* remove color match to improve chances */
3653 
3654 	if (flags & PGI_PGCPHIPRI || pfnflag)
3655 		flags &= ~PG_MATCH_COLOR;
3656 
3657 	do {
3658 		/* get pfn range based on mnode and mtype */
3659 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3660 
3661 		ASSERT(pfnhi >= pfnlo);
3662 
3663 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3664 		    pfnlo, pfnhi, pfnflag);
3665 
3666 		if (pp != NULL) {
3667 			pfnflag = pgcpfailcnt[szc];
3668 			if (pfnflag) {
3669 				/* double the search size */
3670 				pgcpfailcnt[szc] = pfnflag >> 1;
3671 			}
3672 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3673 			return (pp);
3674 		}
3675 		MTYPE_NEXT(mnode, mtype, flags);
3676 	} while (mtype >= 0);
3677 
3678 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3679 	return (NULL);
3680 }
3681 
3682 #if defined(__i386) || defined(__amd64)
3683 /*
3684  * Determine the likelihood of finding/coalescing a szc page.
3685  * Return 0 if the likelihood is small otherwise return 1.
3686  *
3687  * For now, be conservative and check only 1g pages and return 0
3688  * if there had been previous coalescing failures and the szc pages
3689  * needed to satisfy request would exhaust most of freemem.
3690  */
3691 int
3692 page_chk_freelist(uint_t szc)
3693 {
3694 	pgcnt_t		pgcnt;
3695 
3696 	if (szc <= 1)
3697 		return (1);
3698 
3699 	pgcnt = page_get_pagecnt(szc);
3700 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3701 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3702 		return (0);
3703 	}
3704 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3705 	return (1);
3706 }
3707 #endif
3708 
3709 /*
3710  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3711  *
3712  * Does its own locking and accounting.
3713  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3714  * pages of the proper color even if there are pages of a different color.
3715  *
3716  * Finds a page, removes it, THEN locks it.
3717  */
3718 
3719 /*ARGSUSED*/
3720 page_t *
3721 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3722 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3723 {
3724 	struct as	*as = seg->s_as;
3725 	page_t		*pp = NULL;
3726 	ulong_t		bin;
3727 	uchar_t		szc;
3728 	int		mnode;
3729 	int		mtype;
3730 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3731 	lgrp_mnode_cookie_t	lgrp_cookie;
3732 
3733 	page_get_func = page_get_mnode_freelist;
3734 
3735 	/*
3736 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3737 	 * assume we wish to allocate near to the current thread's home.
3738 	 */
3739 	if (!LGRP_EXISTS(lgrp))
3740 		lgrp = lgrp_home_lgrp();
3741 
3742 	if (kcage_on) {
3743 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3744 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3745 		    curthread != kcage_cageout_thread) {
3746 			/*
3747 			 * Set a "reserve" of kcage_throttlefree pages for
3748 			 * PG_PANIC and cageout thread allocations.
3749 			 *
3750 			 * Everybody else has to serialize in
3751 			 * page_create_get_something() to get a cage page, so
3752 			 * that we don't deadlock cageout!
3753 			 */
3754 			return (NULL);
3755 		}
3756 	} else {
3757 		flags &= ~PG_NORELOC;
3758 		flags |= PGI_NOCAGE;
3759 	}
3760 
3761 	/* LINTED */
3762 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3763 
3764 	/*
3765 	 * Convert size to page size code.
3766 	 */
3767 	if ((szc = page_szc(size)) == (uchar_t)-1)
3768 		panic("page_get_freelist: illegal page size request");
3769 	ASSERT(szc < mmu_page_sizes);
3770 
3771 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3772 
3773 	/* LINTED */
3774 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3775 
3776 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3777 
3778 	/*
3779 	 * Try to get a local page first, but try remote if we can't
3780 	 * get a page of the right color.
3781 	 */
3782 pgretry:
3783 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3784 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3785 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3786 		if (pp != NULL) {
3787 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3788 			DTRACE_PROBE4(page__get,
3789 			    lgrp_t *, lgrp,
3790 			    int, mnode,
3791 			    ulong_t, bin,
3792 			    uint_t, flags);
3793 			return (pp);
3794 		}
3795 	}
3796 	ASSERT(pp == NULL);
3797 
3798 	/*
3799 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3800 	 * remote free lists.  Caller expected to call page_get_cachelist which
3801 	 * will check local cache lists and remote free lists.
3802 	 */
3803 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3804 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3805 		return (NULL);
3806 	}
3807 
3808 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3809 
3810 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3811 
3812 	if (!(flags & PG_LOCAL)) {
3813 		/*
3814 		 * Try to get a non-local freelist page.
3815 		 */
3816 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3817 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3818 			pp = page_get_func(mnode, bin, mtype, szc, flags);
3819 			if (pp != NULL) {
3820 				DTRACE_PROBE4(page__get,
3821 				    lgrp_t *, lgrp,
3822 				    int, mnode,
3823 				    ulong_t, bin,
3824 				    uint_t, flags);
3825 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3826 				return (pp);
3827 			}
3828 		}
3829 		ASSERT(pp == NULL);
3830 	}
3831 
3832 	/*
3833 	 * when the cage is off chances are page_get_contig_pages() will fail
3834 	 * to lock a large page chunk therefore when the cage is off it's not
3835 	 * called by default.  this can be changed via /etc/system.
3836 	 *
3837 	 * page_get_contig_pages() also called to acquire a base pagesize page
3838 	 * for page_create_get_something().
3839 	 */
3840 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3841 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3842 	    (page_get_func != page_get_contig_pages)) {
3843 
3844 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3845 		page_get_func = page_get_contig_pages;
3846 		goto pgretry;
3847 	}
3848 
3849 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3850 	    page_get_func == page_get_contig_pages)
3851 		SETPGCPFAILCNT(szc);
3852 
3853 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3854 	return (NULL);
3855 }
3856 
3857 /*
3858  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3859  *
3860  * Does its own locking.
3861  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3862  * pages of the proper color even if there are pages of a different color.
3863  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3864  * try to lock one of them.  If no page can be locked, try the
3865  * next bin.  Return NULL if a page can not be found and locked.
3866  *
3867  * Finds a pages, trys to lock it, then removes it.
3868  */
3869 
3870 /*ARGSUSED*/
3871 page_t *
3872 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3873     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3874 {
3875 	page_t		*pp;
3876 	struct as	*as = seg->s_as;
3877 	ulong_t		bin;
3878 	/*LINTED*/
3879 	int		mnode;
3880 	int		mtype;
3881 	lgrp_mnode_cookie_t	lgrp_cookie;
3882 
3883 	/*
3884 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
3885 	 * assume we wish to allocate near to the current thread's home.
3886 	 */
3887 	if (!LGRP_EXISTS(lgrp))
3888 		lgrp = lgrp_home_lgrp();
3889 
3890 	if (!kcage_on) {
3891 		flags &= ~PG_NORELOC;
3892 		flags |= PGI_NOCAGE;
3893 	}
3894 
3895 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3896 	    kcage_freemem <= kcage_throttlefree) {
3897 		/*
3898 		 * Reserve kcage_throttlefree pages for critical kernel
3899 		 * threads.
3900 		 *
3901 		 * Everybody else has to go to page_create_get_something()
3902 		 * to get a cage page, so we don't deadlock cageout.
3903 		 */
3904 		return (NULL);
3905 	}
3906 
3907 	/* LINTED */
3908 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3909 
3910 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3911 
3912 	/* LINTED */
3913 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3914 
3915 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3916 
3917 	/*
3918 	 * Try local cachelists first
3919 	 */
3920 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3921 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3922 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3923 		if (pp != NULL) {
3924 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3925 			DTRACE_PROBE4(page__get,
3926 			    lgrp_t *, lgrp,
3927 			    int, mnode,
3928 			    ulong_t, bin,
3929 			    uint_t, flags);
3930 			return (pp);
3931 		}
3932 	}
3933 
3934 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3935 
3936 	/*
3937 	 * Try freelists/cachelists that are farther away
3938 	 * This is our only chance to allocate remote pages for PAGESIZE
3939 	 * requests.
3940 	 */
3941 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3942 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3943 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3944 		    0, flags);
3945 		if (pp != NULL) {
3946 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3947 			DTRACE_PROBE4(page__get,
3948 			    lgrp_t *, lgrp,
3949 			    int, mnode,
3950 			    ulong_t, bin,
3951 			    uint_t, flags);
3952 			return (pp);
3953 		}
3954 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3955 		if (pp != NULL) {
3956 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3957 			DTRACE_PROBE4(page__get,
3958 			    lgrp_t *, lgrp,
3959 			    int, mnode,
3960 			    ulong_t, bin,
3961 			    uint_t, flags);
3962 			return (pp);
3963 		}
3964 	}
3965 
3966 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3967 	return (NULL);
3968 }
3969 
3970 page_t *
3971 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3972 {
3973 	kmutex_t		*pcm;
3974 	page_t			*pp, *first_pp;
3975 	uint_t			sbin;
3976 	int			plw_initialized;
3977 	page_list_walker_t	plw;
3978 
3979 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3980 
3981 	/* LINTED */
3982 	MTYPE_START(mnode, mtype, flags);
3983 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3984 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3985 		return (NULL);
3986 	}
3987 
3988 try_again:
3989 
3990 	plw_initialized = 0;
3991 	plw.plw_ceq_dif = 1;
3992 
3993 	/*
3994 	 * Only hold one cachelist lock at a time, that way we
3995 	 * can start anywhere and not have to worry about lock
3996 	 * ordering.
3997 	 */
3998 
3999 	for (plw.plw_count = 0;
4000 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4001 		sbin = bin;
4002 		do {
4003 
4004 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
4005 				goto bin_empty_1;
4006 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4007 			mutex_enter(pcm);
4008 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
4009 			if (pp == NULL)
4010 				goto bin_empty_0;
4011 
4012 			first_pp = pp;
4013 			ASSERT(pp->p_vnode);
4014 			ASSERT(PP_ISAGED(pp) == 0);
4015 			ASSERT(pp->p_szc == 0);
4016 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4017 			while (!page_trylock(pp, SE_EXCL)) {
4018 				pp = pp->p_next;
4019 				ASSERT(pp->p_szc == 0);
4020 				if (pp == first_pp) {
4021 					/*
4022 					 * We have searched the complete list!
4023 					 * And all of them (might only be one)
4024 					 * are locked. This can happen since
4025 					 * these pages can also be found via
4026 					 * the hash list. When found via the
4027 					 * hash list, they are locked first,
4028 					 * then removed. We give up to let the
4029 					 * other thread run.
4030 					 */
4031 					pp = NULL;
4032 					break;
4033 				}
4034 				ASSERT(pp->p_vnode);
4035 				ASSERT(PP_ISFREE(pp));
4036 				ASSERT(PP_ISAGED(pp) == 0);
4037 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4038 				    mnode);
4039 			}
4040 
4041 			if (pp) {
4042 				page_t	**ppp;
4043 				/*
4044 				 * Found and locked a page.
4045 				 * Pull it off the list.
4046 				 */
4047 				ASSERT(mtype == PP_2_MTYPE(pp));
4048 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4049 				page_sub(ppp, pp);
4050 				/*
4051 				 * Subtract counters before releasing pcm mutex
4052 				 * to avoid a race with page_freelist_coalesce
4053 				 * and page_freelist_split.
4054 				 */
4055 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4056 				mutex_exit(pcm);
4057 				ASSERT(pp->p_vnode);
4058 				ASSERT(PP_ISAGED(pp) == 0);
4059 #if defined(__sparc)
4060 				ASSERT(!kcage_on ||
4061 				    (flags & PG_NORELOC) == 0 ||
4062 				    PP_ISNORELOC(pp));
4063 				if (PP_ISNORELOC(pp)) {
4064 					kcage_freemem_sub(1);
4065 				}
4066 #endif
4067 				VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
4068 				return (pp);
4069 			}
4070 bin_empty_0:
4071 			mutex_exit(pcm);
4072 bin_empty_1:
4073 			if (plw_initialized == 0) {
4074 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
4075 				plw_initialized = 1;
4076 			}
4077 			/* calculate the next bin with equivalent color */
4078 			bin = ADD_MASKED(bin, plw.plw_bin_step,
4079 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
4080 		} while (sbin != bin);
4081 
4082 		if (plw.plw_ceq_dif > 1)
4083 			bin = page_list_walk_next_bin(0, bin, &plw);
4084 	}
4085 
4086 	MTYPE_NEXT(mnode, mtype, flags);
4087 	if (mtype >= 0)
4088 		goto try_again;
4089 
4090 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4091 	return (NULL);
4092 }
4093 
/*
 * Replacement page statistics are collected on DEBUG kernels only.
 */
#if defined(DEBUG)
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#if defined(REPL_PAGE_STATS)
struct repl_page_stats {
	uint_t	ngets;		/* calls to page_get_replacement_page */
	uint_t	ngets_noreloc;	/* ... where the page is PP_ISNORELOC */
	uint_t	npgr_noreloc;	/* ... where PGR_NORELOC was requested */
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
/* atomically bump the named repl_page_stats counter by one */
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
/* statistics compiled out: expands to nothing */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int	pgrppgcp;
4115 
4116 /*
4117  * The freemem accounting must be done by the caller.
4118  * First we try to get a replacement page of the same size as like_pp,
4119  * if that is not possible, then we just get a set of discontiguous
4120  * PAGESIZE pages.
 *
 * orig_like_pp	page to be replaced.  It must be held exclusively
 *		(PAGE_EXCL); the search is driven from the base page_t of
 *		the p_szc sized page it belongs to.
 * lgrp_target	if LGRP_EXISTS() is true for it, only the memnodes of that
 *		lgroup are searched (LGRP_SRCH_LOCAL), and a large page
 *		request that fails there is not retried with smaller pages.
 *		Otherwise the search starts at like_pp's own memnode and
 *		then walks the other memnodes with LGRP_SRCH_HIER.
 * pgrflags	PGR_NORELOC requests PG_NORELOC pages (unless like_pp is
 *		itself PP_ISNORELOC, in which case PGI_RELOCONLY is used).
 *		PGR_SAMESZC forbids falling back to PAGESIZE pages; it is
 *		forced on for PP_ISKAS pages.
 *
 * Returns a list of pages covering the whole of the original page, each
 * with its free and aged bits cleared, or NULL if the request could not
 * be satisfied; any partially built list is released with
 * page_free_replacement_page() before NULL is returned.
4121  */
4122 page_t *
4123 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4124     uint_t pgrflags)
4125 {
4126 	page_t		*like_pp;
4127 	page_t		*pp, *pplist;
4128 	page_t		*pl = NULL;
4129 	ulong_t		bin;
4130 	int		mnode, page_mnode;
4131 	int		szc;
4132 	spgcnt_t	npgs, pg_cnt;
4133 	pfn_t		pfnum;
4134 	int		mtype;
4135 	int		flags = 0;
4136 	lgrp_mnode_cookie_t	lgrp_cookie;
4137 	lgrp_t		*lgrp;
4138 
4139 	REPL_STAT_INCR(ngets);
4140 	like_pp = orig_like_pp;
4141 	ASSERT(PAGE_EXCL(like_pp));
4142 
4143 	szc = like_pp->p_szc;
4144 	npgs = page_get_pagecnt(szc);
4145 	/*
4146 	 * Now we reset like_pp to the base page_t.
4147 	 * That way, we won't walk past the end of this 'szc' page.
4148 	 */
4149 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4150 	like_pp = page_numtopp_nolock(pfnum);
4151 	ASSERT(like_pp->p_szc == szc);
4152 
4153 	if (PP_ISNORELOC(like_pp)) {
4154 		ASSERT(kcage_on);
4155 		REPL_STAT_INCR(ngets_noreloc);
4156 		flags = PGI_RELOCONLY;
4157 	} else if (pgrflags & PGR_NORELOC) {
4158 		ASSERT(kcage_on);
4159 		REPL_STAT_INCR(npgr_noreloc);
4160 		flags = PG_NORELOC;
4161 	}
4162 
4163 	/*
4164 	 * Kernel pages must always be replaced with the same size
4165 	 * pages, since we cannot properly handle demotion of kernel
4166 	 * pages.
4167 	 */
4168 	if (PP_ISKAS(like_pp))
4169 		pgrflags |= PGR_SAMESZC;
4170 
4171 	/* LINTED */
4172 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4173 
	/*
	 * Each pass of this loop obtains one page of size szc (initially
	 * like_pp's size, possibly reset to 0 below) and moves its
	 * constituent pages onto pl, until npgs pages have been gathered
	 * or a search comes up empty.
	 */
4174 	while (npgs) {
4175 		pplist = NULL;
		/*
		 * Every path through this for loop ends in a break except
		 * the one that resets szc to 0 at the bottom, so it runs a
		 * second time only to retry a failed large page search with
		 * PAGESIZE pages.
		 */
4176 		for (;;) {
4177 			pg_cnt = page_get_pagecnt(szc);
4178 			bin = PP_2_BIN(like_pp);
4179 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4180 			ASSERT(pg_cnt <= npgs);
4181 
4182 			/*
4183 			 * If an lgroup was specified, try to get the
4184 			 * page from that lgroup.
4185 			 * NOTE: Must be careful with code below because
4186 			 *	 lgroup may disappear and reappear since there
4187 			 *	 is no locking for lgroup here.
4188 			 */
4189 			if (LGRP_EXISTS(lgrp_target)) {
4190 				/*
4191 				 * Keep local variable for lgroup separate
4192 				 * from lgroup argument since this code should
4193 				 * only be exercised when lgroup argument
4194 				 * exists....
4195 				 */
4196 				lgrp = lgrp_target;
4197 
4198 				/* Try the lgroup's freelists first */
4199 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4200 				    LGRP_SRCH_LOCAL);
4201 				while ((pplist == NULL) &&
4202 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4203 				    != -1) {
4204 					pplist =
4205 					    page_get_mnode_freelist(mnode, bin,
4206 					    mtype, szc, flags);
4207 				}
4208 
4209 				/*
4210 				 * Now try its cachelists if this is a
4211 				 * small page. Don't need to do it for
4212 				 * larger ones since page_freelist_coalesce()
4213 				 * already failed.
4214 				 */
4215 				if (pplist != NULL || szc != 0)
4216 					break;
4217 
4218 				/* Now try its cachelists */
4219 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4220 				    LGRP_SRCH_LOCAL);
4221 
4222 				while ((pplist == NULL) &&
4223 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4224 				    != -1) {
4225 					pplist =
4226 					    page_get_mnode_cachelist(bin, flags,
4227 					    mnode, mtype);
4228 				}
4229 				if (pplist != NULL) {
					/*
					 * NOTE(review): a cachelist page
					 * still has a vnode and is not aged
					 * (page_get_mnode_cachelist() asserts
					 * both); presumably page_hashout()
					 * drops that identity so the page can
					 * be handled like a freelist page
					 * below -- confirm.
					 */
4230 					page_hashout(pplist, NULL);
4231 					PP_SETAGED(pplist);
4232 					REPL_STAT_INCR(nhashout);
4233 					break;
4234 				}
4235 				/* Done looking in this lgroup. Bail out. */
4236 				break;
4237 			}
4238 
4239 			/*
4240 			 * No lgroup was specified (or lgroup was removed by
4241 			 * DR), so just try to get the page as close to
4242 			 * like_pp's mnode as possible.
4243 			 * First try the local freelist...
4244 			 */
4245 			mnode = PP_2_MEM_NODE(like_pp);
4246 			pplist = page_get_mnode_freelist(mnode, bin,
4247 			    mtype, szc, flags);
4248 			if (pplist != NULL)
4249 				break;
4250 
4251 			REPL_STAT_INCR(nnofree);
4252 
4253 			/*
4254 			 * ...then the local cachelist. Don't need to do it for
4255 			 * larger pages because page_freelist_coalesce() already
4256 			 * failed there anyway.
4257 			 */
4258 			if (szc == 0) {
4259 				pplist = page_get_mnode_cachelist(bin, flags,
4260 				    mnode, mtype);
4261 				if (pplist != NULL) {
4262 					page_hashout(pplist, NULL);
4263 					PP_SETAGED(pplist);
4264 					REPL_STAT_INCR(nhashout);
4265 					break;
4266 				}
4267 			}
4268 
4269 			/* Now try remote freelists */
4270 			page_mnode = mnode;
4271 			lgrp =
4272 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4273 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4274 			    LGRP_SRCH_HIER);
4275 			while (pplist == NULL &&
4276 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4277 			    != -1) {
4278 				/*
4279 				 * Skip local mnode.
4280 				 */
4281 				if ((mnode == page_mnode) ||
4282 				    (mem_node_config[mnode].exists == 0))
4283 					continue;
4284 
4285 				pplist = page_get_mnode_freelist(mnode,
4286 				    bin, mtype, szc, flags);
4287 			}
4288 
4289 			if (pplist != NULL)
4290 				break;
4291 
4292 
4293 			/* Now try remote cachelists */
4294 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4295 			    LGRP_SRCH_HIER);
4296 			while (pplist == NULL && szc == 0) {
4297 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4298 				if (mnode == -1)
4299 					break;
4300 				/*
4301 				 * Skip local mnode.
4302 				 */
4303 				if ((mnode == page_mnode) ||
4304 				    (mem_node_config[mnode].exists == 0))
4305 					continue;
4306 
4307 				pplist = page_get_mnode_cachelist(bin,
4308 				    flags, mnode, mtype);
4309 
4310 				if (pplist != NULL) {
4311 					page_hashout(pplist, NULL);
4312 					PP_SETAGED(pplist);
4313 					REPL_STAT_INCR(nhashout);
4314 					break;
4315 				}
4316 			}
4317 
4318 			/*
4319 			 * Break out of the inner for loop in these cases:
4320 			 * - If we successfully got a page.
4321 			 * - If pgrflags specified only returning a specific
4322 			 *   page size and we could not find that page size.
4323 			 * - If we could not satisfy the request with PAGESIZE
4324 			 *   or larger pages.
4325 			 */
4326 			if (pplist != NULL || szc == 0)
4327 				break;
4328 
4329 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4330 				/* try to find contig page */
4331 
4332 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4333 				    LGRP_SRCH_HIER);
4334 
4335 				while ((pplist == NULL) &&
4336 				    (mnode =
4337 				    lgrp_memnode_choose(&lgrp_cookie))
4338 				    != -1) {
4339 					pplist = page_get_contig_pages(
4340 					    mnode, bin, mtype, szc,
4341 					    flags | PGI_PGCPHIPRI);
4342 				}
4343 				break;
4344 			}
4345 
4346 			/*
4347 			 * The correct thing to do here is try the next
4348 			 * page size down using szc--. Due to a bug
4349 			 * with the processing of HAT_RELOAD_SHARE
4350 			 * where the sfmmu_ttecnt arrays of all
4351 			 * hats sharing an ISM segment don't get updated,
4352 			 * using intermediate size pages for relocation
4353 			 * can lead to continuous page faults.
4354 			 */
4355 			szc = 0;
4356 		}
4357 
4358 		if (pplist != NULL) {
4359 			DTRACE_PROBE4(page__get,
4360 			    lgrp_t *, lgrp,
4361 			    int, mnode,
4362 			    ulong_t, bin,
4363 			    uint_t, flags);
4364 
			/*
			 * Move the constituent pages of pplist onto pl,
			 * clearing their free and aged bits, and step
			 * like_pp forward one page_t per page taken.
			 * pg_cnt is decremented only while pplist is still
			 * non-NULL (&& short-circuits), so it ends at
			 * exactly 0 when pplist held pg_cnt pages, which is
			 * what the ASSERT after the loop checks.
			 */
4365 			while (pplist != NULL && pg_cnt--) {
4366 				ASSERT(pplist != NULL);
4367 				pp = pplist;
4368 				page_sub(&pplist, pp);
4369 				PP_CLRFREE(pp);
4370 				PP_CLRAGED(pp);
4371 				page_list_concat(&pl, &pp);
4372 				npgs--;
4373 				like_pp = like_pp + 1;
4374 				REPL_STAT_INCR(nnext_pp);
4375 			}
4376 			ASSERT(pg_cnt == 0);
4377 		} else {
			/* Nothing found at this size: give up. */
4378 			break;
4379 		}
4380 	}
4381 
4382 	if (npgs) {
4383 		/*
4384 		 * We were unable to allocate the necessary number
4385 		 * of pages.
4386 		 * We need to free up any pl.
4387 		 */
4388 		REPL_STAT_INCR(nnopage);
4389 		page_free_replacement_page(pl);
4390 		return (NULL);
4391 	} else {
4392 		return (pl);
4393 	}
4394 }
4395 
4396 /*
4397  * demote a free large page to it's constituent pages
4398  */
4399 void
4400 page_demote_free_pages(page_t *pp)
4401 {
4402 
4403 	int mnode;
4404 
4405 	ASSERT(pp != NULL);
4406 	ASSERT(PAGE_LOCKED(pp));
4407 	ASSERT(PP_ISFREE(pp));
4408 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4409 
4410 	mnode = PP_2_MEM_NODE(pp);
4411 	page_freelist_lock(mnode);
4412 	if (pp->p_szc != 0) {
4413 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4414 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4415 	}
4416 	page_freelist_unlock(mnode);
4417 	ASSERT(pp->p_szc == 0);
4418 }
4419 
4420 /*
4421  * Factor in colorequiv to check additional 'equivalent' bins.
4422  * colorequiv may be set in /etc/system
4423  */
4424 void
4425 page_set_colorequiv_arr(void)
4426 {
4427 	if (colorequiv > 1) {
4428 		int i;
4429 		uint_t sv_a = lowbit(colorequiv) - 1;
4430 
4431 		if (sv_a > 15)
4432 			sv_a = 15;
4433 
4434 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4435 			uint_t colors;
4436 			uint_t a = sv_a;
4437 
4438 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4439 				continue;
4440 			}
4441 			while ((colors >> a) == 0)
4442 				a--;
4443 			if ((a << 4) > colorequivszc[i]) {
4444 				colorequivszc[i] = (a << 4);
4445 			}
4446 		}
4447 	}
4448 }
4449