xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 7d0b359ca572cd04474eb1f2ceec5a8ff39e36c9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright 2012 Joyent, Inc.  All rights reserved.
27  */
28 
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /*	All Rights Reserved   */
31 
32 /*
33  * Portions of this source code were derived from Berkeley 4.3 BSD
34  * under license from the Regents of the University of California.
35  */
36 
37 
38 /*
39  * This file contains common functions to access and manage the page lists.
40  * Many of these routines originated from platform dependent modules
41  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
42  * a platform independent manner.
43  *
44  * vm/vm_dep.h provides for platform specific support.
45  */
46 
47 #include <sys/types.h>
48 #include <sys/debug.h>
49 #include <sys/cmn_err.h>
50 #include <sys/systm.h>
51 #include <sys/atomic.h>
52 #include <sys/sysmacros.h>
53 #include <vm/as.h>
54 #include <vm/page.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_vn.h>
57 #include <sys/vmsystm.h>
58 #include <sys/memnode.h>
59 #include <vm/vm_dep.h>
60 #include <sys/lgrp.h>
61 #include <sys/mem_config.h>
62 #include <sys/callb.h>
63 #include <sys/mem_cage.h>
64 #include <sys/sdt.h>
65 #include <sys/dumphdr.h>
66 #include <sys/swap.h>
67 
extern uint_t	vac_colors;

/* Upper bound on the alignment requested via "#pragma align" below. */
#define	MAX_PRAGMA_ALIGN	128

/*
 * vm_cpu_data0 is the statically allocated buffer that cpu_vm_data_init()
 * hands to the boot cpu (CPU0) before kmem is initialized.  It is aligned
 * to L2CACHE_ALIGN_MAX, capped at MAX_PRAGMA_ALIGN.
 */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
80 
/*
 * Number of page colors equivalent to the requested color in the page_get
 * routines.  If set, keeps large pages intact longer and keeps MPO
 * allocation from the local mnode in favor of acquiring the 'correct' page
 * color from a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * Color equivalency mask for each page size.
 * The mask is computed based on cpu L2$ way sizes and the colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int	ptcpthreshold;
108 
109 /*
110  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
111  * Enabled by default via pgcplimitsearch.
112  *
113  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
114  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
115  * bound. This upper bound range guarantees:
116  *    - all large page 'slots' will be searched over time
117  *    - the minimum (1) large page candidates considered on each pgcp call
118  *    - count doesn't wrap around to 0
119  */
120 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
121 int	pgcplimitsearch = 1;
122 
123 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
124 #define	SETPGCPFAILCNT(szc)						\
125 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
126 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
127 
#ifdef VM_STATS
/* VM statistics; only compiled in when VM_STATS is defined. */
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

/*
 * LPGCREATE is the initial value of pg_lpgcreate_nocage below:
 * 0 on sparc, 1 on every other platform.
 */
#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

/*
 * NOTE(review): the consumers of these two tunables are not in this part of
 * the file.  Going by the names, pg_contig_disable presumably turns off
 * page_get_contig_pages when non-zero -- confirm against the callers.
 */
int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;
142 
/*
 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 */
#define	PFNNULL		0

/*
 * Flags involved in promotion and demotion routines (page_promote and
 * page_demote).
 */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)
160 
/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 *	pad:			unused filler
 *				NOTE(review): presumably sizes the structure
 *				to limit cache line sharing between entries
 *				-- confirm.
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
	uint_t	pad[12];
} pcc_info_t;
173 
174 /*
175  * On big machines it can take a long time to check page_counters
176  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
177  * updated sum of all elements of the corresponding page_counters arrays.
178  * page_freelist_coalesce() searches page_counters only if an appropriate
179  * element of page_ctrs_cands array is greater than 0.
180  *
181  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
182  */
183 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
184 
185 /*
186  * Return in val the total number of free pages which can be created
187  * for the given mnode (m), mrange (g), and region size (r)
188  */
189 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
190 	int i;								\
191 	val = 0;							\
192 	for (i = 0; i < NPC_MUTEX; i++) {				\
193 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
194 	}								\
195 }
196 
197 /*
198  * Return in val the total number of free pages which can be created
199  * for the given mnode (m), mrange (g), region size (r), and color (c)
200  */
201 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
202 	int i;								\
203 	val = 0;							\
204 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
205 	for (i = 0; i < NPC_MUTEX; i++) {				\
206 	    val +=							\
207 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
208 	}								\
209 }
210 
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 *
 * ctr_mutex[i] points at an array of max_mem_nodes mutexes (set up in
 * page_ctrs_alloc), so a lock is selected as ctr_mutex[lckidx][mnode].
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

/*
 * The mask with (NPC_MUTEX - 1) assumes NPC_MUTEX is a power of two.
 */
#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

/* Sentinel values meaning "no color" / "no mask". */
#define	INVALID_COLOR 0xffffffff
#define	INVALID_MASK  0xffffffff
226 
/*
 * Function prototypes.
 *
 * The page_ctr_* routines come in pairs: the plain version takes the
 * ctr_mutex lock around a call to the matching *_internal version, which
 * expects the caller to handle locking.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);
243 
244 /*
245  * The page_counters array below is used to keep track of free contiguous
246  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
247  * This contains an array of counters, the size of the array, a shift value
248  * used to convert a pagenum into a counter array index or vice versa, as
249  * well as a cache of the last successful index to be promoted to a larger
250  * page size.  As an optimization, we keep track of the last successful index
251  * to be promoted per page color for the given size region, and this is
252  * allocated dynamically based upon the number of colors for a given
253  * region size.
254  *
255  * Conceptually, the page counters are represented as:
256  *
257  *	page_counters[region_size][mnode]
258  *
259  *	region_size:	size code of a candidate larger page made up
260  *			of contiguous free smaller pages.
261  *
262  *	page_counters[region_size][mnode].hpm_counters[index]:
263  *		represents how many (region_size - 1) pages either
264  *		exist or can be created within the given index range.
265  *
266  * Let's look at a sparc example:
267  *	If we want to create a free 512k page, we look at region_size 2
268  *	for the mnode we want.  We calculate the index and look at a specific
269  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
270  *	this location, it means that 8 64k pages either exist or can be created
271  *	from 8K pages in order to make a single free 512k page at the given
272  *	index.  Note that when a region is full, it will contribute to the
273  *	counts in the region above it.  Thus we will not know what page
274  *	size the free pages will be which can be promoted to this new free
275  *	page unless we look at all regions below the current region.
276  */
277 
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page;
 *				one array of per-color indices for each
 *				mrange of the mnode
 *	pad (sparc only):	unused filler
 *				NOTE(review): presumably there to size or
 *				align the structure -- confirm.
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
#if defined(__sparc)
	uint_t		pad[4];
#endif
} hw_page_map_t;
300 
/*
 * page_counters[r] points at an array of max_mem_nodes hw_page_map_t,
 * one per mnode, for region size r (see page_ctrs_alloc).
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached values of MNODE_RANGE_CNT(mnode) and MNODE_MAX_MRANGE(mnode),
 * filled in by page_ctrs_sz().
 * MNODE_RANGE_CNT is a function call in x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];
312 
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */

/* counter at index idx for region size rg_szc in mnode */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

/* the hpm_counters array pointer itself */
#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

/* shift used to convert between a pfn and a counter index */
#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

/* number of entries in the hpm_counters array */
#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

/* pfn that maps to counter index 0 */
#define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

/* per-color "current index" array for mrange g */
#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

/* one element of the per-color "current index" array */
#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

/* convert a pfn into its counter index for region size rg_szc */
#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

/* convert a counter index back into the first pfn it covers */
#define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
347 
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 *
 * One lock per possible mnode; all MAX_MEM_NODES locks are initialized at
 * the end of page_ctrs_alloc().
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
358 
359 
360 /*
361  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
362  */
363 void
364 cpu_vm_data_init(struct cpu *cp)
365 {
366 	if (cp == CPU0) {
367 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
368 	} else {
369 		void	*kmptr;
370 		int	align;
371 		size_t	sz;
372 
373 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
374 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
375 		kmptr = kmem_zalloc(sz, KM_SLEEP);
376 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
377 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
378 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
379 	}
380 }
381 
382 /*
383  * free cpu_vm_data
384  */
385 void
386 cpu_vm_data_destroy(struct cpu *cp)
387 {
388 	if (cp->cpu_seqid && cp->cpu_vm_data) {
389 		ASSERT(cp != CPU0);
390 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
391 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
392 	}
393 	cp->cpu_vm_data = NULL;
394 }
395 
396 
397 /*
398  * page size to page size code
399  */
400 int
401 page_szc(size_t pagesize)
402 {
403 	int	i = 0;
404 
405 	while (hw_page_array[i].hp_size) {
406 		if (pagesize == hw_page_array[i].hp_size)
407 			return (i);
408 		i++;
409 	}
410 	return (-1);
411 }
412 
413 /*
414  * page size to page size code with the restriction that it be a supported
415  * user page size.  If it's not a supported user page size, -1 will be returned.
416  */
417 int
418 page_szc_user_filtered(size_t pagesize)
419 {
420 	int szc = page_szc(pagesize);
421 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
422 		return (szc);
423 	}
424 	return (-1);
425 }
426 
427 /*
428  * Return how many page sizes are available for the user to use.  This is
429  * what the hardware supports and not based upon how the OS implements the
430  * support of different page sizes.
431  *
432  * If legacy is non-zero, return the number of pagesizes available to legacy
433  * applications. The number of legacy page sizes might be less than the
434  * exported user page sizes. This is to prevent legacy applications that
435  * use the largest page size returned from getpagesizes(3c) from inadvertantly
436  * using the 'new' large pagesizes.
437  */
438 uint_t
439 page_num_user_pagesizes(int legacy)
440 {
441 	if (legacy)
442 		return (mmu_legacy_page_sizes);
443 	return (mmu_exported_page_sizes);
444 }
445 
/*
 * Return the value of mmu_page_sizes, which the other routines in this
 * file use as the exclusive upper bound for a valid page size code.
 */
uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}
451 
452 /*
453  * returns the count of the number of base pagesize pages associated with szc
454  */
455 pgcnt_t
456 page_get_pagecnt(uint_t szc)
457 {
458 	if (szc >= mmu_page_sizes)
459 		panic("page_get_pagecnt: out of range %d", szc);
460 	return (hw_page_array[szc].hp_pgcnt);
461 }
462 
463 size_t
464 page_get_pagesize(uint_t szc)
465 {
466 	if (szc >= mmu_page_sizes)
467 		panic("page_get_pagesize: out of range %d", szc);
468 	return (hw_page_array[szc].hp_size);
469 }
470 
471 /*
472  * Return the size of a page based upon the index passed in.  An index of
473  * zero refers to the smallest page size in the system, and as index increases
474  * it refers to the next larger supported page size in the system.
475  * Note that szc and userszc may not be the same due to unsupported szc's on
476  * some systems.
477  */
478 size_t
479 page_get_user_pagesize(uint_t userszc)
480 {
481 	uint_t szc = USERSZC_2_SZC(userszc);
482 
483 	if (szc >= mmu_page_sizes)
484 		panic("page_get_user_pagesize: out of range %d", szc);
485 	return (hw_page_array[szc].hp_size);
486 }
487 
488 uint_t
489 page_get_shift(uint_t szc)
490 {
491 	if (szc >= mmu_page_sizes)
492 		panic("page_get_shift: out of range %d", szc);
493 	return (PAGE_GET_SHIFT(szc));
494 }
495 
496 uint_t
497 page_get_pagecolors(uint_t szc)
498 {
499 	if (szc >= mmu_page_sizes)
500 		panic("page_get_pagecolors: out of range %d", szc);
501 	return (PAGE_GET_PAGECOLORS(szc));
502 }
503 
504 /*
505  * this assigns the desired equivalent color after a split
506  */
507 uint_t
508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
509     uint_t ncolor, uint_t ceq_mask)
510 {
511 	ASSERT(nszc > szc);
512 	ASSERT(szc < mmu_page_sizes);
513 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
514 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
515 
516 	color &= ceq_mask;
517 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
518 	return (color | (ncolor & ~ceq_mask));
519 }
520 
521 /*
522  * The interleaved_mnodes flag is set when mnodes overlap in
523  * the physbase..physmax range, but have disjoint slices.
524  * In this case hpm_counters is shared by all mnodes.
525  * This flag is set dynamically by the platform.
526  */
527 int interleaved_mnodes = 0;
528 
529 /*
530  * Called by startup().
531  * Size up the per page size free list counters based on physmax
532  * of each node and max_mem_nodes.
533  *
534  * If interleaved_mnodes is set we need to find the first mnode that
535  * exists. hpm_counters for the first mnode will then be shared by
536  * all other mnodes. If interleaved_mnodes is not set, just set
537  * first=mnode each time. That means there will be no sharing.
538  */
539 size_t
540 page_ctrs_sz(void)
541 {
542 	int	r;		/* region size */
543 	int	mnode;
544 	int	firstmn;	/* first mnode that exists */
545 	int	nranges;
546 	pfn_t	physbase;
547 	pfn_t	physmax;
548 	uint_t	ctrs_sz = 0;
549 	int 	i;
550 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
551 
552 	/*
553 	 * We need to determine how many page colors there are for each
554 	 * page size in order to allocate memory for any color specific
555 	 * arrays.
556 	 */
557 	for (i = 0; i < mmu_page_sizes; i++) {
558 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
559 	}
560 
561 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
562 
563 		pgcnt_t r_pgcnt;
564 		pfn_t   r_base;
565 		pgcnt_t r_align;
566 
567 		if (mem_node_config[mnode].exists == 0)
568 			continue;
569 
570 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
571 		nranges = MNODE_RANGE_CNT(mnode);
572 		mnode_nranges[mnode] = nranges;
573 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
574 
575 		/*
576 		 * determine size needed for page counter arrays with
577 		 * base aligned to large page size.
578 		 */
579 		for (r = 1; r < mmu_page_sizes; r++) {
580 			/* add in space for hpm_color_current */
581 			ctrs_sz += sizeof (size_t) *
582 			    colors_per_szc[r] * nranges;
583 
584 			if (firstmn != mnode)
585 				continue;
586 
587 			/* add in space for hpm_counters */
588 			r_align = page_get_pagecnt(r);
589 			r_base = physbase;
590 			r_base &= ~(r_align - 1);
591 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
592 
593 			/*
594 			 * Round up to always allocate on pointer sized
595 			 * boundaries.
596 			 */
597 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
598 			    sizeof (hpmctr_t *));
599 		}
600 	}
601 
602 	for (r = 1; r < mmu_page_sizes; r++) {
603 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
604 	}
605 
606 	/* add in space for page_ctrs_cands and pcc_color_free */
607 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
608 	    mmu_page_sizes * NPC_MUTEX;
609 
610 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
611 
612 		if (mem_node_config[mnode].exists == 0)
613 			continue;
614 
615 		nranges = mnode_nranges[mnode];
616 		ctrs_sz += sizeof (pcc_info_t) * nranges *
617 		    mmu_page_sizes * NPC_MUTEX;
618 		for (r = 1; r < mmu_page_sizes; r++) {
619 			ctrs_sz += sizeof (pgcnt_t) * nranges *
620 			    colors_per_szc[r] * NPC_MUTEX;
621 		}
622 	}
623 
624 	/* ctr_mutex */
625 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
626 
627 	/* size for page list counts */
628 	PLCNT_SZ(ctrs_sz);
629 
630 	/*
631 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
632 	 * address of the counters to ecache_alignsize boundary for every
633 	 * memory node.
634 	 */
635 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
636 }
637 
/*
 * Carve the page counter data structures out of the memory starting at
 * alloc_base and return the first address past what was consumed.
 *
 * The layout mirrors the accounting done in page_ctrs_sz(): the
 * hw_page_map_t arrays, page_ctrs_cands with its pcc_info_t and
 * pcc_color_free arrays, the ctr_mutex arrays, the page list counts
 * (PLCNT_INIT), and finally, per mnode, the hpm_color_current arrays and
 * the hpm_counters arrays.  mnode_nranges[] must already have been filled
 * in by page_ctrs_sz().
 *
 * NOTE(review): nothing here clears the counters or pcc_pages_free, so the
 * memory at alloc_base is presumably expected to be zeroed by the caller --
 * confirm.
 */
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	/* one hw_page_map_t per mnode for every region size above zero */
	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t *pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				/* one pcc_info_t per mrange of this mnode */
				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				/* each pcc_info_t gets its own color array */
				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		/*
		 * NOTE(review): this nranges, and the mrange declared in
		 * the color loop below, shadow the function-level variables
		 * of the same name.
		 */
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		/* sets physbase, physmax and firstmn for this mnode */
		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			/* one hpm_color_current array per mrange */
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			/*
			 * Seed the "current index" of every color with the
			 * index of the first pfn of that color at or after
			 * r_base, or 0 if there is none within the array.
			 */
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(mnode, r, pfnum);
					idx = (idx >= r_pgcnt) ? 0 : idx;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}
799 
800 /*
801  * Functions to adjust region counters for each size free list.
802  * Caller is responsible to acquire the ctr_mutex lock if necessary and
803  * thus can be called during startup without locks.
804  */
805 /* ARGSUSED */
806 void
807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
808 {
809 	ssize_t		r;	/* region size */
810 	ssize_t		idx;
811 	pfn_t		pfnum;
812 	int		lckidx;
813 
814 	ASSERT(mnode == PP_2_MEM_NODE(pp));
815 	ASSERT(mtype == PP_2_MTYPE(pp));
816 
817 	ASSERT(pp->p_szc < mmu_page_sizes);
818 
819 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
820 
821 	/* no counter update needed for largest page size */
822 	if (pp->p_szc >= mmu_page_sizes - 1) {
823 		return;
824 	}
825 
826 	r = pp->p_szc + 1;
827 	pfnum = pp->p_pagenum;
828 	lckidx = PP_CTR_LOCK_INDX(pp);
829 
830 	/*
831 	 * Increment the count of free pages for the current
832 	 * region. Continue looping up in region size incrementing
833 	 * count if the preceeding region is full.
834 	 */
835 	while (r < mmu_page_sizes) {
836 		idx = PNUM_TO_IDX(mnode, r, pfnum);
837 
838 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
839 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
840 
841 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
842 			break;
843 		} else {
844 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
845 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
846 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
847 
848 			cand->pcc_pages_free++;
849 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
850 		}
851 		r++;
852 	}
853 }
854 
855 void
856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
857 {
858 	int		lckidx = PP_CTR_LOCK_INDX(pp);
859 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
860 
861 	mutex_enter(lock);
862 	page_ctr_add_internal(mnode, mtype, pp, flags);
863 	mutex_exit(lock);
864 }
865 
866 void
867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
868 {
869 	int		lckidx;
870 	ssize_t		r;	/* region size */
871 	ssize_t		idx;
872 	pfn_t		pfnum;
873 
874 	ASSERT(mnode == PP_2_MEM_NODE(pp));
875 	ASSERT(mtype == PP_2_MTYPE(pp));
876 
877 	ASSERT(pp->p_szc < mmu_page_sizes);
878 
879 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
880 
881 	/* no counter update needed for largest page size */
882 	if (pp->p_szc >= mmu_page_sizes - 1) {
883 		return;
884 	}
885 
886 	r = pp->p_szc + 1;
887 	pfnum = pp->p_pagenum;
888 	lckidx = PP_CTR_LOCK_INDX(pp);
889 
890 	/*
891 	 * Decrement the count of free pages for the current
892 	 * region. Continue looping up in region size decrementing
893 	 * count if the preceeding region was full.
894 	 */
895 	while (r < mmu_page_sizes) {
896 		idx = PNUM_TO_IDX(mnode, r, pfnum);
897 
898 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
899 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
900 
901 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
902 			break;
903 		} else {
904 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
905 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
906 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
907 
908 			ASSERT(cand->pcc_pages_free != 0);
909 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
910 
911 			cand->pcc_pages_free--;
912 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
913 		}
914 		r++;
915 	}
916 }
917 
918 void
919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
920 {
921 	int		lckidx = PP_CTR_LOCK_INDX(pp);
922 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
923 
924 	mutex_enter(lock);
925 	page_ctr_sub_internal(mnode, mtype, pp, flags);
926 	mutex_exit(lock);
927 }
928 
929 /*
930  * Adjust page counters following a memory attach, since typically the
931  * size of the array needs to change, and the PFN to counter index
932  * mapping needs to change.
933  *
934  * It is possible this mnode did not exist at startup. In that case
935  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
936  * to change (a theoretical possibility on x86), which means pcc_color_free
937  * arrays must be extended.
 *
 * The work is done in three phases: preallocate every new array with
 * KM_NOSLEEP, install the new arrays while holding the page counters
 * write lock (remembering the arrays they replace), and finally free
 * whatever is left in the local caches once the lock has been dropped.
 *
 * Returns 0 on success, or ENOMEM if any preallocation fails; on that
 * path nothing has been installed and every buffer allocated so far is
 * released by the common cleanup code.
938  */
939 uint_t
940 page_ctrs_adjust(int mnode)
941 {
942 	pgcnt_t npgs;
943 	int	r;		/* region size */
944 	int	i;
945 	size_t	pcsz, old_csz;
946 	hpmctr_t *new_ctr, *old_ctr;
947 	pfn_t	oldbase, newbase;
948 	pfn_t	physbase, physmax;
949 	size_t	old_npgs;
950 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
951 	size_t	size_cache[MMU_PAGE_SIZES];
952 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
953 	size_t	*old_color_array[MAX_MNODE_MRANGES];
954 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
955 	pcc_info_t **cands_cache;
956 	pcc_info_t *old_pi, *pi;
957 	pgcnt_t *pgcntp;
958 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
959 	int cands_cache_nranges;
960 	int old_maxmrange, new_maxmrange;
961 	int rc = 0;
962 	int oldmnode;
963 
	/*
	 * cands_cache is a flat [NPC_MUTEX][MMU_PAGE_SIZES] table of
	 * pcc_info_t array pointers, indexed as i * MMU_PAGE_SIZES + r.
	 */
964 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
965 	    MMU_PAGE_SIZES, KM_NOSLEEP);
966 	if (cands_cache == NULL)
967 		return (ENOMEM);
968 
969 	i = -1;
970 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
971 
972 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
973 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
974 
975 	/* prepare to free non-null pointers on the way out */
976 	cands_cache_nranges = nranges;
977 	bzero(ctr_cache, sizeof (ctr_cache));
978 	bzero(color_cache, sizeof (color_cache));
979 
980 	/*
981 	 * We need to determine how many page colors there are for each
982 	 * page size in order to allocate memory for any color specific
983 	 * arrays.
984 	 */
985 	for (r = 0; r < mmu_page_sizes; r++) {
986 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
987 	}
988 
989 	/*
990 	 * Preallocate all of the new hpm_counters arrays as we can't
991 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
992 	 * If we can't allocate all of the arrays, undo our work so far
993 	 * and return failure.
994 	 */
995 	for (r = 1; r < mmu_page_sizes; r++) {
996 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
997 		size_cache[r] = pcsz;
998 		ctr_cache[r] = kmem_zalloc(pcsz *
999 		    sizeof (hpmctr_t), KM_NOSLEEP);
1000 		if (ctr_cache[r] == NULL) {
1001 			rc = ENOMEM;
1002 			goto cleanup;
1003 		}
1004 	}
1005 
1006 	/*
1007 	 * Preallocate all of the new color current arrays as we can't
1008 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1009 	 * If we can't allocate all of the arrays, undo our work so far
1010 	 * and return failure.
1011 	 */
1012 	for (r = 1; r < mmu_page_sizes; r++) {
1013 		for (mrange = 0; mrange < nranges; mrange++) {
1014 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1015 			    colors_per_szc[r], KM_NOSLEEP);
1016 			if (color_cache[r][mrange] == NULL) {
1017 				rc = ENOMEM;
1018 				goto cleanup;
1019 			}
1020 		}
1021 	}
1022 
1023 	/*
1024 	 * Preallocate all of the new pcc_info_t arrays as we can't
1025 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1026 	 * If we can't allocate all of the arrays, undo our work so far
1027 	 * and return failure.
1028 	 */
1029 	for (r = 1; r < mmu_page_sizes; r++) {
1030 		for (i = 0; i < NPC_MUTEX; i++) {
1031 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1032 			    KM_NOSLEEP);
1033 			if (pi == NULL) {
1034 				rc = ENOMEM;
1035 				goto cleanup;
1036 			}
			/*
			 * Record the array before filling it in so that a
			 * later allocation failure still lets cleanup find
			 * and free it.
			 */
1037 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1038 
1039 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
1040 				pgcntp = kmem_zalloc(colors_per_szc[r] *
1041 				    sizeof (pgcnt_t), KM_NOSLEEP);
1042 				if (pgcntp == NULL) {
1043 					rc = ENOMEM;
1044 					goto cleanup;
1045 				}
1046 				pi->pcc_color_free = pgcntp;
1047 			}
1048 		}
1049 	}
1050 
1051 	/*
1052 	 * Grab the write lock to prevent others from walking these arrays
1053 	 * while we are modifying them.
1054 	 */
1055 	PAGE_CTRS_WRITE_LOCK(mnode);
1056 
1057 	/*
1058 	 * For interleaved mnodes, find the first mnode
1059 	 * with valid page counters since the current
1060 	 * mnode may have just been added and not have
1061 	 * valid page counters.
1062 	 */
1063 	if (interleaved_mnodes) {
1064 		for (i = 0; i < max_mem_nodes; i++)
1065 			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1066 				break;
1067 		ASSERT(i < max_mem_nodes);
1068 		oldmnode = i;
1069 	} else
1070 		oldmnode = mnode;
1071 
1072 	old_nranges = mnode_nranges[mnode];
	/*
	 * Past this point cands_cache[] ends up holding the old pcc_info_t
	 * arrays (swapped in below), which have old_nranges entries, so
	 * the cleanup code must size its frees with the old count.
	 */
1073 	cands_cache_nranges = old_nranges;
1074 	mnode_nranges[mnode] = nranges;
1075 	old_maxmrange = mnode_maxmrange[mnode];
1076 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1077 	new_maxmrange = mnode_maxmrange[mnode];
1078 
1079 	for (r = 1; r < mmu_page_sizes; r++) {
1080 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1081 		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1082 		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1083 		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1084 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1085 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1086 			old_color_array[mrange] =
1087 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1088 			    r, mrange);
1089 		}
1090 
1091 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		/*
		 * Take ownership of the preallocated array and clear its
		 * cache slot so that cleanup does not free the live array.
		 */
1092 		new_ctr = ctr_cache[r];
1093 		ctr_cache[r] = NULL;
		/* copy old counters only if the old and new pfn ranges overlap */
1094 		if (old_ctr != NULL &&
1095 		    (oldbase + old_npgs > newbase) &&
1096 		    (newbase + npgs > oldbase)) {
1097 			/*
1098 			 * Map the intersection of the old and new
1099 			 * counters into the new array.
1100 			 */
1101 			size_t offset;
1102 			if (newbase > oldbase) {
1103 				offset = (newbase - oldbase) >>
1104 				    PAGE_COUNTERS_SHIFT(mnode, r);
1105 				bcopy(old_ctr + offset, new_ctr,
1106 				    MIN(pcsz, (old_csz - offset)) *
1107 				    sizeof (hpmctr_t));
1108 			} else {
1109 				offset = (oldbase - newbase) >>
1110 				    PAGE_COUNTERS_SHIFT(mnode, r);
1111 				bcopy(old_ctr, new_ctr + offset,
1112 				    MIN(pcsz - offset, old_csz) *
1113 				    sizeof (hpmctr_t));
1114 			}
1115 		}
1116 
1117 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1118 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1119 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1120 
1121 		/* update shared hpm_counters in other mnodes */
1122 		if (interleaved_mnodes) {
1123 			for (i = 0; i < max_mem_nodes; i++) {
1124 				if ((i == mnode) ||
1125 				    (mem_node_config[i].exists == 0))
1126 					continue;
1127 				ASSERT(
1128 				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1129 				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1130 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1131 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1132 				PAGE_COUNTERS_BASE(i, r) = newbase;
1133 			}
1134 		}
1135 
		/*
		 * Install the new current-color arrays.  Slots at or above
		 * nranges were never allocated, so those entries become NULL.
		 */
1136 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1137 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1138 			    color_cache[r][mrange];
1139 			color_cache[r][mrange] = NULL;
1140 		}
1141 		/*
1142 		 * for now, just reset on these events as it's probably
1143 		 * not worthwhile to try and optimize this.
1144 		 */
1145 		for (i = 0; i < colors_per_szc[r]; i++) {
1146 			uint_t color_mask = colors_per_szc[r] - 1;
1147 			int mlo = interleaved_mnodes ? 0 : mnode;
1148 			int mhi = interleaved_mnodes ? max_mem_nodes :
1149 			    (mnode + 1);
1150 			int m;
1151 			pfn_t  pfnum;
1152 			size_t idx;
1153 			MEM_NODE_ITERATOR_DECL(it);
1154 
1155 			for (m = mlo; m < mhi; m++) {
1156 				if (mem_node_config[m].exists == 0)
1157 					continue;
1158 				pfnum = newbase;
1159 				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1160 				if (pfnum == (pfn_t)-1) {
1161 					idx = 0;
1162 				} else {
1163 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1164 					    color_mask, color_mask, &it);
1165 					idx = PNUM_TO_IDX(m, r, pfnum);
					/* out-of-range index falls back to 0 */
1166 					idx = (idx < pcsz) ? idx : 0;
1167 				}
1168 				for (mrange = 0; mrange < nranges; mrange++) {
1169 					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1170 					    r, mrange) != NULL)
1171 						PAGE_COUNTERS_CURRENT_COLOR(m,
1172 						    r, i, mrange) = idx;
1173 				}
1174 			}
1175 		}
1176 
1177 		/* cache info for freeing out of the critical path */
		/*
		 * Only pointers inside [kernelheap, ekernelheap) are queued
		 * for kmem_free().
		 * NOTE(review): anything outside that range was presumably
		 * allocated at boot and is deliberately left alone -- confirm.
		 */
1178 		if ((caddr_t)old_ctr >= kernelheap &&
1179 		    (caddr_t)old_ctr < ekernelheap) {
1180 			ctr_cache[r] = old_ctr;
1181 			size_cache[r] = old_csz;
1182 		}
1183 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1184 			size_t *tmp = old_color_array[mrange];
1185 			if ((caddr_t)tmp >= kernelheap &&
1186 			    (caddr_t)tmp < ekernelheap) {
1187 				color_cache[r][mrange] = tmp;
1188 			}
1189 		}
1190 		/*
1191 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1192 		 * satisfy the identity requirement.
1193 		 * We should be able to go from one to the other
1194 		 * and get consistent values.
1195 		 */
1196 		ASSERT(PNUM_TO_IDX(mnode, r,
1197 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
1198 		ASSERT(IDX_TO_PNUM(mnode, r,
1199 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1200 
1201 		/* pcc_info_t and pcc_color_free */
1202 		for (i = 0; i < NPC_MUTEX; i++) {
1203 			pcc_info_t *epi;
1204 			pcc_info_t *eold_pi;
1205 
			/*
			 * Install the new array and park the old one in
			 * cands_cache[] so that cleanup frees it.
			 */
1206 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1207 			old_pi = page_ctrs_cands[i][r][mnode];
1208 			page_ctrs_cands[i][r][mnode] = pi;
1209 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1210 
1211 			/* preserve old pcc_color_free values, if any */
1212 			if (old_pi == NULL)
1213 				continue;
1214 
1215 			/*
1216 			 * when/if x86 does DR, must account for
1217 			 * possible change in range index when
1218 			 * preserving pcc_info
1219 			 */
1220 			epi = &pi[nranges];
1221 			eold_pi = &old_pi[old_nranges];
1222 			if (new_maxmrange > old_maxmrange) {
1223 				pi += new_maxmrange - old_maxmrange;
1224 			} else if (new_maxmrange < old_maxmrange) {
1225 				old_pi += old_maxmrange - new_maxmrange;
1226 			}
			/*
			 * Swap the overlapping entries: the installed array
			 * inherits the old pcc_info values, while the old
			 * array receives the freshly allocated pcc_color_free
			 * buffers, which cleanup then frees along with it.
			 */
1227 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1228 				pcc_info_t tmp = *pi;
1229 				*pi = *old_pi;
1230 				*old_pi = tmp;
1231 			}
1232 		}
1233 	}
1234 	PAGE_CTRS_WRITE_UNLOCK(mnode);
1235 
1236 	/*
1237 	 * Now that we have dropped the write lock, it is safe to free all
1238 	 * of the memory we have cached above.
1239 	 * We come thru here to free memory when pre-alloc fails, and also to
1240 	 * free old pointers which were recorded while locked.
1241 	 */
1242 cleanup:
1243 	for (r = 1; r < mmu_page_sizes; r++) {
1244 		if (ctr_cache[r] != NULL) {
1245 			kmem_free(ctr_cache[r],
1246 			    size_cache[r] * sizeof (hpmctr_t));
1247 		}
1248 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1249 			if (color_cache[r][mrange] != NULL) {
1250 				kmem_free(color_cache[r][mrange],
1251 				    colors_per_szc[r] * sizeof (size_t));
1252 			}
1253 		}
1254 		for (i = 0; i < NPC_MUTEX; i++) {
1255 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1256 			if (pi == NULL)
1257 				continue;
			/*
			 * nranges on the prealloc-failure path, old_nranges
			 * once the arrays have been swapped under the lock.
			 */
1258 			nr = cands_cache_nranges;
1259 			for (mrange = 0; mrange < nr; mrange++, pi++) {
1260 				pgcntp = pi->pcc_color_free;
1261 				if (pgcntp == NULL)
1262 					continue;
1263 				if ((caddr_t)pgcntp >= kernelheap &&
1264 				    (caddr_t)pgcntp < ekernelheap) {
1265 					kmem_free(pgcntp,
1266 					    colors_per_szc[r] *
1267 					    sizeof (pgcnt_t));
1268 				}
1269 			}
			/* pi was advanced by the loop above; reload the base */
1270 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1271 			if ((caddr_t)pi >= kernelheap &&
1272 			    (caddr_t)pi < ekernelheap) {
1273 				kmem_free(pi, nr * sizeof (pcc_info_t));
1274 			}
1275 		}
1276 	}
1277 
1278 	kmem_free(cands_cache,
1279 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1280 	return (rc);
1281 }
1282 
1283 /*
1284  * Cleanup the hpm_counters field in the page counters
1285  * array.
1286  */
1287 void
1288 page_ctrs_cleanup(void)
1289 {
1290 	int r;	/* region size */
1291 	int i;	/* mnode index */
1292 
1293 	/*
1294 	 * Get the page counters write lock while we are
1295 	 * setting the page hpm_counters field to NULL
1296 	 * for non-existent mnodes.
1297 	 */
1298 	for (i = 0; i < max_mem_nodes; i++) {
1299 		PAGE_CTRS_WRITE_LOCK(i);
1300 		if (mem_node_config[i].exists) {
1301 			PAGE_CTRS_WRITE_UNLOCK(i);
1302 			continue;
1303 		}
1304 		for (r = 1; r < mmu_page_sizes; r++) {
1305 			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1306 		}
1307 		PAGE_CTRS_WRITE_UNLOCK(i);
1308 	}
1309 }
1310 
1311 #ifdef DEBUG
1312 
1313 /*
1314  * confirm pp is a large page corresponding to szc
1315  */
1316 void
1317 chk_lpg(page_t *pp, uchar_t szc)
1318 {
1319 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1320 	uint_t noreloc;
1321 
1322 	if (npgs == 1) {
1323 		ASSERT(pp->p_szc == 0);
1324 		ASSERT(pp->p_next == pp);
1325 		ASSERT(pp->p_prev == pp);
1326 		return;
1327 	}
1328 
1329 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1330 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1331 
1332 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1333 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1334 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1335 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1336 
1337 	/*
1338 	 * Check list of pages.
1339 	 */
1340 	noreloc = PP_ISNORELOC(pp);
1341 	while (npgs--) {
1342 		if (npgs != 0) {
1343 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1344 			ASSERT(pp->p_next == (pp + 1));
1345 		}
1346 		ASSERT(pp->p_szc == szc);
1347 		ASSERT(PP_ISFREE(pp));
1348 		ASSERT(PP_ISAGED(pp));
1349 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1350 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1351 		ASSERT(pp->p_vnode  == NULL);
1352 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1353 
1354 		pp = pp->p_next;
1355 	}
1356 }
1357 #endif /* DEBUG */
1358 
1359 void
1360 page_freelist_lock(int mnode)
1361 {
1362 	int i;
1363 	for (i = 0; i < NPC_MUTEX; i++) {
1364 		mutex_enter(FPC_MUTEX(mnode, i));
1365 		mutex_enter(CPC_MUTEX(mnode, i));
1366 	}
1367 }
1368 
1369 void
1370 page_freelist_unlock(int mnode)
1371 {
1372 	int i;
1373 	for (i = 0; i < NPC_MUTEX; i++) {
1374 		mutex_exit(FPC_MUTEX(mnode, i));
1375 		mutex_exit(CPC_MUTEX(mnode, i));
1376 	}
1377 }
1378 
1379 /*
1380  * add pp to the specified page list. Defaults to head of the page list
1381  * unless PG_LIST_TAIL is specified.
1382  */
1383 void
1384 page_list_add(page_t *pp, int flags)
1385 {
1386 	page_t		**ppp;
1387 	kmutex_t	*pcm;
1388 	uint_t		bin, mtype;
1389 	int		mnode;
1390 
1391 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1392 	ASSERT(PP_ISFREE(pp));
1393 	ASSERT(!hat_page_is_mapped(pp));
1394 	ASSERT(hat_page_getshare(pp) == 0);
1395 
1396 	/*
1397 	 * Large pages should be freed via page_list_add_pages().
1398 	 */
1399 	ASSERT(pp->p_szc == 0);
1400 
1401 	/*
1402 	 * Don't need to lock the freelist first here
1403 	 * because the page isn't on the freelist yet.
1404 	 * This means p_szc can't change on us.
1405 	 */
1406 
1407 	bin = PP_2_BIN(pp);
1408 	mnode = PP_2_MEM_NODE(pp);
1409 	mtype = PP_2_MTYPE(pp);
1410 
1411 	if (flags & PG_LIST_ISINIT) {
1412 		/*
1413 		 * PG_LIST_ISINIT is set during system startup (ie. single
1414 		 * threaded), add a page to the free list and add to the
1415 		 * the free region counters w/o any locking
1416 		 */
1417 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1418 
1419 		/* inline version of page_add() */
1420 		if (*ppp != NULL) {
1421 			pp->p_next = *ppp;
1422 			pp->p_prev = (*ppp)->p_prev;
1423 			(*ppp)->p_prev = pp;
1424 			pp->p_prev->p_next = pp;
1425 		} else
1426 			*ppp = pp;
1427 
1428 		page_ctr_add_internal(mnode, mtype, pp, flags);
1429 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1430 	} else {
1431 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1432 
1433 		if (flags & PG_FREE_LIST) {
1434 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1435 			ASSERT(PP_ISAGED(pp));
1436 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1437 
1438 		} else {
1439 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1440 			ASSERT(pp->p_vnode);
1441 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1442 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1443 		}
1444 		mutex_enter(pcm);
1445 		page_add(ppp, pp);
1446 
1447 		if (flags & PG_LIST_TAIL)
1448 			*ppp = (*ppp)->p_next;
1449 		/*
1450 		 * Add counters before releasing pcm mutex to avoid a race with
1451 		 * page_freelist_coalesce and page_freelist_split.
1452 		 */
1453 		page_ctr_add(mnode, mtype, pp, flags);
1454 		mutex_exit(pcm);
1455 	}
1456 
1457 
1458 #if defined(__sparc)
1459 	if (PP_ISNORELOC(pp)) {
1460 		kcage_freemem_add(1);
1461 	}
1462 #endif
1463 	/*
1464 	 * It is up to the caller to unlock the page!
1465 	 */
1466 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1467 }
1468 
1469 
1470 #ifdef __sparc
1471 /*
1472  * This routine is only used by kcage_init during system startup.
1473  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1474  * without the overhead of taking locks and updating counters.
1475  */
1476 void
1477 page_list_noreloc_startup(page_t *pp)
1478 {
1479 	page_t		**ppp;
1480 	uint_t		bin;
1481 	int		mnode;
1482 	int		mtype;
1483 	int		flags = 0;
1484 
1485 	/*
1486 	 * If this is a large page on the freelist then
1487 	 * break it up into smaller pages.
1488 	 */
1489 	if (pp->p_szc != 0)
1490 		page_boot_demote(pp);
1491 
1492 	/*
1493 	 * Get list page is currently on.
1494 	 */
1495 	bin = PP_2_BIN(pp);
1496 	mnode = PP_2_MEM_NODE(pp);
1497 	mtype = PP_2_MTYPE(pp);
1498 	ASSERT(mtype == MTYPE_RELOC);
1499 	ASSERT(pp->p_szc == 0);
1500 
1501 	if (PP_ISAGED(pp)) {
1502 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1503 		flags |= PG_FREE_LIST;
1504 	} else {
1505 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1506 		flags |= PG_CACHE_LIST;
1507 	}
1508 
1509 	ASSERT(*ppp != NULL);
1510 
1511 	/*
1512 	 * Delete page from current list.
1513 	 */
1514 	if (*ppp == pp)
1515 		*ppp = pp->p_next;		/* go to next page */
1516 	if (*ppp == pp) {
1517 		*ppp = NULL;			/* page list is gone */
1518 	} else {
1519 		pp->p_prev->p_next = pp->p_next;
1520 		pp->p_next->p_prev = pp->p_prev;
1521 	}
1522 
1523 	/*
1524 	 * Decrement page counters
1525 	 */
1526 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1527 
1528 	/*
1529 	 * Set no reloc for cage initted pages.
1530 	 */
1531 	PP_SETNORELOC(pp);
1532 
1533 	mtype = PP_2_MTYPE(pp);
1534 	ASSERT(mtype == MTYPE_NORELOC);
1535 
1536 	/*
1537 	 * Get new list for page.
1538 	 */
1539 	if (PP_ISAGED(pp)) {
1540 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1541 	} else {
1542 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1543 	}
1544 
1545 	/*
1546 	 * Insert page on new list.
1547 	 */
1548 	if (*ppp == NULL) {
1549 		*ppp = pp;
1550 		pp->p_next = pp->p_prev = pp;
1551 	} else {
1552 		pp->p_next = *ppp;
1553 		pp->p_prev = (*ppp)->p_prev;
1554 		(*ppp)->p_prev = pp;
1555 		pp->p_prev->p_next = pp;
1556 	}
1557 
1558 	/*
1559 	 * Increment page counters
1560 	 */
1561 	page_ctr_add_internal(mnode, mtype, pp, flags);
1562 
1563 	/*
1564 	 * Update cage freemem counter
1565 	 */
1566 	atomic_inc_ulong(&kcage_freemem);
1567 }
1568 #else	/* __sparc */
1569 
1570 /* ARGSUSED */
1571 void
1572 page_list_noreloc_startup(page_t *pp)
1573 {
1574 	panic("page_list_noreloc_startup: should be here only for sparc");
1575 }
1576 #endif
1577 
1578 void
1579 page_list_add_pages(page_t *pp, int flags)
1580 {
1581 	kmutex_t *pcm;
1582 	pgcnt_t	pgcnt;
1583 	uint_t	bin, mtype, i;
1584 	int	mnode;
1585 
1586 	/* default to freelist/head */
1587 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1588 
1589 	CHK_LPG(pp, pp->p_szc);
1590 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1591 
1592 	bin = PP_2_BIN(pp);
1593 	mnode = PP_2_MEM_NODE(pp);
1594 	mtype = PP_2_MTYPE(pp);
1595 
1596 	if (flags & PG_LIST_ISINIT) {
1597 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1598 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1599 		ASSERT(!PP_ISNORELOC(pp));
1600 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1601 	} else {
1602 
1603 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1604 
1605 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1606 
1607 		mutex_enter(pcm);
1608 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1609 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1610 		mutex_exit(pcm);
1611 
1612 		pgcnt = page_get_pagecnt(pp->p_szc);
1613 #if defined(__sparc)
1614 		if (PP_ISNORELOC(pp))
1615 			kcage_freemem_add(pgcnt);
1616 #endif
1617 		for (i = 0; i < pgcnt; i++, pp++)
1618 			page_unlock_nocapture(pp);
1619 	}
1620 }
1621 
1622 /*
1623  * During boot, need to demote a large page to base
1624  * pagesize pages for seg_kmem for use in boot_alloc()
1625  */
1626 void
1627 page_boot_demote(page_t *pp)
1628 {
1629 	ASSERT(pp->p_szc != 0);
1630 	ASSERT(PP_ISFREE(pp));
1631 	ASSERT(PP_ISAGED(pp));
1632 
1633 	(void) page_demote(PP_2_MEM_NODE(pp),
1634 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1635 	    PC_FREE);
1636 
1637 	ASSERT(PP_ISFREE(pp));
1638 	ASSERT(PP_ISAGED(pp));
1639 	ASSERT(pp->p_szc == 0);
1640 }
1641 
1642 /*
1643  * Take a particular page off of whatever freelist the page
1644  * is claimed to be on.
1645  *
1646  * NOTE: Only used for PAGESIZE pages.
1647  */
1648 void
1649 page_list_sub(page_t *pp, int flags)
1650 {
1651 	int		bin;
1652 	uint_t		mtype;
1653 	int		mnode;
1654 	kmutex_t	*pcm;
1655 	page_t		**ppp;
1656 
1657 	ASSERT(PAGE_EXCL(pp));
1658 	ASSERT(PP_ISFREE(pp));
1659 
1660 	/*
1661 	 * The p_szc field can only be changed by page_promote()
1662 	 * and page_demote(). Only free pages can be promoted and
1663 	 * demoted and the free list MUST be locked during these
1664 	 * operations. So to prevent a race in page_list_sub()
1665 	 * between computing which bin of the freelist lock to
1666 	 * grab and actually grabing the lock we check again that
1667 	 * the bin we locked is still the correct one. Notice that
1668 	 * the p_szc field could have actually changed on us but
1669 	 * if the bin happens to still be the same we are safe.
1670 	 */
1671 try_again:
1672 	bin = PP_2_BIN(pp);
1673 	mnode = PP_2_MEM_NODE(pp);
1674 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1675 	mutex_enter(pcm);
1676 	if (PP_2_BIN(pp) != bin) {
1677 		mutex_exit(pcm);
1678 		goto try_again;
1679 	}
1680 	mtype = PP_2_MTYPE(pp);
1681 
1682 	if (flags & PG_FREE_LIST) {
1683 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1684 		ASSERT(PP_ISAGED(pp));
1685 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1686 	} else {
1687 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1688 		ASSERT(!PP_ISAGED(pp));
1689 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1690 	}
1691 
1692 	/*
1693 	 * Common PAGESIZE case.
1694 	 *
1695 	 * Note that we locked the freelist. This prevents
1696 	 * any page promotion/demotion operations. Therefore
1697 	 * the p_szc will not change until we drop pcm mutex.
1698 	 */
1699 	if (pp->p_szc == 0) {
1700 		page_sub(ppp, pp);
1701 		/*
1702 		 * Subtract counters before releasing pcm mutex
1703 		 * to avoid race with page_freelist_coalesce.
1704 		 */
1705 		page_ctr_sub(mnode, mtype, pp, flags);
1706 		mutex_exit(pcm);
1707 
1708 #if defined(__sparc)
1709 		if (PP_ISNORELOC(pp)) {
1710 			kcage_freemem_sub(1);
1711 		}
1712 #endif
1713 		return;
1714 	}
1715 
1716 	/*
1717 	 * Large pages on the cache list are not supported.
1718 	 */
1719 	if (flags & PG_CACHE_LIST)
1720 		panic("page_list_sub: large page on cachelist");
1721 
1722 	/*
1723 	 * Slow but rare.
1724 	 *
1725 	 * Somebody wants this particular page which is part
1726 	 * of a large page. In this case we just demote the page
1727 	 * if it's on the freelist.
1728 	 *
1729 	 * We have to drop pcm before locking the entire freelist.
1730 	 * Once we have re-locked the freelist check to make sure
1731 	 * the page hasn't already been demoted or completely
1732 	 * freed.
1733 	 */
1734 	mutex_exit(pcm);
1735 	page_freelist_lock(mnode);
1736 	if (pp->p_szc != 0) {
1737 		/*
1738 		 * Large page is on freelist.
1739 		 */
1740 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1741 		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1742 	}
1743 	ASSERT(PP_ISFREE(pp));
1744 	ASSERT(PP_ISAGED(pp));
1745 	ASSERT(pp->p_szc == 0);
1746 
1747 	/*
1748 	 * Subtract counters before releasing pcm mutex
1749 	 * to avoid race with page_freelist_coalesce.
1750 	 */
1751 	bin = PP_2_BIN(pp);
1752 	mtype = PP_2_MTYPE(pp);
1753 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1754 
1755 	page_sub(ppp, pp);
1756 	page_ctr_sub(mnode, mtype, pp, flags);
1757 	page_freelist_unlock(mnode);
1758 
1759 #if defined(__sparc)
1760 	if (PP_ISNORELOC(pp)) {
1761 		kcage_freemem_sub(1);
1762 	}
1763 #endif
1764 }
1765 
1766 void
1767 page_list_sub_pages(page_t *pp, uint_t szc)
1768 {
1769 	kmutex_t *pcm;
1770 	uint_t	bin, mtype;
1771 	int	mnode;
1772 
1773 	ASSERT(PAGE_EXCL(pp));
1774 	ASSERT(PP_ISFREE(pp));
1775 	ASSERT(PP_ISAGED(pp));
1776 
1777 	/*
1778 	 * See comment in page_list_sub().
1779 	 */
1780 try_again:
1781 	bin = PP_2_BIN(pp);
1782 	mnode = PP_2_MEM_NODE(pp);
1783 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1784 	mutex_enter(pcm);
1785 	if (PP_2_BIN(pp) != bin) {
1786 		mutex_exit(pcm);
1787 		goto	try_again;
1788 	}
1789 
1790 	/*
1791 	 * If we're called with a page larger than szc or it got
1792 	 * promoted above szc before we locked the freelist then
1793 	 * drop pcm and re-lock entire freelist. If page still larger
1794 	 * than szc then demote it.
1795 	 */
1796 	if (pp->p_szc > szc) {
1797 		mutex_exit(pcm);
1798 		pcm = NULL;
1799 		page_freelist_lock(mnode);
1800 		if (pp->p_szc > szc) {
1801 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1802 			(void) page_demote(mnode,
1803 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1804 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1805 		}
1806 		bin = PP_2_BIN(pp);
1807 	}
1808 	ASSERT(PP_ISFREE(pp));
1809 	ASSERT(PP_ISAGED(pp));
1810 	ASSERT(pp->p_szc <= szc);
1811 	ASSERT(pp == PP_PAGEROOT(pp));
1812 
1813 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1814 
1815 	mtype = PP_2_MTYPE(pp);
1816 	if (pp->p_szc != 0) {
1817 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818 		CHK_LPG(pp, pp->p_szc);
1819 	} else {
1820 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1821 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1822 	}
1823 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1824 
1825 	if (pcm != NULL) {
1826 		mutex_exit(pcm);
1827 	} else {
1828 		page_freelist_unlock(mnode);
1829 	}
1830 
1831 #if defined(__sparc)
1832 	if (PP_ISNORELOC(pp)) {
1833 		pgcnt_t	pgcnt;
1834 
1835 		pgcnt = page_get_pagecnt(pp->p_szc);
1836 		kcage_freemem_sub(pgcnt);
1837 	}
1838 #endif
1839 }
1840 
1841 /*
1842  * Add the page to the front of a linked list of pages
1843  * using the p_next & p_prev pointers for the list.
1844  * The caller is responsible for protecting the list pointers.
1845  */
1846 void
1847 mach_page_add(page_t **ppp, page_t *pp)
1848 {
1849 	if (*ppp == NULL) {
1850 		pp->p_next = pp->p_prev = pp;
1851 	} else {
1852 		pp->p_next = *ppp;
1853 		pp->p_prev = (*ppp)->p_prev;
1854 		(*ppp)->p_prev = pp;
1855 		pp->p_prev->p_next = pp;
1856 	}
1857 	*ppp = pp;
1858 }
1859 
1860 /*
1861  * Remove this page from a linked list of pages
1862  * using the p_next & p_prev pointers for the list.
1863  *
1864  * The caller is responsible for protecting the list pointers.
1865  */
1866 void
1867 mach_page_sub(page_t **ppp, page_t *pp)
1868 {
1869 	ASSERT(PP_ISFREE(pp));
1870 
1871 	if (*ppp == NULL || pp == NULL)
1872 		panic("mach_page_sub");
1873 
1874 	if (*ppp == pp)
1875 		*ppp = pp->p_next;		/* go to next page */
1876 
1877 	if (*ppp == pp)
1878 		*ppp = NULL;			/* page list is gone */
1879 	else {
1880 		pp->p_prev->p_next = pp->p_next;
1881 		pp->p_next->p_prev = pp->p_prev;
1882 	}
1883 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1884 }
1885 
1886 /*
1887  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1888  */
1889 void
1890 page_promote_size(page_t *pp, uint_t cur_szc)
1891 {
1892 	pfn_t pfn;
1893 	int mnode;
1894 	int idx;
1895 	int new_szc = cur_szc + 1;
1896 	int full = FULL_REGION_CNT(new_szc);
1897 
1898 	pfn = page_pptonum(pp);
1899 	mnode = PFN_2_MEM_NODE(pfn);
1900 
1901 	page_freelist_lock(mnode);
1902 
1903 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1904 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1905 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1906 
1907 	page_freelist_unlock(mnode);
1908 }
1909 
/*
 * Failure statistics maintained by page_promote().
 *
 * page_promote_err is bumped when a promotion is abandoned, either
 * because the constituent pages disagree on PP_ISNORELOC() or because
 * the attempt bails out through fail_promote.
 * page_promote_noreloc_err counts only the PP_ISNORELOC() mismatches.
 */
1910 static uint_t page_promote_err;
1911 static uint_t page_promote_noreloc_err;
1912 
1913 /*
1914  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1915  * for the given mnode starting at pfnum. Pages involved are on the freelist
1916  * before the call and may be returned to the caller if requested, otherwise
1917  * they will be placed back on the freelist.
1918  * If flags is PC_ALLOC, then the large page will be returned to the user in
1919  * a state which is consistent with a page being taken off the freelist.  If
1920  * we failed to lock the new large page, then we will return NULL to the
1921  * caller and put the large page on the freelist instead.
1922  * If flags is PC_FREE, then the large page will be placed on the freelist,
1923  * and NULL will be returned.
1924  * The caller is responsible for locking the freelist as well as any other
1925  * accounting which needs to be done for a returned page.
1926  *
1927  * RFE: For performance pass in pp instead of pfnum so
1928  * 	we can avoid excessive calls to page_numtopp_nolock().
1929  *	This would depend on an assumption that all contiguous
1930  *	pages are in the same memseg so we can just add/dec
1931  *	our pp.
1932  *
1933  * Lock ordering:
1934  *
1935  *	There is a potential but rare deadlock situation
1936  *	for page promotion and demotion operations. The problem
1937  *	is there are two paths into the freelist manager and
1938  *	they have different lock orders:
1939  *
1940  *	page_create()
1941  *		lock freelist
1942  *		page_lock(EXCL)
1943  *		unlock freelist
1944  *		return
1945  *		caller drops page_lock
1946  *
1947  *	page_free() and page_reclaim()
1948  *		caller grabs page_lock(EXCL)
1949  *
1950  *		lock freelist
1951  *		unlock freelist
1952  *		drop page_lock
1953  *
1954  *	What prevents a thread in page_create() from deadlocking
1955  *	with a thread freeing or reclaiming the same page is the
1956  *	page_trylock() in page_get_freelist(). If the trylock fails
1957  *	it skips the page.
1958  *
1959  *	The lock ordering for promotion and demotion is the same as
1960  *	for page_create(). Since the same deadlock could occur during
1961  *	page promotion and freeing or reclaiming of a page on the
1962  *	cache list we might have to fail the operation and undo what
1963  *	we have done so far. Again this is rare.
1964  */
1965 page_t *
1966 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1967 {
1968 	page_t		*pp, *pplist, *tpp, *start_pp;
1969 	pgcnt_t		new_npgs, npgs;
1970 	uint_t		bin;
1971 	pgcnt_t		tmpnpgs, pages_left;
1972 	uint_t		noreloc;
1973 	int 		which_list;
1974 	ulong_t		index;
1975 	kmutex_t	*phm;
1976 
1977 	/*
1978 	 * General algorithm:
1979 	 * Find the starting page
1980 	 * Walk each page struct removing it from the freelist,
1981 	 * and linking it to all the other pages removed.
1982 	 * Once all pages are off the freelist,
1983 	 * walk the list, modifying p_szc to new_szc and what
1984 	 * ever other info needs to be done to create a large free page.
1985 	 * According to the flags, either return the page or put it
1986 	 * on the freelist.
1987 	 */
1988 
1989 	start_pp = page_numtopp_nolock(pfnum);
1990 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1991 	new_npgs = page_get_pagecnt(new_szc);
1992 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1993 
1994 	/* don't return page of the wrong mtype */
1995 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1996 			return (NULL);
1997 
1998 	/*
1999 	 * Loop through smaller pages to confirm that all pages
2000 	 * give the same result for PP_ISNORELOC().
2001 	 * We can check this reliably here as the protocol for setting
2002 	 * P_NORELOC requires pages to be taken off the free list first.
2003 	 */
2004 	noreloc = PP_ISNORELOC(start_pp);
2005 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2006 		if (noreloc != PP_ISNORELOC(pp)) {
2007 			page_promote_noreloc_err++;
2008 			page_promote_err++;
2009 			return (NULL);
2010 		}
2011 	}
2012 
2013 	pages_left = new_npgs;
2014 	pplist = NULL;
2015 	pp = start_pp;
2016 
2017 	/* Loop around coalescing the smaller pages into a big page. */
2018 	while (pages_left) {
2019 		/*
2020 		 * Remove from the freelist.
2021 		 */
2022 		ASSERT(PP_ISFREE(pp));
2023 		bin = PP_2_BIN(pp);
2024 		ASSERT(mnode == PP_2_MEM_NODE(pp));
2025 		mtype = PP_2_MTYPE(pp);
2026 		if (PP_ISAGED(pp)) {
2027 
2028 			/*
2029 			 * PG_FREE_LIST
2030 			 */
2031 			if (pp->p_szc) {
2032 				page_vpsub(&PAGE_FREELISTS(mnode,
2033 				    pp->p_szc, bin, mtype), pp);
2034 			} else {
2035 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2036 				    bin, mtype), pp);
2037 			}
2038 			which_list = PG_FREE_LIST;
2039 		} else {
2040 			ASSERT(pp->p_szc == 0);
2041 
2042 			/*
2043 			 * PG_CACHE_LIST
2044 			 *
2045 			 * Since this page comes from the
2046 			 * cachelist, we must destroy the
2047 			 * vnode association.
2048 			 */
2049 			if (!page_trylock(pp, SE_EXCL)) {
2050 				goto fail_promote;
2051 			}
2052 
2053 			/*
2054 			 * We need to be careful not to deadlock
2055 			 * with another thread in page_lookup().
2056 			 * The page_lookup() thread could be holding
2057 			 * the same phm that we need if the two
2058 			 * pages happen to hash to the same phm lock.
2059 			 * At this point we have locked the entire
2060 			 * freelist and page_lookup() could be trying
2061 			 * to grab a freelist lock.
2062 			 */
2063 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2064 			phm = PAGE_HASH_MUTEX(index);
2065 			if (!mutex_tryenter(phm)) {
2066 				page_unlock_nocapture(pp);
2067 				goto fail_promote;
2068 			}
2069 
2070 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2071 			page_hashout(pp, phm);
2072 			mutex_exit(phm);
2073 			PP_SETAGED(pp);
2074 			page_unlock_nocapture(pp);
2075 			which_list = PG_CACHE_LIST;
2076 		}
2077 		page_ctr_sub(mnode, mtype, pp, which_list);
2078 
2079 		/*
2080 		 * Concatenate the smaller page(s) onto
2081 		 * the large page list.
2082 		 */
2083 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2084 		pages_left -= npgs;
2085 		tpp = pp;
2086 		while (npgs--) {
2087 			tpp->p_szc = new_szc;
2088 			tpp = tpp->p_next;
2089 		}
2090 		page_list_concat(&pplist, &pp);
2091 		pp += tmpnpgs;
2092 	}
2093 	CHK_LPG(pplist, new_szc);
2094 
2095 	/*
2096 	 * return the page to the user if requested
2097 	 * in the properly locked state.
2098 	 */
2099 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2100 		return (pplist);
2101 	}
2102 
2103 	/*
2104 	 * Otherwise place the new large page on the freelist
2105 	 */
2106 	bin = PP_2_BIN(pplist);
2107 	mnode = PP_2_MEM_NODE(pplist);
2108 	mtype = PP_2_MTYPE(pplist);
2109 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2110 
2111 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2112 	return (NULL);
2113 
2114 fail_promote:
2115 	/*
2116 	 * A thread must have still been freeing or
2117 	 * reclaiming the page on the cachelist.
2118 	 * To prevent a deadlock undo what we have
2119 	 * done sofar and return failure. This
2120 	 * situation can only happen while promoting
2121 	 * PAGESIZE pages.
2122 	 */
2123 	page_promote_err++;
2124 	while (pplist) {
2125 		pp = pplist;
2126 		mach_page_sub(&pplist, pp);
2127 		pp->p_szc = 0;
2128 		bin = PP_2_BIN(pp);
2129 		mtype = PP_2_MTYPE(pp);
2130 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2131 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2132 	}
2133 	return (NULL);
2134 
2135 }
2136 
2137 /*
2138  * Break up a large page into smaller size pages.
2139  * Pages involved are on the freelist before the call and may
2140  * be returned to the caller if requested, otherwise they will
2141  * be placed back on the freelist.
2142  * The caller is responsible for locking the freelist as well as any other
2143  * accounting which needs to be done for a returned page.
2144  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2145  * technically, any value may be passed in but PC_NO_COLOR is the standard
2146  * which should be followed for clarity's sake.
2147  * Returns a page whose pfn is < pfnmax
2148  */
2149 page_t *
2150 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2151     uchar_t new_szc, int color, int flags)
2152 {
2153 	page_t	*pp, *pplist, *npplist;
2154 	pgcnt_t	npgs, n;
2155 	uint_t	bin;
2156 	uint_t	mtype;
2157 	page_t	*ret_pp = NULL;
2158 
2159 	ASSERT(cur_szc != 0);
2160 	ASSERT(new_szc < cur_szc);
2161 
2162 	pplist = page_numtopp_nolock(pfnum);
2163 	ASSERT(pplist != NULL);
2164 
2165 	ASSERT(pplist->p_szc == cur_szc);
2166 
2167 	bin = PP_2_BIN(pplist);
2168 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2169 	mtype = PP_2_MTYPE(pplist);
2170 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2171 
2172 	CHK_LPG(pplist, cur_szc);
2173 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2174 
2175 	/*
2176 	 * Number of PAGESIZE pages for smaller new_szc
2177 	 * page.
2178 	 */
2179 	npgs = page_get_pagecnt(new_szc);
2180 
2181 	while (pplist) {
2182 		pp = pplist;
2183 
2184 		ASSERT(pp->p_szc == cur_szc);
2185 
2186 		/*
2187 		 * We either break it up into PAGESIZE pages or larger.
2188 		 */
2189 		if (npgs == 1) {	/* PAGESIZE case */
2190 			mach_page_sub(&pplist, pp);
2191 			ASSERT(pp->p_szc == cur_szc);
2192 			ASSERT(new_szc == 0);
2193 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2194 			pp->p_szc = new_szc;
2195 			bin = PP_2_BIN(pp);
2196 			if ((bin == color) && (flags == PC_ALLOC) &&
2197 			    (ret_pp == NULL) && (pfnmax == 0 ||
2198 			    pp->p_pagenum < pfnmax) &&
2199 			    page_trylock_cons(pp, SE_EXCL)) {
2200 				ret_pp = pp;
2201 			} else {
2202 				mtype = PP_2_MTYPE(pp);
2203 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2204 				    mtype), pp);
2205 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2206 			}
2207 		} else {
2208 			page_t *try_to_return_this_page = NULL;
2209 			int count = 0;
2210 
2211 			/*
2212 			 * Break down into smaller lists of pages.
2213 			 */
2214 			page_list_break(&pplist, &npplist, npgs);
2215 
2216 			pp = pplist;
2217 			n = npgs;
2218 			while (n--) {
2219 				ASSERT(pp->p_szc == cur_szc);
2220 				/*
2221 				 * Check whether all the pages in this list
2222 				 * fit the request criteria.
2223 				 */
2224 				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2225 					count++;
2226 				}
2227 				pp->p_szc = new_szc;
2228 				pp = pp->p_next;
2229 			}
2230 
2231 			if (count == npgs &&
2232 			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2233 				try_to_return_this_page = pp;
2234 			}
2235 
2236 			CHK_LPG(pplist, new_szc);
2237 
2238 			bin = PP_2_BIN(pplist);
2239 			if (try_to_return_this_page)
2240 				ASSERT(mnode ==
2241 				    PP_2_MEM_NODE(try_to_return_this_page));
2242 			if ((bin == color) && (flags == PC_ALLOC) &&
2243 			    (ret_pp == NULL) && try_to_return_this_page &&
2244 			    page_trylock_cons(try_to_return_this_page,
2245 			    SE_EXCL)) {
2246 				ret_pp = try_to_return_this_page;
2247 			} else {
2248 				mtype = PP_2_MTYPE(pp);
2249 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2250 				    bin, mtype), pplist);
2251 
2252 				page_ctr_add(mnode, mtype, pplist,
2253 				    PG_FREE_LIST);
2254 			}
2255 			pplist = npplist;
2256 		}
2257 	}
2258 	return (ret_pp);
2259 }
2260 
/*
 * When set to a non-zero value, page_freelist_coalesce() and
 * page_freelist_coalesce_all() return without attempting to coalesce.
 */
int mpss_coalesce_disable = 0;
2262 
/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * mnode	memory node whose page counters are searched
 * szc		size code of the page to build
 * color	requested color; only the bits selected by ceq_mask are compared
 * ceq_mask	mask of the color bits that must match
 * mtype	memory type passed on to page_promote()
 * pfnhi	if not PFNNULL, upper (exclusive) pfn bound for the search
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 *
 * A page returned by this routine comes from page_promote(..., PC_ALLOC, ...).
 */
page_t *
page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
    int mtype, pfn_t pfnhi)
{
	int 	r = szc;		/* region size */
	int	mrange;
	uint_t 	full, bin, color_mask, wrap = 0;
	pfn_t	pfnum, lo, hi;
	size_t	len, idx, idx0;
	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
	page_t	*ret_pp;
	MEM_NODE_ITERATOR_DECL(it);
#if defined(__sparc)
	pfn_t pfnum0, nlo, nhi;
#endif

	/* Coalescing has been turned off: count the request and give up. */
	if (mpss_coalesce_disable) {
		ASSERT(szc < MMU_PAGE_SIZES);
		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
		return (NULL);
	}

	ASSERT(szc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
	ASSERT(ceq_mask <= color_mask);
	ASSERT(color <= color_mask);
	/* only the bits in ceq_mask take part in the comparisons below */
	color &= ceq_mask;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	mrange = MTYPE_2_MRANGE(mnode, mtype);
	ASSERT(mrange < mnode_nranges[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);

	/* get pfn range for mtype */
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	/* from here on hi is used as an exclusive bound (pfnum >= hi fails) */
	hi++;

	/* use lower limit if given */
	if (pfnhi != PFNNULL && pfnhi < hi)
		hi = pfnhi;

	/* round to szcpgcnt boundaries */
	lo = P2ROUNDUP(lo, szcpgcnt);
	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
	if (lo == (pfn_t)-1) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}
	hi = hi & ~(szcpgcnt - 1);

	/* set lo to the closest pfn of the right color */
	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
		    &it);
	}

	/* nothing left of the range after alignment and color adjustment */
	if (hi <= lo) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	full = FULL_REGION_CNT(r);

	/* calculate the number of page candidates and initial search index */
	bin = color;
	idx0 = (size_t)(-1);
	do {
		pgcnt_t acand;

		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
		if (acand) {
			/* keep the lowest saved index among equivalent colors */
			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
			    r, bin, mrange);
			idx0 = MIN(idx0, idx);
			cands += acand;
		}
		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
	} while (bin != color);

	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	/*
	 * Turn the saved index into a starting pfn; fall back to lo when it
	 * lies outside [lo, hi) or cannot be used for this mnode/color.
	 */
	pfnum = IDX_TO_PNUM(mnode, r, idx0);
	if (pfnum < lo || pfnum >= hi) {
		pfnum = lo;
	} else {
		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
		if (pfnum == (pfn_t)-1) {
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			ASSERT(pfnum != (pfn_t)-1);
		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
			/* invalid color, get the closest correct pfn */
			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
			    color_mask, &it);
			if (pfnum >= hi) {
				pfnum = lo;
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			}
		}
	}

	/* set starting index */
	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
	ASSERT(idx0 < len);

#if defined(__sparc)
	pfnum0 = pfnum;		/* page corresponding to idx0 */
	nhi = 0;		/* search kcage ranges */
#endif

	/*
	 * Search from idx0 to the end of the range, then wrap around once
	 * and continue from lo until idx0 is reached again.
	 */
	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {

#if defined(__sparc)
		/*
		 * Find lowest intersection of kcage ranges and mnode.
		 * MTYPE_NORELOC means look in the cage, otherwise outside.
		 */
		if (nhi <= pfnum) {
			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
				goto wrapit;

			/* jump to the next page in the range */
			if (pfnum < nlo) {
				pfnum = P2ROUNDUP(nlo, szcpgcnt);
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				if (idx >= len || pfnum >= hi)
					goto wrapit;
				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
				    ceq_mask)
					goto next;
				if (interleaved_mnodes &&
				    PFN_2_MEM_NODE(pfnum) != mnode)
					goto next;
			}
		}
#endif

		/* unlocked peek: skip regions that are not entirely free */
		if (PAGE_COUNTERS(mnode, r, idx) != full)
			goto next;

		/*
		 * RFE: For performance maybe we can do something less
		 *	brutal than locking the entire freelist. So far
		 * 	this doesn't seem to be a performance problem?
		 */
		page_freelist_lock(mnode);
		/* re-check under the lock; the counter may have changed */
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			ret_pp =
			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
			if (ret_pp != NULL) {
				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				/* remember where to resume for this color */
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
		} else {
			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
		}

		page_freelist_unlock(mnode);
		/*
		 * No point looking for another page if we've
		 * already tried all of the ones that
		 * page_ctr_cands indicated.  Stash off where we left
		 * off.
		 * Note: this is not exact since we don't hold the
		 * page_freelist_locks before we initially get the
		 * value of cands for performance reasons, but should
		 * be a decent approximation.
		 */
		if (--cands == 0) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
			    idx;
			break;
		}
next:
		/* advance to the next pfn of an acceptable color */
		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
		    color_mask, &it);
		idx = PNUM_TO_IDX(mnode, r, pfnum);
		if (idx >= len || pfnum >= hi) {
wrapit:
			/* ran off the end: restart at lo and count the wrap */
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			idx = PNUM_TO_IDX(mnode, r, pfnum);
			wrap++;
#if defined(__sparc)
			nhi = 0;	/* search kcage ranges */
#endif
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
	return (NULL);
}
2485 
2486 /*
2487  * For the given mnode, promote as many small pages to large pages as possible.
2488  * mnode can be -1, which means do them all
2489  */
2490 void
2491 page_freelist_coalesce_all(int mnode)
2492 {
2493 	int 	r;		/* region size */
2494 	int 	idx, full;
2495 	size_t	len;
2496 	int doall = interleaved_mnodes || mnode < 0;
2497 	int mlo = doall ? 0 : mnode;
2498 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2499 
2500 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2501 
2502 	if (mpss_coalesce_disable) {
2503 		return;
2504 	}
2505 
2506 	/*
2507 	 * Lock the entire freelist and coalesce what we can.
2508 	 *
2509 	 * Always promote to the largest page possible
2510 	 * first to reduce the number of page promotions.
2511 	 */
2512 	for (mnode = mlo; mnode < mhi; mnode++) {
2513 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2514 		page_freelist_lock(mnode);
2515 	}
2516 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2517 		for (mnode = mlo; mnode < mhi; mnode++) {
2518 			pgcnt_t cands = 0;
2519 			int mrange, nranges = mnode_nranges[mnode];
2520 
2521 			for (mrange = 0; mrange < nranges; mrange++) {
2522 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2523 				if (cands != 0)
2524 					break;
2525 			}
2526 			if (cands == 0) {
2527 				VM_STAT_ADD(vmm_vmstats.
2528 				    page_ctrs_cands_skip_all);
2529 				continue;
2530 			}
2531 
2532 			full = FULL_REGION_CNT(r);
2533 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2534 
2535 			for (idx = 0; idx < len; idx++) {
2536 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2537 					pfn_t pfnum =
2538 					    IDX_TO_PNUM(mnode, r, idx);
2539 					int tmnode = interleaved_mnodes ?
2540 					    PFN_2_MEM_NODE(pfnum) : mnode;
2541 
2542 					ASSERT(pfnum >=
2543 					    mem_node_config[tmnode].physbase &&
2544 					    pfnum <
2545 					    mem_node_config[tmnode].physmax);
2546 
2547 					(void) page_promote(tmnode,
2548 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2549 				}
2550 			}
2551 			/* shared hpm_counters covers all mnodes, so we quit */
2552 			if (interleaved_mnodes)
2553 				break;
2554 		}
2555 	}
2556 	for (mnode = mlo; mnode < mhi; mnode++) {
2557 		page_freelist_unlock(mnode);
2558 		rw_exit(&page_ctrs_rwlock[mnode]);
2559 	}
2560 }
2561 
/*
 * This is where all policies for moving pages around
 * to different page size free lists is implemented.
 * Returns a locked page of size szc obtained by demoting a larger free
 * page, or NULL if no suitable larger page could be split.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is not PFNNULL, only large pages whose base pfn is below pfnhi
 * are considered.  If pfnlo is not PFNNULL, only large pages whose base pfn
 * is at least pfnlo are considered.
 *
 * plw carries the walker state: plw_bins[] holds the number of bins still to
 * be examined for each larger size and is decremented as bins are visited.
 */

page_t *
page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
    pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
{
	uchar_t nszc = szc + 1;
	uint_t 	bin, sbin, bin_prev;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;
	uint_t  color_mask;

	/* szc is already the largest size: there is nothing to split */
	if (nszc == mmu_page_sizes)
		return (NULL);

	ASSERT(nszc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);

	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
	/*
	 * First try to break up a larger page to fill current size freelist.
	 */
	while (plw->plw_bins[nszc] != 0) {

		ASSERT(nszc < mmu_page_sizes);

		/*
		 * If page found then demote it.
		 * The first test is made without the freelist lock; the list
		 * head is read again once the lock is held.
		 */
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 */
			if (pp &&
			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
				/*
				 * Walk the p_vpnext chain until a page inside
				 * the pfn window is found or the walk returns
				 * to the first page.
				 */
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while ((pfnhi != PFNNULL &&
				    pp->p_pagenum >= pfnhi) ||
				    (pfnlo != PFNNULL &&
				    pp->p_pagenum < pfnlo));

				if (pfnhi != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum < pfnhi);

				if (pfnlo != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum >= pfnlo);
			}
			if (pp) {
				uint_t ccolor = page_correct_color(szc, nszc,
				    color, bin, plw->plw_ceq_mask[szc]);

				ASSERT(pp->p_szc == nszc);
				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}

		/* loop through next size bins */
		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
		plw->plw_bins[nszc]--;

		if (bin == sbin) {
			uchar_t nnszc = nszc + 1;

			/* we are done with this page size - check next */
			if (plw->plw_bins[nnszc] == 0)
				/* we have already checked next size bins */
				break;

			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
			if (bin_prev != INVALID_COLOR) {
				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
				/*
				 * stop if the previous split already covered
				 * an equivalent color at the next size
				 */
				if (!((bin ^ bin_prev) &
				    plw->plw_ceq_mask[nnszc]))
					break;
			}
			ASSERT(nnszc < mmu_page_sizes);
			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
			nszc = nnszc;
			ASSERT(nszc < mmu_page_sizes);
		}
	}

	return (ret_pp);
}
2693 
2694 /*
2695  * Helper routine used only by the freelist code to lock
2696  * a page. If the page is a large page then it succeeds in
2697  * locking all the constituent pages or none at all.
2698  * Returns 1 on sucess, 0 on failure.
2699  */
2700 static int
2701 page_trylock_cons(page_t *pp, se_t se)
2702 {
2703 	page_t	*tpp, *first_pp = pp;
2704 
2705 	/*
2706 	 * Fail if can't lock first or only page.
2707 	 */
2708 	if (!page_trylock(pp, se)) {
2709 		return (0);
2710 	}
2711 
2712 	/*
2713 	 * PAGESIZE: common case.
2714 	 */
2715 	if (pp->p_szc == 0) {
2716 		return (1);
2717 	}
2718 
2719 	/*
2720 	 * Large page case.
2721 	 */
2722 	tpp = pp->p_next;
2723 	while (tpp != pp) {
2724 		if (!page_trylock(tpp, se)) {
2725 			/*
2726 			 * On failure unlock what we have locked so far.
2727 			 * We want to avoid attempting to capture these
2728 			 * pages as the pcm mutex may be held which could
2729 			 * lead to a recursive mutex panic.
2730 			 */
2731 			while (first_pp != tpp) {
2732 				page_unlock_nocapture(first_pp);
2733 				first_pp = first_pp->p_next;
2734 			}
2735 			return (0);
2736 		}
2737 		tpp = tpp->p_next;
2738 	}
2739 	return (1);
2740 }
2741 
2742 /*
2743  * init context for walking page lists
2744  * Called when a page of the given szc in unavailable. Sets markers
2745  * for the beginning of the search to detect when search has
2746  * completed a full cycle. Sets flags for splitting larger pages
2747  * and coalescing smaller pages. Page walking procedes until a page
2748  * of the desired equivalent color is found.
2749  */
2750 void
2751 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2752     int use_ceq, page_list_walker_t *plw)
2753 {
2754 	uint_t  nszc, ceq_mask, colors;
2755 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2756 
2757 	ASSERT(szc < mmu_page_sizes);
2758 	colors = PAGE_GET_PAGECOLORS(szc);
2759 
2760 	plw->plw_colors = colors;
2761 	plw->plw_color_mask = colors - 1;
2762 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2763 	plw->plw_bin_split_prev = bin;
2764 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2765 
2766 	/*
2767 	 * if vac aliasing is possible make sure lower order color
2768 	 * bits are never ignored
2769 	 */
2770 	if (vac_colors > 1)
2771 		ceq &= 0xf0;
2772 
2773 	/*
2774 	 * calculate the number of non-equivalent colors and
2775 	 * color equivalency mask
2776 	 */
2777 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2778 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2779 	ASSERT(plw->plw_ceq_dif > 0);
2780 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2781 
2782 	if (flags & PG_MATCH_COLOR) {
2783 		if (cpu_page_colors <  0) {
2784 			/*
2785 			 * this is a heterogeneous machine with different CPUs
2786 			 * having different size e$ (not supported for ni2/rock
2787 			 */
2788 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2789 			cpucolors = MAX(cpucolors, 1);
2790 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2791 			plw->plw_ceq_mask[szc] =
2792 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2793 		}
2794 		plw->plw_ceq_dif = 1;
2795 	}
2796 
2797 	/* we can split pages in the freelist, but not the cachelist */
2798 	if (can_split) {
2799 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2800 
2801 		/* set next szc color masks and number of free list bins */
2802 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2803 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2804 			    plw->plw_ceq_mask[szc]);
2805 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2806 		}
2807 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2808 		plw->plw_bins[nszc] = 0;
2809 
2810 	} else {
2811 		ASSERT(szc == 0);
2812 		plw->plw_do_split = 0;
2813 		plw->plw_bins[1] = 0;
2814 		plw->plw_ceq_mask[1] = INVALID_MASK;
2815 	}
2816 }
2817 
/*
 * set mark to flag where next split should occur
 *
 * Stores into (plw)->plw_split_next the next-size color that follows the
 * next-size color of 'bin'.  If that color is equivalent (under
 * plw_ceq_mask[nszc]) to the next-size color of plw_bin0, it is advanced
 * once more.
 *
 * The expansion is a brace-enclosed block that declares the locals
 * bin_nsz, bin0_nsz and neq_mask, so arguments must not refer to variables
 * of those names.  'plw' and 'nszc' are evaluated more than once; pass
 * expressions without side effects.
 */
#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
	uint_t bin_nsz = PAGE_GET_NSZ_COLOR((szc), (bin));		     \
	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR((szc), (plw)->plw_bin0);	     \
	uint_t neq_mask = ~(plw)->plw_ceq_mask[(nszc)] &		     \
	    (plw)->plw_color_mask;					     \
	(plw)->plw_split_next =						     \
		INC_MASKED(bin_nsz, neq_mask, (plw)->plw_color_mask);	     \
	if (!(((plw)->plw_split_next ^ bin0_nsz) &			     \
	    (plw)->plw_ceq_mask[(nszc)])) {				     \
		(plw)->plw_split_next =					     \
		INC_MASKED((plw)->plw_split_next,			     \
		    neq_mask, (plw)->plw_color_mask);			     \
	}								     \
}
2833 
/*
 * Pick the next bin to examine after 'bin' while walking the page lists
 * for pages of size szc, and return it.
 *
 * Besides computing the bin, this updates the walker state in plw:
 * plw_bin_split_prev and plw_split_next (when a split was pending),
 * plw_bin_marker, and plw_do_split, which is set again when the chosen bin
 * maps to a next-size color at which a split should be attempted.
 */
uint_t
page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
{
	/* color bits that are NOT part of the equivalence mask */
	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
	uchar_t nszc = szc + 1;

	/* default candidate: step to the next non-equivalent color */
	nbin = ADD_MASKED(bin,
	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);

	/* a split was requested for 'bin': note it and set the next marker */
	if (plw->plw_do_split) {
		plw->plw_bin_split_prev = bin;
		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
		plw->plw_do_split = 0;
	}

	if (szc == 0) {
		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
			/* step over the starting bin, bin0 */
			if (nbin == plw->plw_bin0 &&
			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
				    neq_mask, plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
			}

			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
				plw->plw_bin_marker =
				    nbin = INC_MASKED(nbin, neq_mask,
				    plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
				/*
				 * large pages all have the same vac color
				 * so by now we should be done with next
				 * size page splitting process
				 */
				ASSERT(plw->plw_bins[1] == 0);
				plw->plw_do_split = 0;
				return (nbin);
			}

		} else {
			/*
			 * First step of the walk (plw_count == 0): try a
			 * jump of about BIN_STEP bins away from bin0.
			 */
			uint_t bin_jump = (vac_colors == 1) ?
			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;

			bin_jump &= ~(vac_colors - 1);

			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
			    plw->plw_color_mask);

			/* use the jump only if it lands on a different color */
			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {

				plw->plw_bin_marker = nbin = nbin0;

				if (plw->plw_bins[nszc] != 0) {
					/*
					 * check if next page size bin is the
					 * same as the next page size bin for
					 * bin0
					 */
					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
					    nbin);
					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
					    plw->plw_bin0);

					if ((bin0_nsz ^ nbin_nsz) &
					    plw->plw_ceq_mask[nszc])
						plw->plw_do_split = 1;
				}
				return (nbin);
			}
		}
	}

	/*
	 * If next-size bins remain and the chosen bin maps to the marked
	 * next-size color, ask the caller to try a split there.
	 */
	if (plw->plw_bins[nszc] != 0) {
		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
		if (!((plw->plw_split_next ^ nbin_nsz) &
		    plw->plw_ceq_mask[nszc]))
			plw->plw_do_split = 1;
	}

	return (nbin);
}
2916 
/*
 * Get a free page of size szc from the freelists of memory node mnode.
 *
 * The search starts at color bin 'bin' with the memory type selected by
 * MTYPE_START().  Each bin of equivalent color is scanned for a page for
 * which IS_DUMP_PAGE() is false and which can be locked SE_EXCL.  If
 * those bins yield nothing, a larger page is split (page_freelist_split())
 * when the walker allows it and, for szc > 0, smaller pages are coalesced
 * (page_freelist_coalesce()); after that the walk moves to the bin chosen
 * by page_list_walk_next_bin().  Once the walk is exhausted MTYPE_NEXT()
 * may select another mtype, in which case the whole search is repeated.
 *
 * Returns the exclusively locked page, already removed from the freelist,
 * or NULL if nothing could be found.
 */
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
		return (NULL);
	}
try_again:

	/*
	 * The walker is set up lazily, the first time a bin turns out to
	 * be empty; until then only plw_ceq_dif is valid.
	 */
	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one freelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {
			/* unlocked peek; checked again under the bin mutex */
			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
				goto bin_empty_1;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			/*
			 * These were set before the page
			 * was put on the free list,
			 * they must still be set.
			 */
			ASSERT(PP_ISFREE(pp));
			ASSERT(PP_ISAGED(pp));
			ASSERT(pp->p_vnode == NULL);
			ASSERT(pp->p_hash == NULL);
			ASSERT(pp->p_offset == (u_offset_t)-1);
			ASSERT(pp->p_szc == szc);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

			/*
			 * Walk down the hash chain.
			 * 8k pages are linked on p_next
			 * and p_prev fields. Large pages
			 * are a contiguous group of
			 * constituent pages linked together
			 * on their p_next and p_prev fields.
			 * The large pages are linked together
			 * on the hash chain using p_vpnext
			 * p_vpprev of the base constituent
			 * page of each large page.
			 */
			first_pp = pp;
			while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp,
			    SE_EXCL)) {
				if (szc == 0) {
					pp = pp->p_next;
				} else {
					pp = pp->p_vpnext;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* back at the start: nothing usable here */
				if (pp == first_pp)
					goto bin_empty_0;
			}

			/* pp is now locked; take it off the freelist */
			ASSERT(pp != NULL);
			ASSERT(mtype == PP_2_MTYPE(pp));
			ASSERT(pp->p_szc == szc);
			if (szc == 0) {
				page_sub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
			} else {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
				CHK_LPG(pp, szc);
			}
			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
				panic("free page is not. pp %p", (void *)pp);
			mutex_exit(pcm);

#if defined(__sparc)
			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
			    (flags & PG_NORELOC) == 0);

			if (PP_ISNORELOC(pp))
				kcage_freemem_sub(page_get_pagecnt(szc));
#endif
			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
			return (pp);

bin_empty_0:
			/* reached with pcm held */
			mutex_exit(pcm);
bin_empty_1:
			/* reached without pcm held */
			if (plw_initialized == 0) {
				page_list_walk_init(szc, flags, bin, 1, 1,
				    &plw);
				plw_initialized = 1;
				ASSERT(plw.plw_colors <=
				    PAGE_GET_PAGECOLORS(szc));
				ASSERT(plw.plw_colors > 0);
				ASSERT((plw.plw_colors &
				    (plw.plw_colors - 1)) == 0);
				ASSERT(bin < plw.plw_colors);
				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
		} while (sbin != bin);

		/*
		 * color bins are all empty if color match. Try and
		 * satisfy the request by breaking up or coalescing
		 * pages from a different size freelist of the correct
		 * color that satisfies the ORIGINAL color requested.
		 * If that fails then try pages of the same size but
		 * different colors assuming we are not called with
		 * PG_MATCH_COLOR.
		 */
		if (plw.plw_do_split &&
		    (pp = page_freelist_split(szc, bin, mnode,
		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
			return (pp);

		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
			return (pp);

		/* more than one non-equivalent color: move to the next bin */
		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(szc, bin, &plw);
	}

	/* if allowed, cycle through additional mtypes */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

	return (NULL);
}
3083 
3084 /*
3085  * Returns the count of free pages for 'pp' with size code 'szc'.
3086  * Note: This function does not return an exact value as the page freelist
3087  * locks are not held and thus the values in the page_counters may be
3088  * changing as we walk through the data.
3089  */
3090 static int
3091 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3092 {
3093 	pgcnt_t	pgfree;
3094 	pgcnt_t cnt;
3095 	ssize_t	r = szc;	/* region size */
3096 	ssize_t	idx;
3097 	int	i;
3098 	int	full, range;
3099 
3100 	/* Make sure pagenum passed in is aligned properly */
3101 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3102 	ASSERT(szc > 0);
3103 
3104 	/* Prevent page_counters dynamic memory from being freed */
3105 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3106 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3107 	cnt = PAGE_COUNTERS(mnode, r, idx);
3108 	pgfree = cnt << PNUM_SHIFT(r - 1);
3109 	range = FULL_REGION_CNT(szc);
3110 
3111 	/* Check for completely full region */
3112 	if (cnt == range) {
3113 		rw_exit(&page_ctrs_rwlock[mnode]);
3114 		return (pgfree);
3115 	}
3116 
3117 	while (--r > 0) {
3118 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3119 		full = FULL_REGION_CNT(r);
3120 		for (i = 0; i < range; i++, idx++) {
3121 			cnt = PAGE_COUNTERS(mnode, r, idx);
3122 			/*
3123 			 * If cnt here is full, that means we have already
3124 			 * accounted for these pages earlier.
3125 			 */
3126 			if (cnt != full) {
3127 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3128 			}
3129 		}
3130 		range *= full;
3131 	}
3132 	rw_exit(&page_ctrs_rwlock[mnode]);
3133 	return (pgfree);
3134 }
3135 
3136 /*
3137  * Called from page_geti_contig_pages to exclusively lock constituent pages
3138  * starting from 'spp' for page size code 'szc'.
3139  *
3140  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3141  * region needs to be greater than or equal to the threshold.
3142  */
3143 static int
3144 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3145 {
3146 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3147 	pgcnt_t pgfree, i;
3148 	page_t *pp;
3149 
3150 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3151 
3152 
3153 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3154 		goto skipptcpcheck;
3155 	/*
3156 	 * check if there are sufficient free pages available before attempting
3157 	 * to trylock. Count is approximate as page counters can change.
3158 	 */
3159 	pgfree = page_freecnt(mnode, spp, szc);
3160 
3161 	/* attempt to trylock if there are sufficient already free pages */
3162 	if (pgfree < pgcnt/ptcpthreshold) {
3163 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3164 		return (0);
3165 	}
3166 
3167 skipptcpcheck:
3168 
3169 	for (i = 0; i < pgcnt; i++) {
3170 		pp = &spp[i];
3171 		if (!page_trylock(pp, SE_EXCL)) {
3172 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3173 			while (--i != (pgcnt_t)-1) {
3174 				pp = &spp[i];
3175 				ASSERT(PAGE_EXCL(pp));
3176 				page_unlock_nocapture(pp);
3177 			}
3178 			return (0);
3179 		}
3180 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3181 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3182 		    !PP_ISFREE(pp)) {
3183 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3184 			ASSERT(i == 0);
3185 			page_unlock_nocapture(pp);
3186 			return (0);
3187 		}
3188 
3189 		/*
3190 		 * If a page has been marked non-relocatable or has been
3191 		 * explicitly locked in memory, we don't want to relocate it;
3192 		 * unlock the pages and fail the operation.
3193 		 */
3194 		if (PP_ISNORELOC(pp) ||
3195 		    pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3196 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3197 			while (i != (pgcnt_t)-1) {
3198 				pp = &spp[i];
3199 				ASSERT(PAGE_EXCL(pp));
3200 				page_unlock_nocapture(pp);
3201 				i--;
3202 			}
3203 			return (0);
3204 		}
3205 	}
3206 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3207 	return (1);
3208 }
3209 
3210 /*
3211  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3212  * of 'szc' constituent pages that had been locked exclusively previously.
3213  * Will attempt to relocate constituent pages in use.
3214  */
3215 static page_t *
3216 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3217 {
3218 	spgcnt_t pgcnt, npgs, i;
3219 	page_t *targpp, *rpp, *hpp;
3220 	page_t *replpp = NULL;
3221 	page_t *pplist = NULL;
3222 
3223 	ASSERT(pp != NULL);
3224 
3225 	pgcnt = page_get_pagecnt(szc);
3226 	while (pgcnt) {
3227 		ASSERT(PAGE_EXCL(pp));
3228 		ASSERT(!PP_ISNORELOC(pp));
3229 		if (PP_ISFREE(pp)) {
3230 			/*
3231 			 * If this is a PG_FREE_LIST page then its
3232 			 * size code can change underneath us due to
3233 			 * page promotion or demotion. As an optimzation
3234 			 * use page_list_sub_pages() instead of
3235 			 * page_list_sub().
3236 			 */
3237 			if (PP_ISAGED(pp)) {
3238 				page_list_sub_pages(pp, szc);
3239 				if (pp->p_szc == szc) {
3240 					return (pp);
3241 				}
3242 				ASSERT(pp->p_szc < szc);
3243 				npgs = page_get_pagecnt(pp->p_szc);
3244 				hpp = pp;
3245 				for (i = 0; i < npgs; i++, pp++) {
3246 					pp->p_szc = szc;
3247 				}
3248 				page_list_concat(&pplist, &hpp);
3249 				pgcnt -= npgs;
3250 				continue;
3251 			}
3252 			ASSERT(!PP_ISAGED(pp));
3253 			ASSERT(pp->p_szc == 0);
3254 			page_list_sub(pp, PG_CACHE_LIST);
3255 			page_hashout(pp, NULL);
3256 			PP_SETAGED(pp);
3257 			pp->p_szc = szc;
3258 			page_list_concat(&pplist, &pp);
3259 			pp++;
3260 			pgcnt--;
3261 			continue;
3262 		}
3263 		npgs = page_get_pagecnt(pp->p_szc);
3264 
3265 		/*
3266 		 * page_create_wait freemem accounting done by caller of
3267 		 * page_get_freelist and not necessary to call it prior to
3268 		 * calling page_get_replacement_page.
3269 		 *
3270 		 * page_get_replacement_page can call page_get_contig_pages
3271 		 * to acquire a large page (szc > 0); the replacement must be
3272 		 * smaller than the contig page size to avoid looping or
3273 		 * szc == 0 and PGI_PGCPSZC0 is set.
3274 		 */
3275 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3276 			replpp = page_get_replacement_page(pp, NULL, 0);
3277 			if (replpp) {
3278 				npgs = page_get_pagecnt(pp->p_szc);
3279 				ASSERT(npgs <= pgcnt);
3280 				targpp = pp;
3281 			}
3282 		}
3283 
3284 		/*
3285 		 * If replacement is NULL or do_page_relocate fails, fail
3286 		 * coalescing of pages.
3287 		 */
3288 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3289 		    &npgs, NULL) != 0)) {
3290 			/*
3291 			 * Unlock un-processed target list
3292 			 */
3293 			while (pgcnt--) {
3294 				ASSERT(PAGE_EXCL(pp));
3295 				page_unlock_nocapture(pp);
3296 				pp++;
3297 			}
3298 			/*
3299 			 * Free the processed target list.
3300 			 */
3301 			while (pplist) {
3302 				pp = pplist;
3303 				page_sub(&pplist, pp);
3304 				ASSERT(PAGE_EXCL(pp));
3305 				ASSERT(pp->p_szc == szc);
3306 				ASSERT(PP_ISFREE(pp));
3307 				ASSERT(PP_ISAGED(pp));
3308 				pp->p_szc = 0;
3309 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3310 				page_unlock_nocapture(pp);
3311 			}
3312 
3313 			if (replpp != NULL)
3314 				page_free_replacement_page(replpp);
3315 
3316 			return (NULL);
3317 		}
3318 		ASSERT(pp == targpp);
3319 
3320 		/* LINTED */
3321 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3322 
3323 		pp += npgs;
3324 		pgcnt -= npgs;
3325 
3326 		while (npgs--) {
3327 			ASSERT(PAGE_EXCL(targpp));
3328 			ASSERT(!PP_ISFREE(targpp));
3329 			ASSERT(!PP_ISNORELOC(targpp));
3330 			PP_SETFREE(targpp);
3331 			ASSERT(PP_ISAGED(targpp));
3332 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3333 			    (flags & PGI_PGCPSZC0)));
3334 			targpp->p_szc = szc;
3335 			targpp = targpp->p_next;
3336 
3337 			rpp = replpp;
3338 			ASSERT(rpp != NULL);
3339 			page_sub(&replpp, rpp);
3340 			ASSERT(PAGE_EXCL(rpp));
3341 			ASSERT(!PP_ISFREE(rpp));
3342 			page_unlock_nocapture(rpp);
3343 		}
3344 		ASSERT(targpp == hpp);
3345 		ASSERT(replpp == NULL);
3346 		page_list_concat(&pplist, &targpp);
3347 	}
3348 	CHK_LPG(pplist, szc);
3349 	return (pplist);
3350 }
3351 
3352 /*
3353  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3354  * of 0 means nothing left after trim.
3355  */
3356 int
3357 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3358 {
3359 	pfn_t	kcagepfn;
3360 	int	decr;
3361 	int	rc = 0;
3362 
3363 	if (PP_ISNORELOC(mseg->pages)) {
3364 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3365 
3366 			/* lower part of this mseg inside kernel cage */
3367 			decr = kcage_current_pfn(&kcagepfn);
3368 
3369 			/* kernel cage may have transitioned past mseg */
3370 			if (kcagepfn >= mseg->pages_base &&
3371 			    kcagepfn < mseg->pages_end) {
3372 				ASSERT(decr == 0);
3373 				*lo = MAX(kcagepfn, pfnlo);
3374 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
3375 				rc = 1;
3376 			}
3377 		}
3378 		/* else entire mseg in the cage */
3379 	} else {
3380 		if (PP_ISNORELOC(mseg->epages - 1)) {
3381 
3382 			/* upper part of this mseg inside kernel cage */
3383 			decr = kcage_current_pfn(&kcagepfn);
3384 
3385 			/* kernel cage may have transitioned past mseg */
3386 			if (kcagepfn >= mseg->pages_base &&
3387 			    kcagepfn < mseg->pages_end) {
3388 				ASSERT(decr);
3389 				*hi = MIN(kcagepfn, pfnhi);
3390 				*lo = MAX(pfnlo, mseg->pages_base);
3391 				rc = 1;
3392 			}
3393 		} else {
3394 			/* entire mseg outside of kernel cage */
3395 			*lo = MAX(pfnlo, mseg->pages_base);
3396 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3397 			rc = 1;
3398 		}
3399 	}
3400 	return (rc);
3401 }
3402 
3403 /*
3404  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3405  * page with size code 'szc'. Claiming such a page requires acquiring
3406  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3407  * relocating pages in use and concatenating these constituent pages into a
3408  * large page.
3409  *
3410  * The page lists do not have such a large page and page_freelist_split has
3411  * already failed to demote larger pages and/or coalesce smaller free pages.
3412  *
3413  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3414  * pages with the same color as 'bin'.
3415  *
3416  * 'pfnflag' specifies the subset of the pfn range to search.
3417  */
3418 
3419 static page_t *
3420 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3421     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3422 {
3423 	struct memseg *mseg;
3424 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3425 	pgcnt_t szcpgmask = szcpgcnt - 1;
3426 	pfn_t	randpfn;
3427 	page_t *pp, *randpp, *endpp;
3428 	uint_t colors, ceq_mask;
3429 	/* LINTED : set but not used in function */
3430 	uint_t color_mask;
3431 	pfn_t hi, lo;
3432 	uint_t skip;
3433 	MEM_NODE_ITERATOR_DECL(it);
3434 
3435 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3436 
3437 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3438 
3439 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3440 		return (NULL);
3441 
3442 	ASSERT(szc < mmu_page_sizes);
3443 
3444 	colors = PAGE_GET_PAGECOLORS(szc);
3445 	color_mask = colors - 1;
3446 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3447 		uchar_t ceq = colorequivszc[szc];
3448 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3449 
3450 		ASSERT(ceq_dif > 0);
3451 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3452 	} else {
3453 		ceq_mask = 0;
3454 	}
3455 
3456 	ASSERT(bin < colors);
3457 
3458 	/* clear "non-significant" color bits */
3459 	bin &= ceq_mask;
3460 
3461 	/*
3462 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3463 	 * when there have been previous page_get_contig_page failures to
3464 	 * limit the search.
3465 	 *
3466 	 * The high bit in pfnflag specifies the number of 'slots' in the
3467 	 * pfn range and the remainder of pfnflag specifies which slot.
3468 	 * For example, a value of 1010b would mean the second slot of
3469 	 * the pfn range that has been divided into 8 slots.
3470 	 */
3471 	if (pfnflag > 1) {
3472 		int	slots = 1 << (highbit(pfnflag) - 1);
3473 		int	slotid = pfnflag & (slots - 1);
3474 		pgcnt_t	szcpages;
3475 		int	slotlen;
3476 
3477 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3478 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3479 		slotlen = howmany(szcpages, slots);
3480 		/* skip if 'slotid' slot is empty */
3481 		if (slotid * slotlen >= szcpages)
3482 			return (NULL);
3483 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3484 		ASSERT(pfnlo < pfnhi);
3485 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3486 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3487 	}
3488 
3489 	/*
3490 	 * This routine is can be called recursively so we shouldn't
3491 	 * acquire a reader lock if a write request is pending. This
3492 	 * could lead to a deadlock with the DR thread.
3493 	 *
3494 	 * Returning NULL informs the caller that we could not get
3495 	 * a contig page with the required characteristics.
3496 	 */
3497 
3498 	if (!memsegs_trylock(0))
3499 		return (NULL);
3500 
3501 	/*
3502 	 * loop through memsegs to look for contig page candidates
3503 	 */
3504 
3505 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3506 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3507 			/* no overlap */
3508 			continue;
3509 		}
3510 
3511 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3512 			/* mseg too small */
3513 			continue;
3514 
3515 		/*
3516 		 * trim off kernel cage pages from pfn range and check for
3517 		 * a trimmed pfn range returned that does not span the
3518 		 * desired large page size.
3519 		 */
3520 		if (kcage_on) {
3521 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3522 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3523 				continue;
3524 		} else {
3525 			lo = MAX(pfnlo, mseg->pages_base);
3526 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3527 		}
3528 
3529 		/* round to szcpgcnt boundaries */
3530 		lo = P2ROUNDUP(lo, szcpgcnt);
3531 
3532 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3533 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3534 
3535 		if (hi <= lo)
3536 			continue;
3537 
3538 		/*
3539 		 * set lo to point to the pfn for the desired bin. Large
3540 		 * page sizes may only have a single page color
3541 		 */
3542 		skip = szcpgcnt;
3543 		if (ceq_mask > 0 || interleaved_mnodes) {
3544 			/* set lo to point at appropriate color */
3545 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3546 			    (interleaved_mnodes &&
3547 			    PFN_2_MEM_NODE(lo) != mnode)) {
3548 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3549 				    color_mask, &it);
3550 			}
3551 			if (hi <= lo)
3552 				/* mseg cannot satisfy color request */
3553 				continue;
3554 		}
3555 
3556 		/* randomly choose a point between lo and hi to begin search */
3557 
3558 		randpfn = (pfn_t)GETTICK();
3559 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3560 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3561 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3562 			if (randpfn != (pfn_t)-1) {
3563 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3564 				    ceq_mask, color_mask, &it);
3565 			}
3566 			if (randpfn >= hi) {
3567 				randpfn = lo;
3568 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3569 				    &it);
3570 			}
3571 		}
3572 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3573 
3574 		ASSERT(randpp->p_pagenum == randpfn);
3575 
3576 		pp = randpp;
3577 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3578 
3579 		ASSERT(randpp + szcpgcnt <= endpp);
3580 
3581 		do {
3582 			ASSERT(!(pp->p_pagenum & szcpgmask));
3583 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3584 
3585 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3586 				/* pages unlocked by page_claim on failure */
3587 				if (page_claim_contig_pages(pp, szc, flags)) {
3588 					memsegs_unlock(0);
3589 					return (pp);
3590 				}
3591 			}
3592 
3593 			if (ceq_mask == 0 && !interleaved_mnodes) {
3594 				pp += skip;
3595 			} else {
3596 				pfn_t pfn = pp->p_pagenum;
3597 
3598 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3599 				    ceq_mask, color_mask, &it);
3600 				if (pfn == (pfn_t)-1) {
3601 					pp = endpp;
3602 				} else {
3603 					pp = mseg->pages +
3604 					    (pfn - mseg->pages_base);
3605 				}
3606 			}
3607 			if (pp >= endpp) {
3608 				/* start from the beginning */
3609 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3610 				pp = mseg->pages + (lo - mseg->pages_base);
3611 				ASSERT(pp->p_pagenum == lo);
3612 				ASSERT(pp + szcpgcnt <= endpp);
3613 			}
3614 		} while (pp != randpp);
3615 	}
3616 	memsegs_unlock(0);
3617 	return (NULL);
3618 }
3619 
3620 
3621 /*
3622  * controlling routine that searches through physical memory in an attempt to
3623  * claim a large page based on the input parameters.
3624  * on the page free lists.
3625  *
3626  * calls page_geti_contig_pages with an initial pfn range from the mnode
3627  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3628  * that overlaps with the kernel cage or does not match the requested page
3629  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3630  * page_geti_contig_pages may further limit the search range based on
3631  * previous failure counts (pgcpfailcnt[]).
3632  *
3633  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3634  * pagesize page that satisfies mtype.
3635  */
3636 page_t *
3637 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3638     uint_t flags)
3639 {
3640 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3641 	page_t		*pp;
3642 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3643 
3644 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3645 
3646 	/* no allocations from cage */
3647 	flags |= PGI_NOCAGE;
3648 
3649 	/* LINTED */
3650 	MTYPE_START(mnode, mtype, flags);
3651 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3652 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3653 		return (NULL);
3654 	}
3655 
3656 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3657 
3658 	/* do not limit search and ignore color if hi pri */
3659 
3660 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3661 		pfnflag = pgcpfailcnt[szc];
3662 
3663 	/* remove color match to improve chances */
3664 
3665 	if (flags & PGI_PGCPHIPRI || pfnflag)
3666 		flags &= ~PG_MATCH_COLOR;
3667 
3668 	do {
3669 		/* get pfn range based on mnode and mtype */
3670 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3671 
3672 		ASSERT(pfnhi >= pfnlo);
3673 
3674 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3675 		    pfnlo, pfnhi, pfnflag);
3676 
3677 		if (pp != NULL) {
3678 			pfnflag = pgcpfailcnt[szc];
3679 			if (pfnflag) {
3680 				/* double the search size */
3681 				pgcpfailcnt[szc] = pfnflag >> 1;
3682 			}
3683 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3684 			return (pp);
3685 		}
3686 		MTYPE_NEXT(mnode, mtype, flags);
3687 	} while (mtype >= 0);
3688 
3689 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3690 	return (NULL);
3691 }
3692 
3693 #if defined(__i386) || defined(__amd64)
3694 /*
3695  * Determine the likelihood of finding/coalescing a szc page.
3696  * Return 0 if the likelihood is small otherwise return 1.
3697  *
3698  * For now, be conservative and check only 1g pages and return 0
3699  * if there had been previous coalescing failures and the szc pages
3700  * needed to satisfy request would exhaust most of freemem.
3701  */
3702 int
3703 page_chk_freelist(uint_t szc)
3704 {
3705 	pgcnt_t		pgcnt;
3706 
3707 	if (szc <= 1)
3708 		return (1);
3709 
3710 	pgcnt = page_get_pagecnt(szc);
3711 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3712 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3713 		return (0);
3714 	}
3715 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3716 	return (1);
3717 }
3718 #endif
3719 
3720 /*
3721  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3722  *
3723  * Does its own locking and accounting.
3724  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3725  * pages of the proper color even if there are pages of a different color.
3726  *
3727  * Finds a page, removes it, THEN locks it.
3728  */
3729 
3730 /*ARGSUSED*/
3731 page_t *
3732 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3733 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3734 {
3735 	struct as	*as = seg->s_as;
3736 	page_t		*pp = NULL;
3737 	ulong_t		bin;
3738 	uchar_t		szc;
3739 	int		mnode;
3740 	int		mtype;
3741 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3742 	lgrp_mnode_cookie_t	lgrp_cookie;
3743 
3744 	page_get_func = page_get_mnode_freelist;
3745 
3746 	/*
3747 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3748 	 * assume we wish to allocate near to the current thread's home.
3749 	 */
3750 	if (!LGRP_EXISTS(lgrp))
3751 		lgrp = lgrp_home_lgrp();
3752 
3753 	if (kcage_on) {
3754 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3755 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3756 		    curthread != kcage_cageout_thread) {
3757 			/*
3758 			 * Set a "reserve" of kcage_throttlefree pages for
3759 			 * PG_PANIC and cageout thread allocations.
3760 			 *
3761 			 * Everybody else has to serialize in
3762 			 * page_create_get_something() to get a cage page, so
3763 			 * that we don't deadlock cageout!
3764 			 */
3765 			return (NULL);
3766 		}
3767 	} else {
3768 		flags &= ~PG_NORELOC;
3769 		flags |= PGI_NOCAGE;
3770 	}
3771 
3772 	/* LINTED */
3773 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3774 
3775 	/*
3776 	 * Convert size to page size code.
3777 	 */
3778 	if ((szc = page_szc(size)) == (uchar_t)-1)
3779 		panic("page_get_freelist: illegal page size request");
3780 	ASSERT(szc < mmu_page_sizes);
3781 
3782 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3783 
3784 	/* LINTED */
3785 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3786 
3787 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3788 
3789 	/*
3790 	 * Try to get a local page first, but try remote if we can't
3791 	 * get a page of the right color.
3792 	 */
3793 pgretry:
3794 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3795 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3796 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3797 		if (pp != NULL) {
3798 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3799 			DTRACE_PROBE4(page__get,
3800 			    lgrp_t *, lgrp,
3801 			    int, mnode,
3802 			    ulong_t, bin,
3803 			    uint_t, flags);
3804 			return (pp);
3805 		}
3806 	}
3807 	ASSERT(pp == NULL);
3808 
3809 	/*
3810 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3811 	 * remote free lists.  Caller expected to call page_get_cachelist which
3812 	 * will check local cache lists and remote free lists.
3813 	 */
3814 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3815 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3816 		return (NULL);
3817 	}
3818 
3819 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3820 
3821 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3822 
3823 	if (!(flags & PG_LOCAL)) {
3824 		/*
3825 		 * Try to get a non-local freelist page.
3826 		 */
3827 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3828 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3829 			pp = page_get_func(mnode, bin, mtype, szc, flags);
3830 			if (pp != NULL) {
3831 				DTRACE_PROBE4(page__get,
3832 				    lgrp_t *, lgrp,
3833 				    int, mnode,
3834 				    ulong_t, bin,
3835 				    uint_t, flags);
3836 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3837 				return (pp);
3838 			}
3839 		}
3840 		ASSERT(pp == NULL);
3841 	}
3842 
3843 	/*
3844 	 * when the cage is off chances are page_get_contig_pages() will fail
3845 	 * to lock a large page chunk therefore when the cage is off it's not
3846 	 * called by default.  this can be changed via /etc/system.
3847 	 *
3848 	 * page_get_contig_pages() also called to acquire a base pagesize page
3849 	 * for page_create_get_something().
3850 	 */
3851 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3852 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3853 	    (page_get_func != page_get_contig_pages)) {
3854 
3855 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3856 		page_get_func = page_get_contig_pages;
3857 		goto pgretry;
3858 	}
3859 
3860 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3861 	    page_get_func == page_get_contig_pages)
3862 		SETPGCPFAILCNT(szc);
3863 
3864 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3865 	return (NULL);
3866 }
3867 
3868 /*
3869  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3870  *
3871  * Does its own locking.
3872  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3873  * pages of the proper color even if there are pages of a different color.
3874  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3875  * try to lock one of them.  If no page can be locked, try the
3876  * next bin.  Return NULL if a page can not be found and locked.
3877  *
3878  * Finds a pages, trys to lock it, then removes it.
3879  */
3880 
3881 /*ARGSUSED*/
3882 page_t *
3883 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3884     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3885 {
3886 	page_t		*pp;
3887 	struct as	*as = seg->s_as;
3888 	ulong_t		bin;
3889 	/*LINTED*/
3890 	int		mnode;
3891 	int		mtype;
3892 	lgrp_mnode_cookie_t	lgrp_cookie;
3893 
3894 	/*
3895 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
3896 	 * assume we wish to allocate near to the current thread's home.
3897 	 */
3898 	if (!LGRP_EXISTS(lgrp))
3899 		lgrp = lgrp_home_lgrp();
3900 
3901 	if (!kcage_on) {
3902 		flags &= ~PG_NORELOC;
3903 		flags |= PGI_NOCAGE;
3904 	}
3905 
3906 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3907 	    kcage_freemem <= kcage_throttlefree) {
3908 		/*
3909 		 * Reserve kcage_throttlefree pages for critical kernel
3910 		 * threads.
3911 		 *
3912 		 * Everybody else has to go to page_create_get_something()
3913 		 * to get a cage page, so we don't deadlock cageout.
3914 		 */
3915 		return (NULL);
3916 	}
3917 
3918 	/* LINTED */
3919 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3920 
3921 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3922 
3923 	/* LINTED */
3924 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3925 
3926 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3927 
3928 	/*
3929 	 * Try local cachelists first
3930 	 */
3931 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3932 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3933 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3934 		if (pp != NULL) {
3935 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3936 			DTRACE_PROBE4(page__get,
3937 			    lgrp_t *, lgrp,
3938 			    int, mnode,
3939 			    ulong_t, bin,
3940 			    uint_t, flags);
3941 			return (pp);
3942 		}
3943 	}
3944 
3945 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3946 
3947 	/*
3948 	 * Try freelists/cachelists that are farther away
3949 	 * This is our only chance to allocate remote pages for PAGESIZE
3950 	 * requests.
3951 	 */
3952 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3953 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3954 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3955 		    0, flags);
3956 		if (pp != NULL) {
3957 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3958 			DTRACE_PROBE4(page__get,
3959 			    lgrp_t *, lgrp,
3960 			    int, mnode,
3961 			    ulong_t, bin,
3962 			    uint_t, flags);
3963 			return (pp);
3964 		}
3965 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3966 		if (pp != NULL) {
3967 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3968 			DTRACE_PROBE4(page__get,
3969 			    lgrp_t *, lgrp,
3970 			    int, mnode,
3971 			    ulong_t, bin,
3972 			    uint_t, flags);
3973 			return (pp);
3974 		}
3975 	}
3976 
3977 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3978 	return (NULL);
3979 }
3980 
3981 page_t *
3982 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3983 {
3984 	kmutex_t		*pcm;
3985 	page_t			*pp, *first_pp;
3986 	uint_t			sbin;
3987 	int			plw_initialized;
3988 	page_list_walker_t	plw;
3989 
3990 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3991 
3992 	/* LINTED */
3993 	MTYPE_START(mnode, mtype, flags);
3994 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3995 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3996 		return (NULL);
3997 	}
3998 
3999 try_again:
4000 
4001 	plw_initialized = 0;
4002 	plw.plw_ceq_dif = 1;
4003 
4004 	/*
4005 	 * Only hold one cachelist lock at a time, that way we
4006 	 * can start anywhere and not have to worry about lock
4007 	 * ordering.
4008 	 */
4009 
4010 	for (plw.plw_count = 0;
4011 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4012 		sbin = bin;
4013 		do {
4014 
4015 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
4016 				goto bin_empty_1;
4017 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
4018 			mutex_enter(pcm);
4019 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
4020 			if (pp == NULL)
4021 				goto bin_empty_0;
4022 
4023 			first_pp = pp;
4024 			ASSERT(pp->p_vnode);
4025 			ASSERT(PP_ISAGED(pp) == 0);
4026 			ASSERT(pp->p_szc == 0);
4027 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4028 			while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
4029 				pp = pp->p_next;
4030 				ASSERT(pp->p_szc == 0);
4031 				if (pp == first_pp) {
4032 					/*
4033 					 * We have searched the complete list!
4034 					 * And all of them (might only be one)
4035 					 * are locked. This can happen since
4036 					 * these pages can also be found via
4037 					 * the hash list. When found via the
4038 					 * hash list, they are locked first,
4039 					 * then removed. We give up to let the
4040 					 * other thread run.
4041 					 */
4042 					pp = NULL;
4043 					break;
4044 				}
4045 				ASSERT(pp->p_vnode);
4046 				ASSERT(PP_ISFREE(pp));
4047 				ASSERT(PP_ISAGED(pp) == 0);
4048 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4049 				    mnode);
4050 			}
4051 
4052 			if (pp) {
4053 				page_t	**ppp;
4054 				/*
4055 				 * Found and locked a page.
4056 				 * Pull it off the list.
4057 				 */
4058 				ASSERT(mtype == PP_2_MTYPE(pp));
4059 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4060 				page_sub(ppp, pp);
4061 				/*
4062 				 * Subtract counters before releasing pcm mutex
4063 				 * to avoid a race with page_freelist_coalesce
4064 				 * and page_freelist_split.
4065 				 */
4066 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4067 				mutex_exit(pcm);
4068 				ASSERT(pp->p_vnode);
4069 				ASSERT(PP_ISAGED(pp) == 0);
4070 #if defined(__sparc)
4071 				ASSERT(!kcage_on ||
4072 				    (flags & PG_NORELOC) == 0 ||
4073 				    PP_ISNORELOC(pp));
4074 				if (PP_ISNORELOC(pp)) {
4075 					kcage_freemem_sub(1);
4076 				}
4077 #endif
4078 				VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
4079 				return (pp);
4080 			}
4081 bin_empty_0:
4082 			mutex_exit(pcm);
4083 bin_empty_1:
4084 			if (plw_initialized == 0) {
4085 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
4086 				plw_initialized = 1;
4087 			}
4088 			/* calculate the next bin with equivalent color */
4089 			bin = ADD_MASKED(bin, plw.plw_bin_step,
4090 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
4091 		} while (sbin != bin);
4092 
4093 		if (plw.plw_ceq_dif > 1)
4094 			bin = page_list_walk_next_bin(0, bin, &plw);
4095 	}
4096 
4097 	MTYPE_NEXT(mnode, mtype, flags);
4098 	if (mtype >= 0)
4099 		goto try_again;
4100 
4101 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4102 	return (NULL);
4103 }
4104 
/*
 * Counters bumped through REPL_STAT_INCR() by page_get_replacement_page().
 * They exist on DEBUG kernels only; otherwise REPL_STAT_INCR() expands to
 * nothing.
 */
#if defined(DEBUG)
#define	REPL_PAGE_STATS
#endif	/* DEBUG */

#if defined(REPL_PAGE_STATS)
struct repl_page_stats {
	uint_t	ngets;		/* calls to page_get_replacement_page() */
	uint_t	ngets_noreloc;	/* page to replace was non-relocatable */
	uint_t	npgr_noreloc;	/* PGR_NORELOC replacement was requested */
	/* NOTE(review): the counters below are bumped outside this section */
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
};

struct repl_page_stats repl_page_stats;

#define	REPL_STAT_INCR(v)	atomic_inc_32(&repl_page_stats.v)
#else	/* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif	/* REPL_PAGE_STATS */

/* NOTE(review): presumably a tunable; its consumer is not visible here */
int	pgrppgcp;
4126 
4127 /*
4128  * The freemem accounting must be done by the caller.
4129  * First we try to get a replacement page of the same size as like_pp,
4130  * if that is not possible, then we just get a set of discontiguous
4131  * PAGESIZE pages.
4132  */
4133 page_t *
4134 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4135     uint_t pgrflags)
4136 {
4137 	page_t		*like_pp;
4138 	page_t		*pp, *pplist;
4139 	page_t		*pl = NULL;
4140 	ulong_t		bin;
4141 	int		mnode, page_mnode;
4142 	int		szc;
4143 	spgcnt_t	npgs, pg_cnt;
4144 	pfn_t		pfnum;
4145 	int		mtype;
4146 	int		flags = 0;
4147 	lgrp_mnode_cookie_t	lgrp_cookie;
4148 	lgrp_t		*lgrp;
4149 
4150 	REPL_STAT_INCR(ngets);
4151 	like_pp = orig_like_pp;
4152 	ASSERT(PAGE_EXCL(like_pp));
4153 
4154 	szc = like_pp->p_szc;
4155 	npgs = page_get_pagecnt(szc);
4156 	/*
4157 	 * Now we reset like_pp to the base page_t.
4158 	 * That way, we won't walk past the end of this 'szc' page.
4159 	 */
4160 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4161 	like_pp = page_numtopp_nolock(pfnum);
4162 	ASSERT(like_pp->p_szc == szc);
4163 
4164 	if (PP_ISNORELOC(like_pp)) {
4165 		ASSERT(kcage_on);
4166 		REPL_STAT_INCR(ngets_noreloc);
4167 		flags = PGI_RELOCONLY;
4168 	} else if (pgrflags & PGR_NORELOC) {
4169 		ASSERT(kcage_on);
4170 		REPL_STAT_INCR(npgr_noreloc);
4171 		flags = PG_NORELOC;
4172 	}
4173 
4174 	/*
4175 	 * Kernel pages must always be replaced with the same size
4176 	 * pages, since we cannot properly handle demotion of kernel
4177 	 * pages.
4178 	 */
4179 	if (PP_ISKAS(like_pp))
4180 		pgrflags |= PGR_SAMESZC;
4181 
4182 	/* LINTED */
4183 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4184 
4185 	while (npgs) {
4186 		pplist = NULL;
4187 		for (;;) {
4188 			pg_cnt = page_get_pagecnt(szc);
4189 			bin = PP_2_BIN(like_pp);
4190 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4191 			ASSERT(pg_cnt <= npgs);
4192 
4193 			/*
4194 			 * If an lgroup was specified, try to get the
4195 			 * page from that lgroup.
4196 			 * NOTE: Must be careful with code below because
4197 			 *	 lgroup may disappear and reappear since there
4198 			 *	 is no locking for lgroup here.
4199 			 */
4200 			if (LGRP_EXISTS(lgrp_target)) {
4201 				/*
4202 				 * Keep local variable for lgroup separate
4203 				 * from lgroup argument since this code should
4204 				 * only be exercised when lgroup argument
4205 				 * exists....
4206 				 */
4207 				lgrp = lgrp_target;
4208 
4209 				/* Try the lgroup's freelists first */
4210 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4211 				    LGRP_SRCH_LOCAL);
4212 				while ((pplist == NULL) &&
4213 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4214 				    != -1) {
4215 					pplist =
4216 					    page_get_mnode_freelist(mnode, bin,
4217 					    mtype, szc, flags);
4218 				}
4219 
4220 				/*
4221 				 * Now try it's cachelists if this is a
4222 				 * small page. Don't need to do it for
4223 				 * larger ones since page_freelist_coalesce()
4224 				 * already failed.
4225 				 */
4226 				if (pplist != NULL || szc != 0)
4227 					break;
4228 
4229 				/* Now try it's cachelists */
4230 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4231 				    LGRP_SRCH_LOCAL);
4232 
4233 				while ((pplist == NULL) &&
4234 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4235 				    != -1) {
4236 					pplist =
4237 					    page_get_mnode_cachelist(bin, flags,
4238 					    mnode, mtype);
4239 				}
4240 				if (pplist != NULL) {
4241 					page_hashout(pplist, NULL);
4242 					PP_SETAGED(pplist);
4243 					REPL_STAT_INCR(nhashout);
4244 					break;
4245 				}
4246 				/* Done looking in this lgroup. Bail out. */
4247 				break;
4248 			}
4249 
4250 			/*
4251 			 * No lgroup was specified (or lgroup was removed by
4252 			 * DR, so just try to get the page as close to
4253 			 * like_pp's mnode as possible.
4254 			 * First try the local freelist...
4255 			 */
4256 			mnode = PP_2_MEM_NODE(like_pp);
4257 			pplist = page_get_mnode_freelist(mnode, bin,
4258 			    mtype, szc, flags);
4259 			if (pplist != NULL)
4260 				break;
4261 
4262 			REPL_STAT_INCR(nnofree);
4263 
4264 			/*
4265 			 * ...then the local cachelist. Don't need to do it for
4266 			 * larger pages cause page_freelist_coalesce() already
4267 			 * failed there anyway.
4268 			 */
4269 			if (szc == 0) {
4270 				pplist = page_get_mnode_cachelist(bin, flags,
4271 				    mnode, mtype);
4272 				if (pplist != NULL) {
4273 					page_hashout(pplist, NULL);
4274 					PP_SETAGED(pplist);
4275 					REPL_STAT_INCR(nhashout);
4276 					break;
4277 				}
4278 			}
4279 
4280 			/* Now try remote freelists */
4281 			page_mnode = mnode;
4282 			lgrp =
4283 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4284 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4285 			    LGRP_SRCH_HIER);
4286 			while (pplist == NULL &&
4287 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4288 			    != -1) {
4289 				/*
4290 				 * Skip local mnode.
4291 				 */
4292 				if ((mnode == page_mnode) ||
4293 				    (mem_node_config[mnode].exists == 0))
4294 					continue;
4295 
4296 				pplist = page_get_mnode_freelist(mnode,
4297 				    bin, mtype, szc, flags);
4298 			}
4299 
4300 			if (pplist != NULL)
4301 				break;
4302 
4303 
4304 			/* Now try remote cachelists */
4305 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4306 			    LGRP_SRCH_HIER);
4307 			while (pplist == NULL && szc == 0) {
4308 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4309 				if (mnode == -1)
4310 					break;
4311 				/*
4312 				 * Skip local mnode.
4313 				 */
4314 				if ((mnode == page_mnode) ||
4315 				    (mem_node_config[mnode].exists == 0))
4316 					continue;
4317 
4318 				pplist = page_get_mnode_cachelist(bin,
4319 				    flags, mnode, mtype);
4320 
4321 				if (pplist != NULL) {
4322 					page_hashout(pplist, NULL);
4323 					PP_SETAGED(pplist);
4324 					REPL_STAT_INCR(nhashout);
4325 					break;
4326 				}
4327 			}
4328 
4329 			/*
4330 			 * Break out of while loop under the following cases:
4331 			 * - If we successfully got a page.
4332 			 * - If pgrflags specified only returning a specific
4333 			 *   page size and we could not find that page size.
4334 			 * - If we could not satisfy the request with PAGESIZE
4335 			 *   or larger pages.
4336 			 */
4337 			if (pplist != NULL || szc == 0)
4338 				break;
4339 
4340 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4341 				/* try to find contig page */
4342 
4343 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4344 				    LGRP_SRCH_HIER);
4345 
4346 				while ((pplist == NULL) &&
4347 				    (mnode =
4348 				    lgrp_memnode_choose(&lgrp_cookie))
4349 				    != -1) {
4350 					pplist = page_get_contig_pages(
4351 					    mnode, bin, mtype, szc,
4352 					    flags | PGI_PGCPHIPRI);
4353 				}
4354 				break;
4355 			}
4356 
4357 			/*
4358 			 * The correct thing to do here is try the next
4359 			 * page size down using szc--. Due to a bug
4360 			 * with the processing of HAT_RELOAD_SHARE
4361 			 * where the sfmmu_ttecnt arrays of all
4362 			 * hats sharing an ISM segment don't get updated,
4363 			 * using intermediate size pages for relocation
4364 			 * can lead to continuous page faults.
4365 			 */
4366 			szc = 0;
4367 		}
4368 
4369 		if (pplist != NULL) {
4370 			DTRACE_PROBE4(page__get,
4371 			    lgrp_t *, lgrp,
4372 			    int, mnode,
4373 			    ulong_t, bin,
4374 			    uint_t, flags);
4375 
4376 			while (pplist != NULL && pg_cnt--) {
4377 				ASSERT(pplist != NULL);
4378 				pp = pplist;
4379 				page_sub(&pplist, pp);
4380 				PP_CLRFREE(pp);
4381 				PP_CLRAGED(pp);
4382 				page_list_concat(&pl, &pp);
4383 				npgs--;
4384 				like_pp = like_pp + 1;
4385 				REPL_STAT_INCR(nnext_pp);
4386 			}
4387 			ASSERT(pg_cnt == 0);
4388 		} else {
4389 			break;
4390 		}
4391 	}
4392 
4393 	if (npgs) {
4394 		/*
4395 		 * We were unable to allocate the necessary number
4396 		 * of pages.
4397 		 * We need to free up any pl.
4398 		 */
4399 		REPL_STAT_INCR(nnopage);
4400 		page_free_replacement_page(pl);
4401 		return (NULL);
4402 	} else {
4403 		return (pl);
4404 	}
4405 }
4406 
4407 /*
4408  * demote a free large page to it's constituent pages
4409  */
4410 void
4411 page_demote_free_pages(page_t *pp)
4412 {
4413 
4414 	int mnode;
4415 
4416 	ASSERT(pp != NULL);
4417 	ASSERT(PAGE_LOCKED(pp));
4418 	ASSERT(PP_ISFREE(pp));
4419 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4420 
4421 	mnode = PP_2_MEM_NODE(pp);
4422 	page_freelist_lock(mnode);
4423 	if (pp->p_szc != 0) {
4424 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4425 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4426 	}
4427 	page_freelist_unlock(mnode);
4428 	ASSERT(pp->p_szc == 0);
4429 }
4430 
4431 /*
4432  * Factor in colorequiv to check additional 'equivalent' bins.
4433  * colorequiv may be set in /etc/system
4434  */
4435 void
4436 page_set_colorequiv_arr(void)
4437 {
4438 	if (colorequiv > 1) {
4439 		int i;
4440 		uint_t sv_a = lowbit(colorequiv) - 1;
4441 
4442 		if (sv_a > 15)
4443 			sv_a = 15;
4444 
4445 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4446 			uint_t colors;
4447 			uint_t a = sv_a;
4448 
4449 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4450 				continue;
4451 			}
4452 			while ((colors >> a) == 0)
4453 				a--;
4454 			if ((a << 4) > colorequivszc[i]) {
4455 				colorequivszc[i] = (a << 4);
4456 			}
4457 		}
4458 	}
4459 }
4460