xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 46b592853d0f4f11781b6b0a7533f267c6aee132)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 
35 /*
36  * This file contains common functions to access and manage the page lists.
37  * Many of these routines originated from platform-dependent modules
38  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function
39  * in a platform-independent manner.
40  *
41  * vm/vm_dep.h provides for platform specific support.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/debug.h>
46 #include <sys/cmn_err.h>
47 #include <sys/systm.h>
48 #include <sys/atomic.h>
49 #include <sys/sysmacros.h>
50 #include <vm/as.h>
51 #include <vm/page.h>
52 #include <vm/seg_kmem.h>
53 #include <vm/seg_vn.h>
54 #include <sys/vmsystm.h>
55 #include <sys/memnode.h>
56 #include <vm/vm_dep.h>
57 #include <sys/lgrp.h>
58 #include <sys/mem_config.h>
59 #include <sys/callb.h>
60 #include <sys/mem_cage.h>
61 #include <sys/sdt.h>
62 #include <sys/dumphdr.h>
63 
64 extern uint_t	vac_colors;
65 
66 #define	MAX_PRAGMA_ALIGN	128
67 
68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
69 
70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
71 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
72 #else
73 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
74 #endif
75 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
76 
77 /*
78  * Number of page colors equivalent to the requested color in the page_get
79  * routines.  If set, this keeps large pages intact longer and keeps MPO
80  * allocations on the local mnode, rather than acquiring the 'correct' page
81  * color from a demoted large page or from a remote mnode.
82  */
83 uint_t	colorequiv;
84 
85 /*
86  * color equivalency mask for each page size.
87  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
88  * The high 4 bits determine the number of high-order bits of the color to
89  * ignore.  The low 4 bits determine the number of low-order bits to ignore
90  * (only relevant for hashed-index-based page coloring).
91  */
92 uchar_t colorequivszc[MMU_PAGE_SIZES];
93 
94 /*
95  * If set, specifies the percentage of pages within a large page region that
96  * must be free before attempting to lock those pages for
97  * page_get_contig_pages processing.
98  *
99  * Should be turned on when kpr is available, since page_trylock_contig_pages
100  * can then be more selective.
101  */
102 
103 int	ptcpthreshold;
104 
105 /*
106  * Limit the page_get_contig_pages search based on failure counts in
107  * pgcpfailcnt[].  Enabled by default via pgcplimitsearch.
108  *
109  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
110  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
111  * bound. This upper bound range guarantees:
112  *    - all large page 'slots' will be searched over time
113  *    - at least one large page candidate is considered on each pgcp call
114  *    - count doesn't wrap around to 0
115  */
116 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
117 int	pgcplimitsearch = 1;
118 
119 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
120 #define	SETPGCPFAILCNT(szc)						\
121 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
122 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
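/*
 * Worked example of the bound above (hypothetical numbers, not from any
 * particular system): with physinstalled = 0x180000 pages, highbit()
 * returns 21, so PGCPFAILMAX = 1 << 20 = 0x100000 (which is >= half of
 * installed memory).  SETPGCPFAILCNT(szc) lets pgcpfailcnt[szc] climb to
 * 0x100000 and then resets it to 0x80000, keeping the count within the
 * documented upper bound without ever wrapping to 0.
 */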
123 
124 #ifdef VM_STATS
125 struct vmm_vmstats_str  vmm_vmstats;
126 
127 #endif /* VM_STATS */
128 
129 #if defined(__sparc)
130 #define	LPGCREATE	0
131 #else
132 /* enable page_get_contig_pages */
133 #define	LPGCREATE	1
134 #endif
135 
136 int pg_contig_disable;
137 int pg_lpgcreate_nocage = LPGCREATE;
138 
139 /*
140  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
141  */
142 #define	PFNNULL		0
143 
144 /* Flags involved in promotion and demotion routines */
145 #define	PC_FREE		0x1	/* put page on freelist */
146 #define	PC_ALLOC	0x2	/* return page for allocation */
147 
148 /*
149  * Flag for page_demote to be used with PC_FREE to denote that we don't care
150  * what the color is; the color parameter to the function is ignored.
151  */
152 #define	PC_NO_COLOR	(-1)
153 
154 /* mtype value for page_promote to use when mtype does not matter */
155 #define	PC_MTYPE_ANY	(-1)
156 
157 /*
158  * page counters candidates info
159  * See page_ctrs_cands comment below for more details.
160  * fields are as follows:
161  *	pcc_pages_free:		# pages which freelist coalesce can create
162  *	pcc_color_free:		pointer to page free counts per color
163  */
164 typedef struct pcc_info {
165 	pgcnt_t	pcc_pages_free;
166 	pgcnt_t	*pcc_color_free;
167 	uint_t	pad[12];
168 } pcc_info_t;
169 
170 /*
171  * On big machines it can take a long time to check page_counters
172  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
173  * updated sum of all elements of the corresponding page_counters arrays.
174  * page_freelist_coalesce() searches page_counters only if an appropriate
175  * element of page_ctrs_cands array is greater than 0.
176  *
177  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
178  */
179 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
180 
181 /*
182  * Return in val the total number of free pages which can be created
183  * for the given mnode (m), mrange (g), and region size (r)
184  */
185 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
186 	int i;								\
187 	val = 0;							\
188 	for (i = 0; i < NPC_MUTEX; i++) {				\
189 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
190 	}								\
191 }
192 
193 /*
194  * Return in val the total number of free pages which can be created
195  * for the given mnode (m), mrange (g), region size (r), and color (c)
196  */
197 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
198 	int i;								\
199 	val = 0;							\
200 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
201 	for (i = 0; i < NPC_MUTEX; i++) {				\
202 	    val +=							\
203 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
204 	}								\
205 }
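/*
 * Usage sketch for the macros above (hypothetical caller and variable
 * names; the real consumers are the freelist coalesce/promote paths in
 * this file, which skip the page_counters search when the value is 0):
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
 *	if (cands == 0)
 *		return (NULL);		(nothing worth coalescing at size r)
 */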
206 
207 /*
208  * We can only allow a single thread to update a counter within the physical
209  * range of the largest supported page size. That is the finest granularity
210  * possible since the counter values are dependent on each other
211  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
212  * ctr_mutex lock index for a particular physical range.
213  */
214 static kmutex_t	*ctr_mutex[NPC_MUTEX];
215 
216 #define	PP_CTR_LOCK_INDX(pp)						\
217 	(((pp)->p_pagenum >>						\
218 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
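/*
 * Illustrative example (assumed values): if the largest page size spans
 * 512 base pages (PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 9) and
 * NPC_MUTEX == 4, then a page with p_pagenum == 0x12345 hashes to
 * (0x12345 >> 9) & 3 == 0x91 & 3 == 1, i.e. ctr_mutex index 1.  All pages
 * within the same largest-size region share the same lock index.
 */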
219 
220 #define	INVALID_COLOR 0xffffffff
221 #define	INVALID_MASK  0xffffffff
222 
223 /*
224  * Local functions prototypes.
225  */
226 
227 void page_ctr_add(int, int, page_t *, int);
228 void page_ctr_add_internal(int, int, page_t *, int);
229 void page_ctr_sub(int, int, page_t *, int);
230 void page_ctr_sub_internal(int, int, page_t *, int);
231 void page_freelist_lock(int);
232 void page_freelist_unlock(int);
233 page_t *page_promote(int, pfn_t, uchar_t, int, int);
234 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
235 page_t *page_freelist_split(uchar_t,
236     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
238 static int page_trylock_cons(page_t *pp, se_t se);
239 
240 /*
241  * The page_counters array below is used to keep track of free contiguous
242  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
243  * This contains an array of counters, the size of the array, a shift value
244  * used to convert a pagenum into a counter array index or vice versa, as
245  * well as a cache of the last successful index to be promoted to a larger
246  * page size.  As an optimization, we keep track of the last successful index
247  * to be promoted per page color for the given size region, and this is
248  * allocated dynamically based upon the number of colors for a given
249  * region size.
250  *
251  * Conceptually, the page counters are represented as:
252  *
253  *	page_counters[region_size][mnode]
254  *
255  *	region_size:	size code of a candidate larger page made up
256  *			of contiguous free smaller pages.
257  *
258  *	page_counters[region_size][mnode].hpm_counters[index]:
259  *		represents how many (region_size - 1) pages either
260  *		exist or can be created within the given index range.
261  *
262  * Let's look at a sparc example:
263  *	If we want to create a free 512k page, we look at region_size 2
264  *	for the mnode we want.  We calculate the index and look at a specific
265  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
266  *	this location, it means that 8 64k pages either exist or can be created
267  *	from 8K pages in order to make a single free 512k page at the given
268  *	index.  Note that when a region is full, it will contribute to the
269  *	counts in the region above it.  Thus we will not know what page
270  *	size the free pages that can be promoted into this new free page
271  *	will be, unless we look at all regions below the current region.
272  */
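/*
 * A small numeric illustration of the cascade described above, using the
 * hypothetical sparc-like layout (8K base pages, FULL_REGION_CNT == 8):
 * freeing an 8K page increments the 64K-region counter (region_size 1)
 * for its index.  If that counter reaches 8, the whole 64K region is now
 * free, so the 512K-region counter (region_size 2) is incremented as
 * well, and so on up through the region sizes.  See page_ctr_add_internal()
 * below for the loop that implements this.
 */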
273 
274 /*
275  * Note: hpmctr_t is defined in platform vm_dep.h
276  * hw_page_map_t contains all the information needed for the page_counters
277  * logic. The fields are as follows:
278  *
279  *	hpm_counters:	dynamically allocated array to hold counter data
280  *	hpm_entries:	entries in hpm_counters
281  *	hpm_shift:	shift for pnum/array index conv
282  *	hpm_base:	PFN mapped to counter index 0
283  *	hpm_color_current:	last index in counter array for this color at
284  *				which we successfully created a large page
285  */
286 typedef struct hw_page_map {
287 	hpmctr_t	*hpm_counters;
288 	size_t		hpm_entries;
289 	int		hpm_shift;
290 	pfn_t		hpm_base;
291 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
292 #if defined(__sparc)
293 	uint_t		pad[4];
294 #endif
295 } hw_page_map_t;
296 
297 /*
298  * Element zero is not used, but is allocated for convenience.
299  */
300 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
301 
302 /*
303  * Cached value of MNODE_RANGE_CNT(mnode).
304  * This is a function call on x86.
305  */
306 static int mnode_nranges[MAX_MEM_NODES];
307 static int mnode_maxmrange[MAX_MEM_NODES];
308 
309 /*
310  * The following macros are convenient ways to get access to the individual
311  * elements of the page_counters arrays.  They can be used on both
312  * the left and right sides of assignments.
313  */
314 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
315 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
316 
317 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
318 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
319 
320 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
321 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
322 
323 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
324 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
325 
326 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
327 	(page_counters[(rg_szc)][(mnode)].hpm_base)
328 
329 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
330 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
331 
332 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
333 	(page_counters[(rg_szc)][(mnode)].				\
334 	hpm_color_current[(mrange)][(color)])
335 
336 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
337 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
338 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
339 
340 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
341 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
342 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
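/*
 * Worked example of the two macros above (hypothetical sparc-like values):
 * for region_size 2 (512K regions built from 8K base pages), hpm_shift is
 * PAGE_BSZS_SHIFT(2) == 6, since 512K / 8K == 64 == 1 << 6.  With
 * hpm_base == 0x1000, pfn 0x1234 maps to index (0x1234 - 0x1000) >> 6 == 0x8,
 * and IDX_TO_PNUM() maps index 0x8 back to pfn 0x1000 + (0x8 << 6) == 0x1200,
 * the first pfn of that 512K region.
 */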
343 
344 /*
345  * Protects the hpm_counters and hpm_color_current memory from changing while
346  * looking at page counters information.
347  * Grab the write lock to modify what these fields point at.
348  * Grab the read lock to prevent any pointers from changing.
349  * The write lock cannot be held during memory allocation due to a possible
350  * recursion deadlock caused by trying to grab the read lock while the
351  * write lock is already held.
352  */
353 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
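/*
 * Reader-side sketch of the locking convention described above (the actual
 * readers and writers are elsewhere in this file; e.g. page_ctrs_adjust()
 * takes the write side via PAGE_CTRS_WRITE_LOCK()):
 *
 *	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
 *	(examine PAGE_COUNTERS_COUNTERS(mnode, r) and hpm_color_current)
 *	rw_exit(&page_ctrs_rwlock[mnode]);
 */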
354 
355 
356 /*
357  * Initialize cpu_vm_data to point at a cache-aligned vm_cpu_data_t.
358  */
359 void
360 cpu_vm_data_init(struct cpu *cp)
361 {
362 	if (cp == CPU0) {
363 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
364 	} else {
365 		void	*kmptr;
366 		int	align;
367 		size_t	sz;
368 
369 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
370 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
371 		kmptr = kmem_zalloc(sz, KM_SLEEP);
372 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
373 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
374 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
375 	}
376 }
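/*
 * Alignment arithmetic above, with hypothetical numbers: if align == 64 and
 * sizeof (vm_cpu_data_t) == 200, then sz == P2ROUNDUP(200, 64) + 64 ==
 * 256 + 64 == 320.  The extra 'align' bytes guarantee that an aligned
 * vm_cpu_data_t fits inside the allocation no matter where kmem_zalloc()
 * places kmptr, and vc_kmptr/vc_kmsize record the raw allocation so that
 * cpu_vm_data_destroy() can free it.
 */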
377 
378 /*
379  * free cpu_vm_data
380  */
381 void
382 cpu_vm_data_destroy(struct cpu *cp)
383 {
384 	if (cp->cpu_seqid && cp->cpu_vm_data) {
385 		ASSERT(cp != CPU0);
386 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
387 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
388 	}
389 	cp->cpu_vm_data = NULL;
390 }
391 
392 
393 /*
394  * page size to page size code
395  */
396 int
397 page_szc(size_t pagesize)
398 {
399 	int	i = 0;
400 
401 	while (hw_page_array[i].hp_size) {
402 		if (pagesize == hw_page_array[i].hp_size)
403 			return (i);
404 		i++;
405 	}
406 	return (-1);
407 }
408 
409 /*
410  * page size to page size code with the restriction that it be a supported
411  * user page size.  If it's not a supported user page size, -1 will be returned.
412  */
413 int
414 page_szc_user_filtered(size_t pagesize)
415 {
416 	int szc = page_szc(pagesize);
417 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
418 		return (szc);
419 	}
420 	return (-1);
421 }
422 
423 /*
424  * Return how many page sizes are available for the user to use.  This is
425  * what the hardware supports and not based upon how the OS implements the
426  * support of different page sizes.
427  *
428  * If legacy is non-zero, return the number of pagesizes available to legacy
429  * applications. The number of legacy page sizes might be less than the
430  * exported user page sizes. This is to prevent legacy applications that
431  * use the largest page size returned from getpagesizes(3c) from inadvertently
432  * using the 'new' large pagesizes.
433  */
434 uint_t
435 page_num_user_pagesizes(int legacy)
436 {
437 	if (legacy)
438 		return (mmu_legacy_page_sizes);
439 	return (mmu_exported_page_sizes);
440 }
441 
442 uint_t
443 page_num_pagesizes(void)
444 {
445 	return (mmu_page_sizes);
446 }
447 
448 /*
449  * returns the number of base pagesize pages associated with szc
450  */
451 pgcnt_t
452 page_get_pagecnt(uint_t szc)
453 {
454 	if (szc >= mmu_page_sizes)
455 		panic("page_get_pagecnt: out of range %d", szc);
456 	return (hw_page_array[szc].hp_pgcnt);
457 }
458 
459 size_t
460 page_get_pagesize(uint_t szc)
461 {
462 	if (szc >= mmu_page_sizes)
463 		panic("page_get_pagesize: out of range %d", szc);
464 	return (hw_page_array[szc].hp_size);
465 }
466 
467 /*
468  * Return the size of a page based upon the index passed in.  An index of
469  * zero refers to the smallest page size in the system, and as the index
470  * increases it refers to the next larger supported page size.
471  * Note that szc and userszc may not be the same due to unsupported szc's on
472  * some systems.
473  */
474 size_t
475 page_get_user_pagesize(uint_t userszc)
476 {
477 	uint_t szc = USERSZC_2_SZC(userszc);
478 
479 	if (szc >= mmu_page_sizes)
480 		panic("page_get_user_pagesize: out of range %d", szc);
481 	return (hw_page_array[szc].hp_size);
482 }
483 
484 uint_t
485 page_get_shift(uint_t szc)
486 {
487 	if (szc >= mmu_page_sizes)
488 		panic("page_get_shift: out of range %d", szc);
489 	return (PAGE_GET_SHIFT(szc));
490 }
491 
492 uint_t
493 page_get_pagecolors(uint_t szc)
494 {
495 	if (szc >= mmu_page_sizes)
496 		panic("page_get_pagecolors: out of range %d", szc);
497 	return (PAGE_GET_PAGECOLORS(szc));
498 }
499 
500 /*
501  * this assigns the desired equivalent color after a split
502  */
503 uint_t
504 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
505     uint_t ncolor, uint_t ceq_mask)
506 {
507 	ASSERT(nszc > szc);
508 	ASSERT(szc < mmu_page_sizes);
509 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
510 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
511 
512 	color &= ceq_mask;
513 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
514 	return (color | (ncolor & ~ceq_mask));
515 }
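/*
 * Illustrative example with made-up values: suppose ceq_mask == 0x0f, the
 * requested color == 0x2a, and the converted parent color
 * PAGE_CONVERT_COLOR(ncolor, szc, nszc) == 0x35.  The result is then
 * (0x2a & 0x0f) | (0x35 & ~0x0f) == 0x0a | 0x30 == 0x3a: the bits covered
 * by ceq_mask come from the requested color, and the remaining bits are
 * inherited from the (converted) color of the larger page being split.
 */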
516 
517 /*
518  * The interleaved_mnodes flag is set when mnodes overlap in
519  * the physbase..physmax range, but have disjoint slices.
520  * In this case hpm_counters is shared by all mnodes.
521  * This flag is set dynamically by the platform.
522  */
523 int interleaved_mnodes = 0;
524 
525 /*
526  * Called by startup().
527  * Size up the per page size free list counters based on physmax
528  * of each node and max_mem_nodes.
529  *
530  * If interleaved_mnodes is set we need to find the first mnode that
531  * exists. hpm_counters for the first mnode will then be shared by
532  * all other mnodes. If interleaved_mnodes is not set, just set
533  * first=mnode each time. That means there will be no sharing.
534  */
535 size_t
536 page_ctrs_sz(void)
537 {
538 	int	r;		/* region size */
539 	int	mnode;
540 	int	firstmn;	/* first mnode that exists */
541 	int	nranges;
542 	pfn_t	physbase;
543 	pfn_t	physmax;
544 	uint_t	ctrs_sz = 0;
545 	int 	i;
546 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
547 
548 	/*
549 	 * We need to determine how many page colors there are for each
550 	 * page size in order to allocate memory for any color specific
551 	 * arrays.
552 	 */
553 	for (i = 0; i < mmu_page_sizes; i++) {
554 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
555 	}
556 
557 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
558 
559 		pgcnt_t r_pgcnt;
560 		pfn_t   r_base;
561 		pgcnt_t r_align;
562 
563 		if (mem_node_config[mnode].exists == 0)
564 			continue;
565 
566 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
567 		nranges = MNODE_RANGE_CNT(mnode);
568 		mnode_nranges[mnode] = nranges;
569 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
570 
571 		/*
572 		 * determine size needed for page counter arrays with
573 		 * base aligned to large page size.
574 		 */
575 		for (r = 1; r < mmu_page_sizes; r++) {
576 			/* add in space for hpm_color_current */
577 			ctrs_sz += sizeof (size_t) *
578 			    colors_per_szc[r] * nranges;
579 
580 			if (firstmn != mnode)
581 				continue;
582 
583 			/* add in space for hpm_counters */
584 			r_align = page_get_pagecnt(r);
585 			r_base = physbase;
586 			r_base &= ~(r_align - 1);
587 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
588 
589 			/*
590 			 * Round up to always allocate on pointer sized
591 			 * boundaries.
592 			 */
593 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
594 			    sizeof (hpmctr_t *));
595 		}
596 	}
597 
598 	for (r = 1; r < mmu_page_sizes; r++) {
599 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
600 	}
601 
602 	/* add in space for page_ctrs_cands and pcc_color_free */
603 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
604 	    mmu_page_sizes * NPC_MUTEX;
605 
606 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
607 
608 		if (mem_node_config[mnode].exists == 0)
609 			continue;
610 
611 		nranges = mnode_nranges[mnode];
612 		ctrs_sz += sizeof (pcc_info_t) * nranges *
613 		    mmu_page_sizes * NPC_MUTEX;
614 		for (r = 1; r < mmu_page_sizes; r++) {
615 			ctrs_sz += sizeof (pgcnt_t) * nranges *
616 			    colors_per_szc[r] * NPC_MUTEX;
617 		}
618 	}
619 
620 	/* ctr_mutex */
621 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
622 
623 	/* size for page list counts */
624 	PLCNT_SZ(ctrs_sz);
625 
626 	/*
627 	 * add some slop for roundups. page_ctrs_alloc will round up the start
628 	 * address of the counters to an ecache_alignsize boundary for every
629 	 * memory node.
630 	 */
631 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
632 }
633 
634 caddr_t
635 page_ctrs_alloc(caddr_t alloc_base)
636 {
637 	int	mnode;
638 	int	mrange, nranges;
639 	int	r;		/* region size */
640 	int	i;
641 	int	firstmn;	/* first mnode that exists */
642 	pfn_t	physbase;
643 	pfn_t	physmax;
644 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
645 
646 	/*
647 	 * We need to determine how many page colors there are for each
648 	 * page size in order to allocate memory for any color specific
649 	 * arrays.
650 	 */
651 	for (i = 0; i < mmu_page_sizes; i++) {
652 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
653 	}
654 
655 	for (r = 1; r < mmu_page_sizes; r++) {
656 		page_counters[r] = (hw_page_map_t *)alloc_base;
657 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
658 	}
659 
660 	/* page_ctrs_cands and pcc_color_free array */
661 	for (i = 0; i < NPC_MUTEX; i++) {
662 		for (r = 1; r < mmu_page_sizes; r++) {
663 
664 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
665 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
666 
667 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
668 				pcc_info_t *pi;
669 
670 				if (mem_node_config[mnode].exists == 0)
671 					continue;
672 
673 				nranges = mnode_nranges[mnode];
674 
675 				pi = (pcc_info_t *)alloc_base;
676 				alloc_base += sizeof (pcc_info_t) * nranges;
677 				page_ctrs_cands[i][r][mnode] = pi;
678 
679 				for (mrange = 0; mrange < nranges; mrange++) {
680 					pi->pcc_color_free =
681 					    (pgcnt_t *)alloc_base;
682 					alloc_base += sizeof (pgcnt_t) *
683 					    colors_per_szc[r];
684 					pi++;
685 				}
686 			}
687 		}
688 	}
689 
690 	/* ctr_mutex */
691 	for (i = 0; i < NPC_MUTEX; i++) {
692 		ctr_mutex[i] = (kmutex_t *)alloc_base;
693 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
694 	}
695 
696 	/* initialize page list counts */
697 	PLCNT_INIT(alloc_base);
698 
699 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
700 
701 		pgcnt_t r_pgcnt;
702 		pfn_t	r_base;
703 		pgcnt_t r_align;
704 		int	r_shift;
705 		int	nranges = mnode_nranges[mnode];
706 
707 		if (mem_node_config[mnode].exists == 0)
708 			continue;
709 
710 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
711 
712 		for (r = 1; r < mmu_page_sizes; r++) {
713 			/*
714 			 * the page_counters base has to be aligned to the
715 			 * page count of page size code r; otherwise the counts
716 			 * will cross large page boundaries.
717 			 */
718 			r_align = page_get_pagecnt(r);
719 			r_base = physbase;
720 			/* base needs to be aligned - lower to aligned value */
721 			r_base &= ~(r_align - 1);
722 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
723 			r_shift = PAGE_BSZS_SHIFT(r);
724 
725 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
726 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
727 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
728 			for (mrange = 0; mrange < nranges; mrange++) {
729 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
730 				    r, mrange) = (size_t *)alloc_base;
731 				alloc_base += sizeof (size_t) *
732 				    colors_per_szc[r];
733 			}
734 			for (i = 0; i < colors_per_szc[r]; i++) {
735 				uint_t color_mask = colors_per_szc[r] - 1;
736 				pfn_t  pfnum = r_base;
737 				size_t idx;
738 				int mrange;
739 				MEM_NODE_ITERATOR_DECL(it);
740 
741 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
742 				if (pfnum == (pfn_t)-1) {
743 					idx = 0;
744 				} else {
745 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
746 					    color_mask, color_mask, &it);
747 					idx = PNUM_TO_IDX(mnode, r, pfnum);
748 					idx = (idx >= r_pgcnt) ? 0 : idx;
749 				}
750 				for (mrange = 0; mrange < nranges; mrange++) {
751 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
752 					    r, i, mrange) = idx;
753 				}
754 			}
755 
756 			/* hpm_counters may be shared by all mnodes */
757 			if (firstmn == mnode) {
758 				PAGE_COUNTERS_COUNTERS(mnode, r) =
759 				    (hpmctr_t *)alloc_base;
760 				alloc_base +=
761 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
762 				    sizeof (hpmctr_t *));
763 			} else {
764 				PAGE_COUNTERS_COUNTERS(mnode, r) =
765 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
766 			}
767 
768 			/*
769 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
770 			 * satisfy the identity requirement.
771 			 * We should be able to go from one to the other
772 			 * and get consistent values.
773 			 */
774 			ASSERT(PNUM_TO_IDX(mnode, r,
775 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
776 			ASSERT(IDX_TO_PNUM(mnode, r,
777 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
778 		}
779 		/*
780 		 * Round up the start address of the page_counters to a
781 		 * cache-aligned boundary for every memory node.
782 		 * page_ctrs_sz() has added some slop for these roundups.
783 		 */
784 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
785 		    L2CACHE_ALIGN);
786 	}
787 
788 	/* Initialize other page counter specific data structures. */
789 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
790 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
791 	}
792 
793 	return (alloc_base);
794 }
795 
796 /*
797  * Functions to adjust region counters for each size free list.
798  * The caller is responsible for acquiring the ctr_mutex lock if necessary;
799  * thus these routines can be called during startup without locks.
800  */
801 /* ARGSUSED */
802 void
803 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
804 {
805 	ssize_t		r;	/* region size */
806 	ssize_t		idx;
807 	pfn_t		pfnum;
808 	int		lckidx;
809 
810 	ASSERT(mnode == PP_2_MEM_NODE(pp));
811 	ASSERT(mtype == PP_2_MTYPE(pp));
812 
813 	ASSERT(pp->p_szc < mmu_page_sizes);
814 
815 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
816 
817 	/* no counter update needed for largest page size */
818 	if (pp->p_szc >= mmu_page_sizes - 1) {
819 		return;
820 	}
821 
822 	r = pp->p_szc + 1;
823 	pfnum = pp->p_pagenum;
824 	lckidx = PP_CTR_LOCK_INDX(pp);
825 
826 	/*
827 	 * Increment the count of free pages for the current
828 	 * region. Continue looping up in region size incrementing
829 	 * count if the preceding region is full.
830 	 */
831 	while (r < mmu_page_sizes) {
832 		idx = PNUM_TO_IDX(mnode, r, pfnum);
833 
834 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
835 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
836 
837 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
838 			break;
839 		} else {
840 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
841 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
842 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
843 
844 			cand->pcc_pages_free++;
845 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
846 		}
847 		r++;
848 	}
849 }
850 
851 void
852 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
853 {
854 	int		lckidx = PP_CTR_LOCK_INDX(pp);
855 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
856 
857 	mutex_enter(lock);
858 	page_ctr_add_internal(mnode, mtype, pp, flags);
859 	mutex_exit(lock);
860 }
861 
862 void
863 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
864 {
865 	int		lckidx;
866 	ssize_t		r;	/* region size */
867 	ssize_t		idx;
868 	pfn_t		pfnum;
869 
870 	ASSERT(mnode == PP_2_MEM_NODE(pp));
871 	ASSERT(mtype == PP_2_MTYPE(pp));
872 
873 	ASSERT(pp->p_szc < mmu_page_sizes);
874 
875 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
876 
877 	/* no counter update needed for largest page size */
878 	if (pp->p_szc >= mmu_page_sizes - 1) {
879 		return;
880 	}
881 
882 	r = pp->p_szc + 1;
883 	pfnum = pp->p_pagenum;
884 	lckidx = PP_CTR_LOCK_INDX(pp);
885 
886 	/*
887 	 * Decrement the count of free pages for the current
888 	 * region. Continue looping up in region size decrementing
889 	 * count if the preceding region was full.
890 	 */
891 	while (r < mmu_page_sizes) {
892 		idx = PNUM_TO_IDX(mnode, r, pfnum);
893 
894 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
895 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
896 
897 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
898 			break;
899 		} else {
900 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
901 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
902 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
903 
904 			ASSERT(cand->pcc_pages_free != 0);
905 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
906 
907 			cand->pcc_pages_free--;
908 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
909 		}
910 		r++;
911 	}
912 }
913 
914 void
915 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
916 {
917 	int		lckidx = PP_CTR_LOCK_INDX(pp);
918 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
919 
920 	mutex_enter(lock);
921 	page_ctr_sub_internal(mnode, mtype, pp, flags);
922 	mutex_exit(lock);
923 }
924 
925 /*
926  * Adjust page counters following a memory attach, since typically the
927  * size of the array needs to change, and the PFN to counter index
928  * mapping needs to change.
929  *
930  * It is possible this mnode did not exist at startup. In that case
931  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
932  * to change (a theoretical possibility on x86), which means pcc_color_free
933  * arrays must be extended.
934  */
935 uint_t
936 page_ctrs_adjust(int mnode)
937 {
938 	pgcnt_t npgs;
939 	int	r;		/* region size */
940 	int	i;
941 	size_t	pcsz, old_csz;
942 	hpmctr_t *new_ctr, *old_ctr;
943 	pfn_t	oldbase, newbase;
944 	pfn_t	physbase, physmax;
945 	size_t	old_npgs;
946 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
947 	size_t	size_cache[MMU_PAGE_SIZES];
948 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
949 	size_t	*old_color_array[MAX_MNODE_MRANGES];
950 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
951 	pcc_info_t **cands_cache;
952 	pcc_info_t *old_pi, *pi;
953 	pgcnt_t *pgcntp;
954 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
955 	int cands_cache_nranges;
956 	int old_maxmrange, new_maxmrange;
957 	int rc = 0;
958 	int oldmnode;
959 
960 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
961 	    MMU_PAGE_SIZES, KM_NOSLEEP);
962 	if (cands_cache == NULL)
963 		return (ENOMEM);
964 
965 	i = -1;
966 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
967 
968 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
969 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
970 
971 	/* prepare to free non-null pointers on the way out */
972 	cands_cache_nranges = nranges;
973 	bzero(ctr_cache, sizeof (ctr_cache));
974 	bzero(color_cache, sizeof (color_cache));
975 
976 	/*
977 	 * We need to determine how many page colors there are for each
978 	 * page size in order to allocate memory for any color specific
979 	 * arrays.
980 	 */
981 	for (r = 0; r < mmu_page_sizes; r++) {
982 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
983 	}
984 
985 	/*
986 	 * Preallocate all of the new hpm_counters arrays as we can't
987 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
988 	 * If we can't allocate all of the arrays, undo our work so far
989 	 * and return failure.
990 	 */
991 	for (r = 1; r < mmu_page_sizes; r++) {
992 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
993 		size_cache[r] = pcsz;
994 		ctr_cache[r] = kmem_zalloc(pcsz *
995 		    sizeof (hpmctr_t), KM_NOSLEEP);
996 		if (ctr_cache[r] == NULL) {
997 			rc = ENOMEM;
998 			goto cleanup;
999 		}
1000 	}
1001 
1002 	/*
1003 	 * Preallocate all of the new color current arrays as we can't
1004 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1005 	 * If we can't allocate all of the arrays, undo our work so far
1006 	 * and return failure.
1007 	 */
1008 	for (r = 1; r < mmu_page_sizes; r++) {
1009 		for (mrange = 0; mrange < nranges; mrange++) {
1010 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1011 			    colors_per_szc[r], KM_NOSLEEP);
1012 			if (color_cache[r][mrange] == NULL) {
1013 				rc = ENOMEM;
1014 				goto cleanup;
1015 			}
1016 		}
1017 	}
1018 
1019 	/*
1020 	 * Preallocate all of the new pcc_info_t arrays as we can't
1021 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1022 	 * If we can't allocate all of the arrays, undo our work so far
1023 	 * and return failure.
1024 	 */
1025 	for (r = 1; r < mmu_page_sizes; r++) {
1026 		for (i = 0; i < NPC_MUTEX; i++) {
1027 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1028 			    KM_NOSLEEP);
1029 			if (pi == NULL) {
1030 				rc = ENOMEM;
1031 				goto cleanup;
1032 			}
1033 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1034 
1035 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
1036 				pgcntp = kmem_zalloc(colors_per_szc[r] *
1037 				    sizeof (pgcnt_t), KM_NOSLEEP);
1038 				if (pgcntp == NULL) {
1039 					rc = ENOMEM;
1040 					goto cleanup;
1041 				}
1042 				pi->pcc_color_free = pgcntp;
1043 			}
1044 		}
1045 	}
1046 
1047 	/*
1048 	 * Grab the write lock to prevent others from walking these arrays
1049 	 * while we are modifying them.
1050 	 */
1051 	PAGE_CTRS_WRITE_LOCK(mnode);
1052 
1053 	/*
1054 	 * For interleaved mnodes, find the first mnode
1055 	 * with valid page counters since the current
1056 	 * mnode may have just been added and not have
1057 	 * valid page counters.
1058 	 */
1059 	if (interleaved_mnodes) {
1060 		for (i = 0; i < max_mem_nodes; i++)
1061 			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1062 				break;
1063 		ASSERT(i < max_mem_nodes);
1064 		oldmnode = i;
1065 	} else
1066 		oldmnode = mnode;
1067 
1068 	old_nranges = mnode_nranges[mnode];
1069 	cands_cache_nranges = old_nranges;
1070 	mnode_nranges[mnode] = nranges;
1071 	old_maxmrange = mnode_maxmrange[mnode];
1072 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1073 	new_maxmrange = mnode_maxmrange[mnode];
1074 
1075 	for (r = 1; r < mmu_page_sizes; r++) {
1076 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1077 		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1078 		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1079 		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1080 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1081 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1082 			old_color_array[mrange] =
1083 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1084 			    r, mrange);
1085 		}
1086 
1087 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1088 		new_ctr = ctr_cache[r];
1089 		ctr_cache[r] = NULL;
1090 		if (old_ctr != NULL &&
1091 		    (oldbase + old_npgs > newbase) &&
1092 		    (newbase + npgs > oldbase)) {
1093 			/*
1094 			 * Map the intersection of the old and new
1095 			 * counters into the new array.
1096 			 */
1097 			size_t offset;
1098 			if (newbase > oldbase) {
1099 				offset = (newbase - oldbase) >>
1100 				    PAGE_COUNTERS_SHIFT(mnode, r);
1101 				bcopy(old_ctr + offset, new_ctr,
1102 				    MIN(pcsz, (old_csz - offset)) *
1103 				    sizeof (hpmctr_t));
1104 			} else {
1105 				offset = (oldbase - newbase) >>
1106 				    PAGE_COUNTERS_SHIFT(mnode, r);
1107 				bcopy(old_ctr, new_ctr + offset,
1108 				    MIN(pcsz - offset, old_csz) *
1109 				    sizeof (hpmctr_t));
1110 			}
1111 		}
1112 
1113 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1114 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1115 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1116 
1117 		/* update shared hpm_counters in other mnodes */
1118 		if (interleaved_mnodes) {
1119 			for (i = 0; i < max_mem_nodes; i++) {
1120 				if (i == mnode)
1121 					continue;
1122 				ASSERT(
1123 				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1124 				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1125 				if (mem_node_config[i].exists == 0)
1126 					continue;
1127 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1128 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1129 				PAGE_COUNTERS_BASE(i, r) = newbase;
1130 			}
1131 		}
1132 
1133 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1134 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1135 			    color_cache[r][mrange];
1136 			color_cache[r][mrange] = NULL;
1137 		}
1138 		/*
1139 		 * for now, just reset on these events as it's probably
1140 		 * not worthwhile to try to optimize this.
1141 		 */
1142 		for (i = 0; i < colors_per_szc[r]; i++) {
1143 			uint_t color_mask = colors_per_szc[r] - 1;
1144 			int mlo = interleaved_mnodes ? 0 : mnode;
1145 			int mhi = interleaved_mnodes ? max_mem_nodes :
1146 			    (mnode + 1);
1147 			int m;
1148 			pfn_t  pfnum;
1149 			size_t idx;
1150 			MEM_NODE_ITERATOR_DECL(it);
1151 
1152 			for (m = mlo; m < mhi; m++) {
1153 				if (mem_node_config[m].exists == 0)
1154 					continue;
1155 				pfnum = newbase;
1156 				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1157 				if (pfnum == (pfn_t)-1) {
1158 					idx = 0;
1159 				} else {
1160 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1161 					    color_mask, color_mask, &it);
1162 					idx = PNUM_TO_IDX(m, r, pfnum);
1163 					idx = (idx < pcsz) ? idx : 0;
1164 				}
1165 				for (mrange = 0; mrange < nranges; mrange++) {
1166 					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1167 					    r, mrange) != NULL)
1168 						PAGE_COUNTERS_CURRENT_COLOR(m,
1169 						    r, i, mrange) = idx;
1170 				}
1171 			}
1172 		}
1173 
1174 		/* cache info for freeing out of the critical path */
1175 		if ((caddr_t)old_ctr >= kernelheap &&
1176 		    (caddr_t)old_ctr < ekernelheap) {
1177 			ctr_cache[r] = old_ctr;
1178 			size_cache[r] = old_csz;
1179 		}
1180 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1181 			size_t *tmp = old_color_array[mrange];
1182 			if ((caddr_t)tmp >= kernelheap &&
1183 			    (caddr_t)tmp < ekernelheap) {
1184 				color_cache[r][mrange] = tmp;
1185 			}
1186 		}
1187 		/*
1188 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1189 		 * satisfy the identity requirement.
1190 		 * We should be able to go from one to the other
1191 		 * and get consistent values.
1192 		 */
1193 		ASSERT(PNUM_TO_IDX(mnode, r,
1194 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
1195 		ASSERT(IDX_TO_PNUM(mnode, r,
1196 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1197 
1198 		/* pcc_info_t and pcc_color_free */
1199 		for (i = 0; i < NPC_MUTEX; i++) {
1200 			pcc_info_t *epi;
1201 			pcc_info_t *eold_pi;
1202 
1203 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1204 			old_pi = page_ctrs_cands[i][r][mnode];
1205 			page_ctrs_cands[i][r][mnode] = pi;
1206 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1207 
1208 			/* preserve old pcc_color_free values, if any */
1209 			if (old_pi == NULL)
1210 				continue;
1211 
1212 			/*
1213 			 * when/if x86 does DR, must account for
1214 			 * possible change in range index when
1215 			 * preserving pcc_info
1216 			 */
1217 			epi = &pi[nranges];
1218 			eold_pi = &old_pi[old_nranges];
1219 			if (new_maxmrange > old_maxmrange) {
1220 				pi += new_maxmrange - old_maxmrange;
1221 			} else if (new_maxmrange < old_maxmrange) {
1222 				old_pi += old_maxmrange - new_maxmrange;
1223 			}
1224 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1225 				pcc_info_t tmp = *pi;
1226 				*pi = *old_pi;
1227 				*old_pi = tmp;
1228 			}
1229 		}
1230 	}
1231 	PAGE_CTRS_WRITE_UNLOCK(mnode);
1232 
1233 	/*
1234 	 * Now that we have dropped the write lock, it is safe to free all
1235 	 * of the memory we have cached above.
1236 	 * We come through here to free memory when pre-alloc fails, and also to
1237 	 * free old pointers which were recorded while locked.
1238 	 */
1239 cleanup:
1240 	for (r = 1; r < mmu_page_sizes; r++) {
1241 		if (ctr_cache[r] != NULL) {
1242 			kmem_free(ctr_cache[r],
1243 			    size_cache[r] * sizeof (hpmctr_t));
1244 		}
1245 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1246 			if (color_cache[r][mrange] != NULL) {
1247 				kmem_free(color_cache[r][mrange],
1248 				    colors_per_szc[r] * sizeof (size_t));
1249 			}
1250 		}
1251 		for (i = 0; i < NPC_MUTEX; i++) {
1252 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1253 			if (pi == NULL)
1254 				continue;
1255 			nr = cands_cache_nranges;
1256 			for (mrange = 0; mrange < nr; mrange++, pi++) {
1257 				pgcntp = pi->pcc_color_free;
1258 				if (pgcntp == NULL)
1259 					continue;
1260 				if ((caddr_t)pgcntp >= kernelheap &&
1261 				    (caddr_t)pgcntp < ekernelheap) {
1262 					kmem_free(pgcntp,
1263 					    colors_per_szc[r] *
1264 					    sizeof (pgcnt_t));
1265 				}
1266 			}
1267 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1268 			if ((caddr_t)pi >= kernelheap &&
1269 			    (caddr_t)pi < ekernelheap) {
1270 				kmem_free(pi, nr * sizeof (pcc_info_t));
1271 			}
1272 		}
1273 	}
1274 
1275 	kmem_free(cands_cache,
1276 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1277 	return (rc);
1278 }
1279 
1280 
1281 #ifdef DEBUG
1282 
1283 /*
1284  * confirm pp is a large page corresponding to szc
1285  */
1286 void
1287 chk_lpg(page_t *pp, uchar_t szc)
1288 {
1289 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1290 	uint_t noreloc;
1291 
1292 	if (npgs == 1) {
1293 		ASSERT(pp->p_szc == 0);
1294 		ASSERT(pp->p_next == pp);
1295 		ASSERT(pp->p_prev == pp);
1296 		return;
1297 	}
1298 
1299 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1300 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1301 
1302 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1303 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1304 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1305 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1306 
1307 	/*
1308 	 * Check list of pages.
1309 	 */
1310 	noreloc = PP_ISNORELOC(pp);
1311 	while (npgs--) {
1312 		if (npgs != 0) {
1313 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1314 			ASSERT(pp->p_next == (pp + 1));
1315 		}
1316 		ASSERT(pp->p_szc == szc);
1317 		ASSERT(PP_ISFREE(pp));
1318 		ASSERT(PP_ISAGED(pp));
1319 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1320 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1321 		ASSERT(pp->p_vnode  == NULL);
1322 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1323 
1324 		pp = pp->p_next;
1325 	}
1326 }
1327 #endif /* DEBUG */
1328 
1329 void
1330 page_freelist_lock(int mnode)
1331 {
1332 	int i;
1333 	for (i = 0; i < NPC_MUTEX; i++) {
1334 		mutex_enter(FPC_MUTEX(mnode, i));
1335 		mutex_enter(CPC_MUTEX(mnode, i));
1336 	}
1337 }
1338 
1339 void
1340 page_freelist_unlock(int mnode)
1341 {
1342 	int i;
1343 	for (i = 0; i < NPC_MUTEX; i++) {
1344 		mutex_exit(FPC_MUTEX(mnode, i));
1345 		mutex_exit(CPC_MUTEX(mnode, i));
1346 	}
1347 }
1348 
1349 /*
1350  * add pp to the specified page list. Defaults to head of the page list
1351  * unless PG_LIST_TAIL is specified.
1352  */
1353 void
1354 page_list_add(page_t *pp, int flags)
1355 {
1356 	page_t		**ppp;
1357 	kmutex_t	*pcm;
1358 	uint_t		bin, mtype;
1359 	int		mnode;
1360 
1361 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1362 	ASSERT(PP_ISFREE(pp));
1363 	ASSERT(!hat_page_is_mapped(pp));
1364 	ASSERT(hat_page_getshare(pp) == 0);
1365 
1366 	/*
1367 	 * Large pages should be freed via page_list_add_pages().
1368 	 */
1369 	ASSERT(pp->p_szc == 0);
1370 
1371 	/*
1372 	 * Don't need to lock the freelist first here
1373 	 * because the page isn't on the freelist yet.
1374 	 * This means p_szc can't change on us.
1375 	 */
1376 
1377 	bin = PP_2_BIN(pp);
1378 	mnode = PP_2_MEM_NODE(pp);
1379 	mtype = PP_2_MTYPE(pp);
1380 
1381 	if (flags & PG_LIST_ISINIT) {
1382 		/*
1383 		 * PG_LIST_ISINIT is set during system startup (i.e. single
1384 		 * threaded), so add the page to the free list and to the
1385 		 * free region counters without any locking.
1386 		 */
1387 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1388 
1389 		/* inline version of page_add() */
1390 		if (*ppp != NULL) {
1391 			pp->p_next = *ppp;
1392 			pp->p_prev = (*ppp)->p_prev;
1393 			(*ppp)->p_prev = pp;
1394 			pp->p_prev->p_next = pp;
1395 		} else
1396 			*ppp = pp;
1397 
1398 		page_ctr_add_internal(mnode, mtype, pp, flags);
1399 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1400 	} else {
1401 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1402 
1403 		if (flags & PG_FREE_LIST) {
1404 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1405 			ASSERT(PP_ISAGED(pp));
1406 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1407 
1408 		} else {
1409 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1410 			ASSERT(pp->p_vnode);
1411 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1412 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1413 		}
1414 		mutex_enter(pcm);
1415 		page_add(ppp, pp);
1416 
1417 		if (flags & PG_LIST_TAIL)
1418 			*ppp = (*ppp)->p_next;
1419 		/*
1420 		 * Add counters before releasing pcm mutex to avoid a race with
1421 		 * page_freelist_coalesce and page_freelist_split.
1422 		 */
1423 		page_ctr_add(mnode, mtype, pp, flags);
1424 		mutex_exit(pcm);
1425 	}
1426 
1427 
1428 #if defined(__sparc)
1429 	if (PP_ISNORELOC(pp)) {
1430 		kcage_freemem_add(1);
1431 	}
1432 #endif
1433 	/*
1434 	 * It is up to the caller to unlock the page!
1435 	 */
1436 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1437 }
1438 
1439 
1440 #ifdef __sparc
1441 /*
1442  * This routine is only used by kcage_init during system startup.
1443  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1444  * without the overhead of taking locks and updating counters.
1445  */
1446 void
1447 page_list_noreloc_startup(page_t *pp)
1448 {
1449 	page_t		**ppp;
1450 	uint_t		bin;
1451 	int		mnode;
1452 	int		mtype;
1453 	int		flags = 0;
1454 
1455 	/*
1456 	 * If this is a large page on the freelist then
1457 	 * break it up into smaller pages.
1458 	 */
1459 	if (pp->p_szc != 0)
1460 		page_boot_demote(pp);
1461 
1462 	/*
1463 	 * Get the list the page is currently on.
1464 	 */
1465 	bin = PP_2_BIN(pp);
1466 	mnode = PP_2_MEM_NODE(pp);
1467 	mtype = PP_2_MTYPE(pp);
1468 	ASSERT(mtype == MTYPE_RELOC);
1469 	ASSERT(pp->p_szc == 0);
1470 
1471 	if (PP_ISAGED(pp)) {
1472 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1473 		flags |= PG_FREE_LIST;
1474 	} else {
1475 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1476 		flags |= PG_CACHE_LIST;
1477 	}
1478 
1479 	ASSERT(*ppp != NULL);
1480 
1481 	/*
1482 	 * Delete page from current list.
1483 	 */
1484 	if (*ppp == pp)
1485 		*ppp = pp->p_next;		/* go to next page */
1486 	if (*ppp == pp) {
1487 		*ppp = NULL;			/* page list is gone */
1488 	} else {
1489 		pp->p_prev->p_next = pp->p_next;
1490 		pp->p_next->p_prev = pp->p_prev;
1491 	}
1492 
1493 	/*
1494 	 * Decrement page counters
1495 	 */
1496 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1497 
1498 	/*
1499 	 * Set no reloc for cage initted pages.
1500 	 */
1501 	PP_SETNORELOC(pp);
1502 
1503 	mtype = PP_2_MTYPE(pp);
1504 	ASSERT(mtype == MTYPE_NORELOC);
1505 
1506 	/*
1507 	 * Get new list for page.
1508 	 */
1509 	if (PP_ISAGED(pp)) {
1510 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1511 	} else {
1512 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1513 	}
1514 
1515 	/*
1516 	 * Insert page on new list.
1517 	 */
1518 	if (*ppp == NULL) {
1519 		*ppp = pp;
1520 		pp->p_next = pp->p_prev = pp;
1521 	} else {
1522 		pp->p_next = *ppp;
1523 		pp->p_prev = (*ppp)->p_prev;
1524 		(*ppp)->p_prev = pp;
1525 		pp->p_prev->p_next = pp;
1526 	}
1527 
1528 	/*
1529 	 * Increment page counters
1530 	 */
1531 	page_ctr_add_internal(mnode, mtype, pp, flags);
1532 
1533 	/*
1534 	 * Update cage freemem counter
1535 	 */
1536 	atomic_add_long(&kcage_freemem, 1);
1537 }
1538 #else	/* __sparc */
1539 
1540 /* ARGSUSED */
1541 void
1542 page_list_noreloc_startup(page_t *pp)
1543 {
1544 	panic("page_list_noreloc_startup: should be here only for sparc");
1545 }
1546 #endif
1547 
1548 void
1549 page_list_add_pages(page_t *pp, int flags)
1550 {
1551 	kmutex_t *pcm;
1552 	pgcnt_t	pgcnt;
1553 	uint_t	bin, mtype, i;
1554 	int	mnode;
1555 
1556 	/* default to freelist/head */
1557 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1558 
1559 	CHK_LPG(pp, pp->p_szc);
1560 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1561 
1562 	bin = PP_2_BIN(pp);
1563 	mnode = PP_2_MEM_NODE(pp);
1564 	mtype = PP_2_MTYPE(pp);
1565 
1566 	if (flags & PG_LIST_ISINIT) {
1567 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1568 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1569 		ASSERT(!PP_ISNORELOC(pp));
1570 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1571 	} else {
1572 
1573 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1574 
1575 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1576 
1577 		mutex_enter(pcm);
1578 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1579 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1580 		mutex_exit(pcm);
1581 
1582 		pgcnt = page_get_pagecnt(pp->p_szc);
1583 #if defined(__sparc)
1584 		if (PP_ISNORELOC(pp))
1585 			kcage_freemem_add(pgcnt);
1586 #endif
1587 		for (i = 0; i < pgcnt; i++, pp++)
1588 			page_unlock_nocapture(pp);
1589 	}
1590 }
1591 
1592 /*
1593  * During boot, we need to demote a large page to base
1594  * pagesize pages for seg_kmem, for use in boot_alloc().
1595  */
1596 void
1597 page_boot_demote(page_t *pp)
1598 {
1599 	ASSERT(pp->p_szc != 0);
1600 	ASSERT(PP_ISFREE(pp));
1601 	ASSERT(PP_ISAGED(pp));
1602 
1603 	(void) page_demote(PP_2_MEM_NODE(pp),
1604 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1605 	    PC_FREE);
1606 
1607 	ASSERT(PP_ISFREE(pp));
1608 	ASSERT(PP_ISAGED(pp));
1609 	ASSERT(pp->p_szc == 0);
1610 }
1611 
1612 /*
1613  * Take a particular page off of whatever freelist the page
1614  * is claimed to be on.
1615  *
1616  * NOTE: Only used for PAGESIZE pages.
1617  */
1618 void
1619 page_list_sub(page_t *pp, int flags)
1620 {
1621 	int		bin;
1622 	uint_t		mtype;
1623 	int		mnode;
1624 	kmutex_t	*pcm;
1625 	page_t		**ppp;
1626 
1627 	ASSERT(PAGE_EXCL(pp));
1628 	ASSERT(PP_ISFREE(pp));
1629 
1630 	/*
1631 	 * The p_szc field can only be changed by page_promote()
1632 	 * and page_demote(). Only free pages can be promoted and
1633 	 * demoted and the free list MUST be locked during these
1634 	 * operations. So to prevent a race in page_list_sub()
1635 	 * between computing which bin of the freelist lock to
1636 	 * grab and actually grabbing the lock, we check again that
1637 	 * the bin we locked is still the correct one. Notice that
1638 	 * the p_szc field could have actually changed on us but
1639 	 * if the bin happens to still be the same we are safe.
1640 	 */
1641 try_again:
1642 	bin = PP_2_BIN(pp);
1643 	mnode = PP_2_MEM_NODE(pp);
1644 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1645 	mutex_enter(pcm);
1646 	if (PP_2_BIN(pp) != bin) {
1647 		mutex_exit(pcm);
1648 		goto try_again;
1649 	}
1650 	mtype = PP_2_MTYPE(pp);
1651 
1652 	if (flags & PG_FREE_LIST) {
1653 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1654 		ASSERT(PP_ISAGED(pp));
1655 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1656 	} else {
1657 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1658 		ASSERT(!PP_ISAGED(pp));
1659 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1660 	}
1661 
1662 	/*
1663 	 * Common PAGESIZE case.
1664 	 *
1665 	 * Note that we locked the freelist. This prevents
1666 	 * any page promotion/demotion operations. Therefore
1667 	 * the p_szc will not change until we drop pcm mutex.
1668 	 */
1669 	if (pp->p_szc == 0) {
1670 		page_sub(ppp, pp);
1671 		/*
1672 		 * Subtract counters before releasing pcm mutex
1673 		 * to avoid race with page_freelist_coalesce.
1674 		 */
1675 		page_ctr_sub(mnode, mtype, pp, flags);
1676 		mutex_exit(pcm);
1677 
1678 #if defined(__sparc)
1679 		if (PP_ISNORELOC(pp)) {
1680 			kcage_freemem_sub(1);
1681 		}
1682 #endif
1683 		return;
1684 	}
1685 
1686 	/*
1687 	 * Large pages on the cache list are not supported.
1688 	 */
1689 	if (flags & PG_CACHE_LIST)
1690 		panic("page_list_sub: large page on cachelist");
1691 
1692 	/*
1693 	 * Slow but rare.
1694 	 *
1695 	 * Somebody wants this particular page which is part
1696 	 * of a large page. In this case we just demote the page
1697 	 * if it's on the freelist.
1698 	 *
1699 	 * We have to drop pcm before locking the entire freelist.
1700 	 * Once we have re-locked the freelist, check to make sure
1701 	 * the page hasn't already been demoted or completely
1702 	 * freed.
1703 	 */
1704 	mutex_exit(pcm);
1705 	page_freelist_lock(mnode);
1706 	if (pp->p_szc != 0) {
1707 		/*
1708 		 * Large page is on freelist.
1709 		 */
1710 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1711 		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1712 	}
1713 	ASSERT(PP_ISFREE(pp));
1714 	ASSERT(PP_ISAGED(pp));
1715 	ASSERT(pp->p_szc == 0);
1716 
1717 	/*
1718 	 * Subtract counters before releasing pcm mutex
1719 	 * to avoid race with page_freelist_coalesce.
1720 	 */
1721 	bin = PP_2_BIN(pp);
1722 	mtype = PP_2_MTYPE(pp);
1723 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1724 
1725 	page_sub(ppp, pp);
1726 	page_ctr_sub(mnode, mtype, pp, flags);
1727 	page_freelist_unlock(mnode);
1728 
1729 #if defined(__sparc)
1730 	if (PP_ISNORELOC(pp)) {
1731 		kcage_freemem_sub(1);
1732 	}
1733 #endif
1734 }
1735 
1736 void
1737 page_list_sub_pages(page_t *pp, uint_t szc)
1738 {
1739 	kmutex_t *pcm;
1740 	uint_t	bin, mtype;
1741 	int	mnode;
1742 
1743 	ASSERT(PAGE_EXCL(pp));
1744 	ASSERT(PP_ISFREE(pp));
1745 	ASSERT(PP_ISAGED(pp));
1746 
1747 	/*
1748 	 * See comment in page_list_sub().
1749 	 */
1750 try_again:
1751 	bin = PP_2_BIN(pp);
1752 	mnode = PP_2_MEM_NODE(pp);
1753 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1754 	mutex_enter(pcm);
1755 	if (PP_2_BIN(pp) != bin) {
1756 		mutex_exit(pcm);
1757 		goto	try_again;
1758 	}
1759 
1760 	/*
1761 	 * If we're called with a page larger than szc or it got
1762 	 * promoted above szc before we locked the freelist, then
1763 	 * drop pcm and re-lock the entire freelist. If the page is still
1764 	 * larger than szc, demote it.
1765 	 */
1766 	if (pp->p_szc > szc) {
1767 		mutex_exit(pcm);
1768 		pcm = NULL;
1769 		page_freelist_lock(mnode);
1770 		if (pp->p_szc > szc) {
1771 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1772 			(void) page_demote(mnode,
1773 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1774 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1775 		}
1776 		bin = PP_2_BIN(pp);
1777 	}
1778 	ASSERT(PP_ISFREE(pp));
1779 	ASSERT(PP_ISAGED(pp));
1780 	ASSERT(pp->p_szc <= szc);
1781 	ASSERT(pp == PP_PAGEROOT(pp));
1782 
1783 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1784 
1785 	mtype = PP_2_MTYPE(pp);
1786 	if (pp->p_szc != 0) {
1787 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1788 		CHK_LPG(pp, pp->p_szc);
1789 	} else {
1790 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1791 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1792 	}
1793 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1794 
1795 	if (pcm != NULL) {
1796 		mutex_exit(pcm);
1797 	} else {
1798 		page_freelist_unlock(mnode);
1799 	}
1800 
1801 #if defined(__sparc)
1802 	if (PP_ISNORELOC(pp)) {
1803 		pgcnt_t	pgcnt;
1804 
1805 		pgcnt = page_get_pagecnt(pp->p_szc);
1806 		kcage_freemem_sub(pgcnt);
1807 	}
1808 #endif
1809 }
1810 
1811 /*
1812  * Add the page to the front of a linked list of pages
1813  * using the p_next & p_prev pointers for the list.
1814  * The caller is responsible for protecting the list pointers.
1815  */
1816 void
1817 mach_page_add(page_t **ppp, page_t *pp)
1818 {
1819 	if (*ppp == NULL) {
1820 		pp->p_next = pp->p_prev = pp;
1821 	} else {
1822 		pp->p_next = *ppp;
1823 		pp->p_prev = (*ppp)->p_prev;
1824 		(*ppp)->p_prev = pp;
1825 		pp->p_prev->p_next = pp;
1826 	}
1827 	*ppp = pp;
1828 }
1829 
1830 /*
1831  * Remove this page from a linked list of pages
1832  * using the p_next & p_prev pointers for the list.
1833  *
1834  * The caller is responsible for protecting the list pointers.
1835  */
1836 void
1837 mach_page_sub(page_t **ppp, page_t *pp)
1838 {
1839 	ASSERT(PP_ISFREE(pp));
1840 
1841 	if (*ppp == NULL || pp == NULL)
1842 		panic("mach_page_sub");
1843 
1844 	if (*ppp == pp)
1845 		*ppp = pp->p_next;		/* go to next page */
1846 
1847 	if (*ppp == pp)
1848 		*ppp = NULL;			/* page list is gone */
1849 	else {
1850 		pp->p_prev->p_next = pp->p_next;
1851 		pp->p_next->p_prev = pp->p_prev;
1852 	}
1853 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1854 }
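
/*
 * Illustrative sketch (not part of the original source): how the two
 * routines above maintain a circular, doubly linked list headed by a
 * single page_t pointer.  An empty list is a NULL head, and the head
 * always points at the most recently added page.  The pages named here
 * are hypothetical and, as mach_page_sub() asserts, must be marked free:
 *
 *	page_t *head = NULL;
 *
 *	mach_page_add(&head, pga);	(head -> pga, a list of one)
 *	mach_page_add(&head, pgb);	(head -> pgb <-> pga, circular)
 *
 *	ASSERT(head == pgb && head->p_next == pga && head->p_prev == pga);
 *
 *	mach_page_sub(&head, pgb);	(head -> pga again)
 *	mach_page_sub(&head, pga);	(head == NULL, the list is empty)
 *
 * The real callers below do the same thing against a bin head such as
 * &PAGE_FREELISTS(mnode, 0, bin, mtype) while holding the appropriate
 * freelist locks.
 */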
1855 
1856 /*
1857  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1858  */
1859 void
1860 page_promote_size(page_t *pp, uint_t cur_szc)
1861 {
1862 	pfn_t pfn;
1863 	int mnode;
1864 	int idx;
1865 	int new_szc = cur_szc + 1;
1866 	int full = FULL_REGION_CNT(new_szc);
1867 
1868 	pfn = page_pptonum(pp);
1869 	mnode = PFN_2_MEM_NODE(pfn);
1870 
1871 	page_freelist_lock(mnode);
1872 
1873 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1874 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1875 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1876 
1877 	page_freelist_unlock(mnode);
1878 }
1879 
1880 static uint_t page_promote_err;
1881 static uint_t page_promote_noreloc_err;
1882 
1883 /*
1884  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1885  * for the given mnode starting at pfnum. Pages involved are on the freelist
1886  * before the call and may be returned to the caller if requested, otherwise
1887  * they will be placed back on the freelist.
1888  * If flags is PC_ALLOC, then the large page will be returned to the user in
1889  * a state which is consistent with a page being taken off the freelist.  If
1890  * we failed to lock the new large page, then we will return NULL to the
1891  * caller and put the large page on the freelist instead.
1892  * If flags is PC_FREE, then the large page will be placed on the freelist,
1893  * and NULL will be returned.
1894  * The caller is responsible for locking the freelist as well as any other
1895  * accounting which needs to be done for a returned page.
1896  *
1897  * RFE: For performance pass in pp instead of pfnum so
1898  * 	we can avoid excessive calls to page_numtopp_nolock().
1899  *	This would depend on an assumption that all contiguous
1900  *	pages are in the same memseg so we can just add/dec
1901  *	our pp.
1902  *
1903  * Lock ordering:
1904  *
1905  *	There is a potential but rare deadlock situation
1906  *	for page promotion and demotion operations. The problem
1907  *	is there are two paths into the freelist manager and
1908  *	they have different lock orders:
1909  *
1910  *	page_create()
1911  *		lock freelist
1912  *		page_lock(EXCL)
1913  *		unlock freelist
1914  *		return
1915  *		caller drops page_lock
1916  *
1917  *	page_free() and page_reclaim()
1918  *		caller grabs page_lock(EXCL)
1919  *
1920  *		lock freelist
1921  *		unlock freelist
1922  *		drop page_lock
1923  *
1924  *	What prevents a thread in page_create() from deadlocking
1925  *	with a thread freeing or reclaiming the same page is the
1926  *	page_trylock() in page_get_freelist(). If the trylock fails
1927  *	it skips the page.
1928  *
1929  *	The lock ordering for promotion and demotion is the same as
1930  *	for page_create(). Since the same deadlock could occur during
1931  *	page promotion and freeing or reclaiming of a page on the
1932  *	cache list we might have to fail the operation and undo what
1933  *	we have done so far. Again, this is rare.
1934  */
1935 page_t *
1936 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1937 {
1938 	page_t		*pp, *pplist, *tpp, *start_pp;
1939 	pgcnt_t		new_npgs, npgs;
1940 	uint_t		bin;
1941 	pgcnt_t		tmpnpgs, pages_left;
1942 	uint_t		noreloc;
1943 	int 		which_list;
1944 	ulong_t		index;
1945 	kmutex_t	*phm;
1946 
1947 	/*
1948 	 * General algorithm:
1949 	 * Find the starting page
1950 	 * Walk each page struct removing it from the freelist,
1951 	 * and linking it to all the other pages removed.
1952 	 * Once all pages are off the freelist,
1953 	 * walk the list, modifying p_szc to new_szc and what
1954 	 * ever other info needs to be done to create a large free page.
1955 	 * According to the flags, either return the page or put it
1956 	 * on the freelist.
1957 	 */
1958 
1959 	start_pp = page_numtopp_nolock(pfnum);
1960 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1961 	new_npgs = page_get_pagecnt(new_szc);
1962 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1963 
1964 	/* don't return page of the wrong mtype */
1965 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1966 			return (NULL);
1967 
1968 	/*
1969 	 * Loop through smaller pages to confirm that all pages
1970 	 * give the same result for PP_ISNORELOC().
1971 	 * We can check this reliably here as the protocol for setting
1972 	 * P_NORELOC requires pages to be taken off the free list first.
1973 	 */
1974 	noreloc = PP_ISNORELOC(start_pp);
1975 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1976 		if (noreloc != PP_ISNORELOC(pp)) {
1977 			page_promote_noreloc_err++;
1978 			page_promote_err++;
1979 			return (NULL);
1980 		}
1981 	}
1982 
1983 	pages_left = new_npgs;
1984 	pplist = NULL;
1985 	pp = start_pp;
1986 
1987 	/* Loop around coalescing the smaller pages into a big page. */
1988 	while (pages_left) {
1989 		/*
1990 		 * Remove from the freelist.
1991 		 */
1992 		ASSERT(PP_ISFREE(pp));
1993 		bin = PP_2_BIN(pp);
1994 		ASSERT(mnode == PP_2_MEM_NODE(pp));
1995 		mtype = PP_2_MTYPE(pp);
1996 		if (PP_ISAGED(pp)) {
1997 
1998 			/*
1999 			 * PG_FREE_LIST
2000 			 */
2001 			if (pp->p_szc) {
2002 				page_vpsub(&PAGE_FREELISTS(mnode,
2003 				    pp->p_szc, bin, mtype), pp);
2004 			} else {
2005 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2006 				    bin, mtype), pp);
2007 			}
2008 			which_list = PG_FREE_LIST;
2009 		} else {
2010 			ASSERT(pp->p_szc == 0);
2011 
2012 			/*
2013 			 * PG_CACHE_LIST
2014 			 *
2015 			 * Since this page comes from the
2016 			 * cachelist, we must destroy the
2017 			 * vnode association.
2018 			 */
2019 			if (!page_trylock(pp, SE_EXCL)) {
2020 				goto fail_promote;
2021 			}
2022 
2023 			/*
2024 			 * We need to be careful not to deadlock
2025 			 * with another thread in page_lookup().
2026 			 * The page_lookup() thread could be holding
2027 			 * the same phm that we need if the two
2028 			 * pages happen to hash to the same phm lock.
2029 			 * At this point we have locked the entire
2030 			 * freelist and page_lookup() could be trying
2031 			 * to grab a freelist lock.
2032 			 */
2033 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2034 			phm = PAGE_HASH_MUTEX(index);
2035 			if (!mutex_tryenter(phm)) {
2036 				page_unlock_nocapture(pp);
2037 				goto fail_promote;
2038 			}
2039 
2040 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2041 			page_hashout(pp, phm);
2042 			mutex_exit(phm);
2043 			PP_SETAGED(pp);
2044 			page_unlock_nocapture(pp);
2045 			which_list = PG_CACHE_LIST;
2046 		}
2047 		page_ctr_sub(mnode, mtype, pp, which_list);
2048 
2049 		/*
2050 		 * Concatenate the smaller page(s) onto
2051 		 * the large page list.
2052 		 */
2053 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2054 		pages_left -= npgs;
2055 		tpp = pp;
2056 		while (npgs--) {
2057 			tpp->p_szc = new_szc;
2058 			tpp = tpp->p_next;
2059 		}
2060 		page_list_concat(&pplist, &pp);
2061 		pp += tmpnpgs;
2062 	}
2063 	CHK_LPG(pplist, new_szc);
2064 
2065 	/*
2066 	 * return the page to the user if requested
2067 	 * in the properly locked state.
2068 	 */
2069 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2070 		return (pplist);
2071 	}
2072 
2073 	/*
2074 	 * Otherwise place the new large page on the freelist
2075 	 */
2076 	bin = PP_2_BIN(pplist);
2077 	mnode = PP_2_MEM_NODE(pplist);
2078 	mtype = PP_2_MTYPE(pplist);
2079 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2080 
2081 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2082 	return (NULL);
2083 
2084 fail_promote:
2085 	/*
2086 	 * A thread must have still been freeing or
2087 	 * reclaiming the page on the cachelist.
2088 	 * To prevent a deadlock, undo what we have
2089 	 * done so far and return failure. This
2090 	 * situation can only happen while promoting
2091 	 * PAGESIZE pages.
2092 	 */
2093 	page_promote_err++;
2094 	while (pplist) {
2095 		pp = pplist;
2096 		mach_page_sub(&pplist, pp);
2097 		pp->p_szc = 0;
2098 		bin = PP_2_BIN(pp);
2099 		mtype = PP_2_MTYPE(pp);
2100 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2101 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2102 	}
2103 	return (NULL);
2104 
2105 }
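
/*
 * Usage sketch for page_promote() (illustrative, not in the original
 * source).  The caller holds the per-mnode freelist lock, verifies via
 * the page counters that the region is completely free, and only then
 * attempts the promotion.  With PC_ALLOC a non-NULL return is an
 * exclusively locked large page already removed from the freelist (as
 * in page_freelist_coalesce() below); with PC_FREE the new large page
 * simply goes back on the freelist (as in page_promote_size() above):
 *
 *	page_freelist_lock(mnode);
 *	if (PAGE_COUNTERS(mnode, new_szc, idx) == FULL_REGION_CNT(new_szc)) {
 *		pp = page_promote(mnode, pfnum, new_szc, PC_ALLOC,
 *		    PC_MTYPE_ANY);
 *	}
 *	page_freelist_unlock(mnode);
 */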
2106 
2107 /*
2108  * Break up a large page into smaller size pages.
2109  * Pages involved are on the freelist before the call and may
2110  * be returned to the caller if requested, otherwise they will
2111  * be placed back on the freelist.
2112  * The caller is responsible for locking the freelist as well as any other
2113  * accounting which needs to be done for a returned page.
2114  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2115  * technically, any value may be passed in but PC_NO_COLOR is the standard
2116  * which should be followed for clarity's sake.
2117  * Returns a page whose pfn is < pfnmax
2118  * Returns a page whose pfn is < pfnmax.
 */
2119 page_t *
2120 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2121     uchar_t new_szc, int color, int flags)
2122 {
2123 	page_t	*pp, *pplist, *npplist;
2124 	pgcnt_t	npgs, n;
2125 	uint_t	bin;
2126 	uint_t	mtype;
2127 	page_t	*ret_pp = NULL;
2128 
2129 	ASSERT(cur_szc != 0);
2130 	ASSERT(new_szc < cur_szc);
2131 
2132 	pplist = page_numtopp_nolock(pfnum);
2133 	ASSERT(pplist != NULL);
2134 
2135 	ASSERT(pplist->p_szc == cur_szc);
2136 
2137 	bin = PP_2_BIN(pplist);
2138 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2139 	mtype = PP_2_MTYPE(pplist);
2140 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2141 
2142 	CHK_LPG(pplist, cur_szc);
2143 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2144 
2145 	/*
2146 	 * Number of PAGESIZE pages for smaller new_szc
2147 	 * page.
2148 	 */
2149 	npgs = page_get_pagecnt(new_szc);
2150 
2151 	while (pplist) {
2152 		pp = pplist;
2153 
2154 		ASSERT(pp->p_szc == cur_szc);
2155 
2156 		/*
2157 		 * We either break it up into PAGESIZE pages or larger.
2158 		 */
2159 		if (npgs == 1) {	/* PAGESIZE case */
2160 			mach_page_sub(&pplist, pp);
2161 			ASSERT(pp->p_szc == cur_szc);
2162 			ASSERT(new_szc == 0);
2163 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2164 			pp->p_szc = new_szc;
2165 			bin = PP_2_BIN(pp);
2166 			if ((bin == color) && (flags == PC_ALLOC) &&
2167 			    (ret_pp == NULL) && (pfnmax == 0 ||
2168 			    pp->p_pagenum < pfnmax) &&
2169 			    page_trylock_cons(pp, SE_EXCL)) {
2170 				ret_pp = pp;
2171 			} else {
2172 				mtype = PP_2_MTYPE(pp);
2173 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2174 				    mtype), pp);
2175 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2176 			}
2177 		} else {
2178 			page_t *try_to_return_this_page = NULL;
2179 			int count = 0;
2180 
2181 			/*
2182 			 * Break down into smaller lists of pages.
2183 			 */
2184 			page_list_break(&pplist, &npplist, npgs);
2185 
2186 			pp = pplist;
2187 			n = npgs;
2188 			while (n--) {
2189 				ASSERT(pp->p_szc == cur_szc);
2190 				/*
2191 				 * Check whether all the pages in this list
2192 				 * fit the request criteria.
2193 				 */
2194 				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2195 					count++;
2196 				}
2197 				pp->p_szc = new_szc;
2198 				pp = pp->p_next;
2199 			}
2200 
2201 			if (count == npgs &&
2202 			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2203 				try_to_return_this_page = pp;
2204 			}
2205 
2206 			CHK_LPG(pplist, new_szc);
2207 
2208 			bin = PP_2_BIN(pplist);
2209 			if (try_to_return_this_page)
2210 				ASSERT(mnode ==
2211 				    PP_2_MEM_NODE(try_to_return_this_page));
2212 			if ((bin == color) && (flags == PC_ALLOC) &&
2213 			    (ret_pp == NULL) && try_to_return_this_page &&
2214 			    page_trylock_cons(try_to_return_this_page,
2215 			    SE_EXCL)) {
2216 				ret_pp = try_to_return_this_page;
2217 			} else {
2218 				mtype = PP_2_MTYPE(pp);
2219 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2220 				    bin, mtype), pplist);
2221 
2222 				page_ctr_add(mnode, mtype, pplist,
2223 				    PG_FREE_LIST);
2224 			}
2225 			pplist = npplist;
2226 		}
2227 	}
2228 	return (ret_pp);
2229 }
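
/*
 * Usage sketch for page_demote() (illustrative, not in the original
 * source), showing the two ways it is called in this file.  In both
 * cases the caller holds the per-mnode freelist lock.  PC_FREE puts all
 * of the resulting new_szc pages back on the freelist and returns NULL;
 * PC_ALLOC may return one exclusively locked new_szc page of the
 * requested color:
 *
 *	Fully break a free large page down to PAGESIZE pages:
 *		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
 *		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
 *
 *	Split a larger free page to satisfy an allocation, as
 *	page_freelist_split() does below:
 *		ret_pp = page_demote(mnode, pp->p_pagenum, pfnhi,
 *		    pp->p_szc, szc, ccolor, PC_ALLOC);
 */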
2230 
2231 int mpss_coalesce_disable = 0;
2232 
2233 /*
2234  * Coalesce free pages into a page of the given szc and color if possible.
2235  * Return the pointer to the page created, otherwise, return NULL.
2236  *
2237  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2238  */
2239 page_t *
2240 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2241     int mtype, pfn_t pfnhi)
2242 {
2243 	int 	r = szc;		/* region size */
2244 	int	mrange;
2245 	uint_t 	full, bin, color_mask, wrap = 0;
2246 	pfn_t	pfnum, lo, hi;
2247 	size_t	len, idx, idx0;
2248 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
2249 	page_t	*ret_pp;
2250 	MEM_NODE_ITERATOR_DECL(it);
2251 #if defined(__sparc)
2252 	pfn_t pfnum0, nlo, nhi;
2253 #endif
2254 
2255 	if (mpss_coalesce_disable) {
2256 		ASSERT(szc < MMU_PAGE_SIZES);
2257 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2258 		return (NULL);
2259 	}
2260 
2261 	ASSERT(szc < mmu_page_sizes);
2262 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2263 	ASSERT(ceq_mask <= color_mask);
2264 	ASSERT(color <= color_mask);
2265 	color &= ceq_mask;
2266 
2267 	/* Prevent page_counters dynamic memory from being freed */
2268 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2269 
2270 	mrange = MTYPE_2_MRANGE(mnode, mtype);
2271 	ASSERT(mrange < mnode_nranges[mnode]);
2272 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2273 
2274 	/* get pfn range for mtype */
2275 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
2276 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2277 	hi++;
2278 
2279 	/* use lower limit if given */
2280 	if (pfnhi != PFNNULL && pfnhi < hi)
2281 		hi = pfnhi;
2282 
2283 	/* round to szcpgcnt boundaries */
2284 	lo = P2ROUNDUP(lo, szcpgcnt);
2285 	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2286 	if (lo == (pfn_t)-1) {
2287 		rw_exit(&page_ctrs_rwlock[mnode]);
2288 		return (NULL);
2289 	}
2290 	hi = hi & ~(szcpgcnt - 1);
2291 
2292 	/* set lo to the closest pfn of the right color */
2293 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2294 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2295 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2296 		    &it);
2297 	}
2298 
2299 	if (hi <= lo) {
2300 		rw_exit(&page_ctrs_rwlock[mnode]);
2301 		return (NULL);
2302 	}
2303 
2304 	full = FULL_REGION_CNT(r);
2305 
2306 	/* calculate the number of page candidates and initial search index */
2307 	bin = color;
2308 	idx0 = (size_t)(-1);
2309 	do {
2310 		pgcnt_t acand;
2311 
2312 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2313 		if (acand) {
2314 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2315 			    r, bin, mrange);
2316 			idx0 = MIN(idx0, idx);
2317 			cands += acand;
2318 		}
2319 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2320 	} while (bin != color);
2321 
2322 	if (cands == 0) {
2323 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2324 		rw_exit(&page_ctrs_rwlock[mnode]);
2325 		return (NULL);
2326 	}
2327 
2328 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
2329 	if (pfnum < lo || pfnum >= hi) {
2330 		pfnum = lo;
2331 	} else {
2332 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2333 		if (pfnum == (pfn_t)-1) {
2334 			pfnum = lo;
2335 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2336 			ASSERT(pfnum != (pfn_t)-1);
2337 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2338 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2339 			/* invalid color, get the closest correct pfn */
2340 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2341 			    color_mask, &it);
2342 			if (pfnum >= hi) {
2343 				pfnum = lo;
2344 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2345 			}
2346 		}
2347 	}
2348 
2349 	/* set starting index */
2350 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2351 	ASSERT(idx0 < len);
2352 
2353 #if defined(__sparc)
2354 	pfnum0 = pfnum;		/* page corresponding to idx0 */
2355 	nhi = 0;		/* search kcage ranges */
2356 #endif
2357 
2358 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2359 
2360 #if defined(__sparc)
2361 		/*
2362 		 * Find lowest intersection of kcage ranges and mnode.
2363 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
2364 		 */
2365 		if (nhi <= pfnum) {
2366 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2367 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2368 				goto wrapit;
2369 
2370 			/* jump to the next page in the range */
2371 			if (pfnum < nlo) {
2372 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2373 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2374 				idx = PNUM_TO_IDX(mnode, r, pfnum);
2375 				if (idx >= len || pfnum >= hi)
2376 					goto wrapit;
2377 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2378 				    ceq_mask)
2379 					goto next;
2380 				if (interleaved_mnodes &&
2381 				    PFN_2_MEM_NODE(pfnum) != mnode)
2382 					goto next;
2383 			}
2384 		}
2385 #endif
2386 
2387 		if (PAGE_COUNTERS(mnode, r, idx) != full)
2388 			goto next;
2389 
2390 		/*
2391 		 * RFE: For performance maybe we can do something less
2392 		 *	brutal than locking the entire freelist. So far
2393 		 * 	this doesn't seem to be a performance problem?
2394 		 */
2395 		page_freelist_lock(mnode);
2396 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
2397 			ret_pp =
2398 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2399 			if (ret_pp != NULL) {
2400 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2401 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2402 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2403 				page_freelist_unlock(mnode);
2404 				rw_exit(&page_ctrs_rwlock[mnode]);
2405 #if defined(__sparc)
2406 				if (PP_ISNORELOC(ret_pp)) {
2407 					pgcnt_t npgs;
2408 
2409 					npgs = page_get_pagecnt(ret_pp->p_szc);
2410 					kcage_freemem_sub(npgs);
2411 				}
2412 #endif
2413 				return (ret_pp);
2414 			}
2415 		} else {
2416 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2417 		}
2418 
2419 		page_freelist_unlock(mnode);
2420 		/*
2421 		 * No point looking for another page if we've
2422 		 * already tried all of the ones that
2423 		 * page_ctr_cands indicated.  Stash off where we left
2424 		 * off.
2425 		 * Note: this is not exact since we don't hold the
2426 		 * page_freelist_locks before we initially get the
2427 		 * value of cands for performance reasons, but should
2428 		 * be a decent approximation.
2429 		 */
2430 		if (--cands == 0) {
2431 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2432 			    idx;
2433 			break;
2434 		}
2435 next:
2436 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2437 		    color_mask, &it);
2438 		idx = PNUM_TO_IDX(mnode, r, pfnum);
2439 		if (idx >= len || pfnum >= hi) {
2440 wrapit:
2441 			pfnum = lo;
2442 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2443 			idx = PNUM_TO_IDX(mnode, r, pfnum);
2444 			wrap++;
2445 #if defined(__sparc)
2446 			nhi = 0;	/* search kcage ranges */
2447 #endif
2448 		}
2449 	}
2450 
2451 	rw_exit(&page_ctrs_rwlock[mnode]);
2452 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2453 	return (NULL);
2454 }
2455 
2456 /*
2457  * For the given mnode, promote as many small pages to large pages as possible.
2458  * mnode can be -1, which means do them all.
2459  */
2460 void
2461 page_freelist_coalesce_all(int mnode)
2462 {
2463 	int 	r;		/* region size */
2464 	int 	idx, full;
2465 	size_t	len;
2466 	int doall = interleaved_mnodes || mnode < 0;
2467 	int mlo = doall ? 0 : mnode;
2468 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2469 
2470 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2471 
2472 	if (mpss_coalesce_disable) {
2473 		return;
2474 	}
2475 
2476 	/*
2477 	 * Lock the entire freelist and coalesce what we can.
2478 	 *
2479 	 * Always promote to the largest page possible
2480 	 * first to reduce the number of page promotions.
2481 	 */
2482 	for (mnode = mlo; mnode < mhi; mnode++) {
2483 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2484 		page_freelist_lock(mnode);
2485 	}
2486 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2487 		for (mnode = mlo; mnode < mhi; mnode++) {
2488 			pgcnt_t cands = 0;
2489 			int mrange, nranges = mnode_nranges[mnode];
2490 
2491 			for (mrange = 0; mrange < nranges; mrange++) {
2492 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2493 				if (cands != 0)
2494 					break;
2495 			}
2496 			if (cands == 0) {
2497 				VM_STAT_ADD(vmm_vmstats.
2498 				    page_ctrs_cands_skip_all);
2499 				continue;
2500 			}
2501 
2502 			full = FULL_REGION_CNT(r);
2503 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2504 
2505 			for (idx = 0; idx < len; idx++) {
2506 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2507 					pfn_t pfnum =
2508 					    IDX_TO_PNUM(mnode, r, idx);
2509 					int tmnode = interleaved_mnodes ?
2510 					    PFN_2_MEM_NODE(pfnum) : mnode;
2511 
2512 					ASSERT(pfnum >=
2513 					    mem_node_config[tmnode].physbase &&
2514 					    pfnum <
2515 					    mem_node_config[tmnode].physmax);
2516 
2517 					(void) page_promote(tmnode,
2518 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2519 				}
2520 			}
2521 			/* shared hpm_counters covers all mnodes, so we quit */
2522 			if (interleaved_mnodes)
2523 				break;
2524 		}
2525 	}
2526 	for (mnode = mlo; mnode < mhi; mnode++) {
2527 		page_freelist_unlock(mnode);
2528 		rw_exit(&page_ctrs_rwlock[mnode]);
2529 	}
2530 }
2531 
2532 /*
2533  * This is where all policies for moving pages around
2534  * to different page size free lists are implemented.
2535  * Returns a pointer to the page obtained on success, NULL on failure.
2536  *
2537  * So far these are the priorities for this algorithm in descending
2538  * order:
2539  *
2540  *	1) When servicing a request try to do so with a free page
2541  *	   from next size up. Helps defer fragmentation as long
2542  *	   as possible.
2543  *
2544  *	2) Page coalesce on demand. Only when a freelist
2545  *	   larger than PAGESIZE is empty and step 1
2546  *	   will not work since all larger size lists are
2547  *	   also empty.
2548  *
2549  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2550  */
2551 
2552 page_t *
2553 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2554     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2555 {
2556 	uchar_t nszc = szc + 1;
2557 	uint_t 	bin, sbin, bin_prev;
2558 	page_t	*pp, *firstpp;
2559 	page_t	*ret_pp = NULL;
2560 	uint_t  color_mask;
2561 
2562 	if (nszc == mmu_page_sizes)
2563 		return (NULL);
2564 
2565 	ASSERT(nszc < mmu_page_sizes);
2566 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2567 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2568 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2569 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2570 
2571 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2572 	/*
2573 	 * First try to break up a larger page to fill current size freelist.
2574 	 */
2575 	while (plw->plw_bins[nszc] != 0) {
2576 
2577 		ASSERT(nszc < mmu_page_sizes);
2578 
2579 		/*
2580 		 * If page found then demote it.
2581 		 */
2582 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2583 			page_freelist_lock(mnode);
2584 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2585 
2586 			/*
2587 			 * If pfnhi is not PFNNULL, look for large page below
2588 			 * pfnhi. PFNNULL signifies no pfn requirement.
2589 			 */
2590 			if (pp &&
2591 			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2592 			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2593 				do {
2594 					pp = pp->p_vpnext;
2595 					if (pp == firstpp) {
2596 						pp = NULL;
2597 						break;
2598 					}
2599 				} while ((pfnhi != PFNNULL &&
2600 				    pp->p_pagenum >= pfnhi) ||
2601 				    (pfnlo != PFNNULL &&
2602 				    pp->p_pagenum < pfnlo));
2603 
2604 				if (pfnhi != PFNNULL && pp != NULL)
2605 					ASSERT(pp->p_pagenum < pfnhi);
2606 
2607 				if (pfnlo != PFNNULL && pp != NULL)
2608 					ASSERT(pp->p_pagenum >= pfnlo);
2609 			}
2610 			if (pp) {
2611 				uint_t ccolor = page_correct_color(szc, nszc,
2612 				    color, bin, plw->plw_ceq_mask[szc]);
2613 
2614 				ASSERT(pp->p_szc == nszc);
2615 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2616 				ret_pp = page_demote(mnode, pp->p_pagenum,
2617 				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2618 				if (ret_pp) {
2619 					page_freelist_unlock(mnode);
2620 #if defined(__sparc)
2621 					if (PP_ISNORELOC(ret_pp)) {
2622 						pgcnt_t npgs;
2623 
2624 						npgs = page_get_pagecnt(
2625 						    ret_pp->p_szc);
2626 						kcage_freemem_sub(npgs);
2627 					}
2628 #endif
2629 					return (ret_pp);
2630 				}
2631 			}
2632 			page_freelist_unlock(mnode);
2633 		}
2634 
2635 		/* loop through next size bins */
2636 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2637 		plw->plw_bins[nszc]--;
2638 
2639 		if (bin == sbin) {
2640 			uchar_t nnszc = nszc + 1;
2641 
2642 			/* we are done with this page size - check next */
2643 			if (plw->plw_bins[nnszc] == 0)
2644 				/* we have already checked next size bins */
2645 				break;
2646 
2647 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2648 			if (bin_prev != INVALID_COLOR) {
2649 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2650 				if (!((bin ^ bin_prev) &
2651 				    plw->plw_ceq_mask[nnszc]))
2652 					break;
2653 			}
2654 			ASSERT(nnszc < mmu_page_sizes);
2655 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2656 			nszc = nnszc;
2657 			ASSERT(nszc < mmu_page_sizes);
2658 		}
2659 	}
2660 
2661 	return (ret_pp);
2662 }
2663 
2664 /*
2665  * Helper routine used only by the freelist code to lock
2666  * a page. If the page is a large page then it succeeds in
2667  * locking all the constituent pages or none at all.
2668  * Returns 1 on success, 0 on failure.
2669  */
2670 static int
2671 page_trylock_cons(page_t *pp, se_t se)
2672 {
2673 	page_t	*tpp, *first_pp = pp;
2674 
2675 	/*
2676 	 * Fail if can't lock first or only page.
2677 	 */
2678 	if (!page_trylock(pp, se)) {
2679 		return (0);
2680 	}
2681 
2682 	/*
2683 	 * PAGESIZE: common case.
2684 	 */
2685 	if (pp->p_szc == 0) {
2686 		return (1);
2687 	}
2688 
2689 	/*
2690 	 * Large page case.
2691 	 */
2692 	tpp = pp->p_next;
2693 	while (tpp != pp) {
2694 		if (!page_trylock(tpp, se)) {
2695 			/*
2696 			 * On failure unlock what we have locked so far.
2697 			 * We want to avoid attempting to capture these
2698 			 * pages as the pcm mutex may be held which could
2699 			 * lead to a recursive mutex panic.
2700 			 */
2701 			while (first_pp != tpp) {
2702 				page_unlock_nocapture(first_pp);
2703 				first_pp = first_pp->p_next;
2704 			}
2705 			return (0);
2706 		}
2707 		tpp = tpp->p_next;
2708 	}
2709 	return (1);
2710 }
2711 
2712 /*
2713  * Initialize the context for walking the page lists.
2714  * Called when a page of the given szc is unavailable. Sets markers
2715  * for the beginning of the search to detect when the search has
2716  * completed a full cycle. Sets flags for splitting larger pages
2717  * and coalescing smaller pages. Page walking proceeds until a page
2718  * of the desired equivalent color is found.
2719  */
2720 void
2721 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2722     int use_ceq, page_list_walker_t *plw)
2723 {
2724 	uint_t  nszc, ceq_mask, colors;
2725 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2726 
2727 	ASSERT(szc < mmu_page_sizes);
2728 	colors = PAGE_GET_PAGECOLORS(szc);
2729 
2730 	plw->plw_colors = colors;
2731 	plw->plw_color_mask = colors - 1;
2732 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2733 	plw->plw_bin_split_prev = bin;
2734 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2735 
2736 	/*
2737 	 * If vac aliasing is possible, make sure lower order color
2738 	 * bits are never ignored.
2739 	 */
2740 	if (vac_colors > 1)
2741 		ceq &= 0xf0;
2742 
2743 	/*
2744 	 * calculate the number of non-equivalent colors and
2745 	 * color equivalency mask
2746 	 */
2747 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2748 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2749 	ASSERT(plw->plw_ceq_dif > 0);
2750 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
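
	/*
	 * Worked example of the two computations above (illustrative,
	 * not in the original source): with colors == 32 and ceq == 0x01,
	 * plw_ceq_dif = 32 >> (0 + 1) = 16 equivalence classes and
	 * plw_ceq_mask[szc] = (16 - 1) << 1 = 0x1e, so bit 0 of the color
	 * is treated as a "don't care" bit when matching bins.
	 */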
2751 
2752 	if (flags & PG_MATCH_COLOR) {
2753 		if (cpu_page_colors <  0) {
2754 			/*
2755 			 * this is a heterogeneous machine with different CPUs
2756 			 * having different size e$ (not supported for ni2/rock).
2757 			 */
2758 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2759 			cpucolors = MAX(cpucolors, 1);
2760 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2761 			plw->plw_ceq_mask[szc] =
2762 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2763 		}
2764 		plw->plw_ceq_dif = 1;
2765 	}
2766 
2767 	/* we can split pages in the freelist, but not the cachelist */
2768 	if (can_split) {
2769 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2770 
2771 		/* set next szc color masks and number of free list bins */
2772 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2773 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2774 			    plw->plw_ceq_mask[szc]);
2775 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2776 		}
2777 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2778 		plw->plw_bins[nszc] = 0;
2779 
2780 	} else {
2781 		ASSERT(szc == 0);
2782 		plw->plw_do_split = 0;
2783 		plw->plw_bins[1] = 0;
2784 		plw->plw_ceq_mask[1] = INVALID_MASK;
2785 	}
2786 }
2787 
2788 /*
2789  * set mark to flag where next split should occur
2790  */
2791 #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
2792 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
2793 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
2794 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
2795 	plw->plw_split_next =						     \
2796 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
2797 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2798 		plw->plw_split_next =					     \
2799 		INC_MASKED(plw->plw_split_next,				     \
2800 		    neq_mask, plw->plw_color_mask);			     \
2801 	}								     \
2802 }
2803 
2804 uint_t
2805 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2806 {
2807 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2808 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2809 	uchar_t nszc = szc + 1;
2810 
2811 	nbin = ADD_MASKED(bin,
2812 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2813 
2814 	if (plw->plw_do_split) {
2815 		plw->plw_bin_split_prev = bin;
2816 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2817 		plw->plw_do_split = 0;
2818 	}
2819 
2820 	if (szc == 0) {
2821 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2822 			if (nbin == plw->plw_bin0 &&
2823 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2824 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2825 				    neq_mask, plw->plw_color_mask);
2826 				plw->plw_bin_split_prev = plw->plw_bin0;
2827 			}
2828 
2829 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2830 				plw->plw_bin_marker =
2831 				    nbin = INC_MASKED(nbin, neq_mask,
2832 				    plw->plw_color_mask);
2833 				plw->plw_bin_split_prev = plw->plw_bin0;
2834 				/*
2835 				 * large pages all have the same vac color
2836 				 * so by now we should be done with next
2837 				 * size page splitting process
2838 				 */
2839 				ASSERT(plw->plw_bins[1] == 0);
2840 				plw->plw_do_split = 0;
2841 				return (nbin);
2842 			}
2843 
2844 		} else {
2845 			uint_t bin_jump = (vac_colors == 1) ?
2846 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2847 
2848 			bin_jump &= ~(vac_colors - 1);
2849 
2850 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2851 			    plw->plw_color_mask);
2852 
2853 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2854 
2855 				plw->plw_bin_marker = nbin = nbin0;
2856 
2857 				if (plw->plw_bins[nszc] != 0) {
2858 					/*
2859 					 * check if next page size bin is the
2860 					 * same as the next page size bin for
2861 					 * bin0
2862 					 */
2863 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2864 					    nbin);
2865 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2866 					    plw->plw_bin0);
2867 
2868 					if ((bin0_nsz ^ nbin_nsz) &
2869 					    plw->plw_ceq_mask[nszc])
2870 						plw->plw_do_split = 1;
2871 				}
2872 				return (nbin);
2873 			}
2874 		}
2875 	}
2876 
2877 	if (plw->plw_bins[nszc] != 0) {
2878 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2879 		if (!((plw->plw_split_next ^ nbin_nsz) &
2880 		    plw->plw_ceq_mask[nszc]))
2881 			plw->plw_do_split = 1;
2882 	}
2883 
2884 	return (nbin);
2885 }
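
/*
 * Usage sketch for the walker (illustrative, not in the original
 * source).  This is a simplified version of the loop in
 * page_get_mnode_freelist() below: each outer pass covers one color
 * equivalence class, the inner loop steps through the bins of that
 * class, and page_list_walk_next_bin() advances to the next class:
 *
 *	page_list_walk_init(szc, flags, bin, 1, 1, &plw);
 *	for (plw.plw_count = 0; plw.plw_count < plw.plw_ceq_dif;
 *	    plw.plw_count++) {
 *		sbin = bin;
 *		do {
 *			(examine PAGE_FREELISTS(mnode, szc, bin, mtype))
 *			bin = ADD_MASKED(bin, plw.plw_bin_step,
 *			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
 *		} while (sbin != bin);
 *		if (plw.plw_ceq_dif > 1)
 *			bin = page_list_walk_next_bin(szc, bin, &plw);
 *	}
 */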
2886 
2887 page_t *
2888 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2889     uint_t flags)
2890 {
2891 	kmutex_t		*pcm;
2892 	page_t			*pp, *first_pp;
2893 	uint_t			sbin;
2894 	int			plw_initialized;
2895 	page_list_walker_t	plw;
2896 
2897 	ASSERT(szc < mmu_page_sizes);
2898 
2899 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2900 
2901 	MTYPE_START(mnode, mtype, flags);
2902 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2903 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2904 		return (NULL);
2905 	}
2906 try_again:
2907 
2908 	plw_initialized = 0;
2909 	plw.plw_ceq_dif = 1;
2910 
2911 	/*
2912 	 * Only hold one freelist lock at a time; that way we
2913 	 * can start anywhere and not have to worry about lock
2914 	 * ordering.
2915 	 */
2916 	for (plw.plw_count = 0;
2917 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2918 		sbin = bin;
2919 		do {
2920 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2921 				goto bin_empty_1;
2922 
2923 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2924 			mutex_enter(pcm);
2925 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2926 			if (pp == NULL)
2927 				goto bin_empty_0;
2928 
2929 			/*
2930 			 * These were set before the page
2931 			 * was put on the free list;
2932 			 * they must still be set.
2933 			 */
2934 			ASSERT(PP_ISFREE(pp));
2935 			ASSERT(PP_ISAGED(pp));
2936 			ASSERT(pp->p_vnode == NULL);
2937 			ASSERT(pp->p_hash == NULL);
2938 			ASSERT(pp->p_offset == (u_offset_t)-1);
2939 			ASSERT(pp->p_szc == szc);
2940 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2941 
2942 			/*
2943 			 * Walk down the freelist chain
2944 			 * for this bin. PAGESIZE pages are
2945 			 * linked on their p_next and p_prev
2946 			 * fields. Large pages are a contiguous
2947 			 * group of constituent pages linked
2948 			 * together on their p_next and p_prev
2949 			 * fields. The large pages are linked
2950 			 * together on the chain using the
2951 			 * p_vpnext and p_vpprev fields of the
2952 			 * base constituent page of each large page.
2953 			 */
2954 			first_pp = pp;
2955 			while (!page_trylock_cons(pp, SE_EXCL) ||
2956 			    IS_DUMP_PAGE(pp)) {
2957 				if (szc == 0) {
2958 					pp = pp->p_next;
2959 				} else {
2960 					pp = pp->p_vpnext;
2961 				}
2962 
2963 				ASSERT(PP_ISFREE(pp));
2964 				ASSERT(PP_ISAGED(pp));
2965 				ASSERT(pp->p_vnode == NULL);
2966 				ASSERT(pp->p_hash == NULL);
2967 				ASSERT(pp->p_offset == (u_offset_t)-1);
2968 				ASSERT(pp->p_szc == szc);
2969 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2970 
2971 				if (pp == first_pp)
2972 					goto bin_empty_0;
2973 			}
2974 
2975 			ASSERT(pp != NULL);
2976 			ASSERT(mtype == PP_2_MTYPE(pp));
2977 			ASSERT(pp->p_szc == szc);
2978 			if (szc == 0) {
2979 				page_sub(&PAGE_FREELISTS(mnode,
2980 				    szc, bin, mtype), pp);
2981 			} else {
2982 				page_vpsub(&PAGE_FREELISTS(mnode,
2983 				    szc, bin, mtype), pp);
2984 				CHK_LPG(pp, szc);
2985 			}
2986 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2987 
2988 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2989 				panic("free page is not. pp %p", (void *)pp);
2990 			mutex_exit(pcm);
2991 
2992 #if defined(__sparc)
2993 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2994 			    (flags & PG_NORELOC) == 0);
2995 
2996 			if (PP_ISNORELOC(pp))
2997 				kcage_freemem_sub(page_get_pagecnt(szc));
2998 #endif
2999 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3000 			return (pp);
3001 
3002 bin_empty_0:
3003 			mutex_exit(pcm);
3004 bin_empty_1:
3005 			if (plw_initialized == 0) {
3006 				page_list_walk_init(szc, flags, bin, 1, 1,
3007 				    &plw);
3008 				plw_initialized = 1;
3009 				ASSERT(plw.plw_colors <=
3010 				    PAGE_GET_PAGECOLORS(szc));
3011 				ASSERT(plw.plw_colors > 0);
3012 				ASSERT((plw.plw_colors &
3013 				    (plw.plw_colors - 1)) == 0);
3014 				ASSERT(bin < plw.plw_colors);
3015 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3016 			}
3017 			/* calculate the next bin with equivalent color */
3018 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3019 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
3020 		} while (sbin != bin);
3021 
3022 		/*
3023 		 * All of the equivalent color bins are empty. Try to
3024 		 * satisfy the request by breaking up or coalescing
3025 		 * pages from a different size freelist of the correct
3026 		 * color that satisfies the ORIGINAL color requested.
3027 		 * If that fails then try pages of the same size but
3028 		 * different colors, assuming we are not called with
3029 		 * PG_MATCH_COLOR.
3030 		 */
3031 		if (plw.plw_do_split &&
3032 		    (pp = page_freelist_split(szc, bin, mnode,
3033 		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3034 			return (pp);
3035 
3036 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3037 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
3038 			return (pp);
3039 
3040 		if (plw.plw_ceq_dif > 1)
3041 			bin = page_list_walk_next_bin(szc, bin, &plw);
3042 	}
3043 
3044 	/* if allowed, cycle through additional mtypes */
3045 	MTYPE_NEXT(mnode, mtype, flags);
3046 	if (mtype >= 0)
3047 		goto try_again;
3048 
3049 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3050 
3051 	return (NULL);
3052 }
3053 
3054 /*
3055  * Returns the count of free pages for 'pp' with size code 'szc'.
3056  * Note: This function does not return an exact value as the page freelist
3057  * locks are not held and thus the values in the page_counters may be
3058  * changing as we walk through the data.
3059  */
3060 static int
3061 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3062 {
3063 	pgcnt_t	pgfree;
3064 	pgcnt_t cnt;
3065 	ssize_t	r = szc;	/* region size */
3066 	ssize_t	idx;
3067 	int	i;
3068 	int	full, range;
3069 
3070 	/* Make sure pagenum passed in is aligned properly */
3071 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3072 	ASSERT(szc > 0);
3073 
3074 	/* Prevent page_counters dynamic memory from being freed */
3075 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3076 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3077 	cnt = PAGE_COUNTERS(mnode, r, idx);
3078 	pgfree = cnt << PNUM_SHIFT(r - 1);
3079 	range = FULL_REGION_CNT(szc);
3080 
3081 	/* Check for completely full region */
3082 	if (cnt == range) {
3083 		rw_exit(&page_ctrs_rwlock[mnode]);
3084 		return (pgfree);
3085 	}
3086 
3087 	while (--r > 0) {
3088 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3089 		full = FULL_REGION_CNT(r);
3090 		for (i = 0; i < range; i++, idx++) {
3091 			cnt = PAGE_COUNTERS(mnode, r, idx);
3092 			/*
3093 			 * If cnt here is full, that means we have already
3094 			 * accounted for these pages earlier.
3095 			 */
3096 			if (cnt != full) {
3097 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3098 			}
3099 		}
3100 		range *= full;
3101 	}
3102 	rw_exit(&page_ctrs_rwlock[mnode]);
3103 	return (pgfree);
3104 }
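
/*
 * Worked example for page_freecnt() (illustrative, not in the original
 * source), assuming a hypothetical configuration where
 * FULL_REGION_CNT(1) == FULL_REGION_CNT(2) == 8, PNUM_SHIFT(1) == 3 and
 * PNUM_SHIFT(0) == 0.  For a szc == 2 region whose counter reads 3,
 * three of the eight szc == 1 subregions are fully free and contribute
 * 3 << 3 == 24 pages.  The loop then scans the eight szc == 1 counters,
 * say {8, 8, 8, 5, 0, 2, 0, 0}; the three full ones were already
 * counted, the rest add 5 + 2 == 7 pages, and the result is 31.
 */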
3105 
3106 /*
3107  * Called from page_geti_contig_pages to exclusively lock constituent pages
3108  * starting from 'spp' for page size code 'szc'.
3109  *
3110  * If 'ptcpthreshold' is set, at least pgcnt / ptcpthreshold pages in the
3111  * 'szc' region must already be free before the trylock is attempted.
3112  */
3113 static int
3114 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3115 {
3116 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3117 	pgcnt_t pgfree, i;
3118 	page_t *pp;
3119 
3120 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3121 
3122 
3123 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3124 		goto skipptcpcheck;
3125 	/*
3126 	 * check if there are sufficient free pages available before attempting
3127 	 * to trylock. Count is approximate as page counters can change.
3128 	 */
3129 	pgfree = page_freecnt(mnode, spp, szc);
3130 
3131 	/* attempt to trylock if there are sufficient already free pages */
3132 	if (pgfree < pgcnt/ptcpthreshold) {
3133 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3134 		return (0);
3135 	}
3136 
3137 skipptcpcheck:
3138 
3139 	for (i = 0; i < pgcnt; i++) {
3140 		pp = &spp[i];
3141 		if (!page_trylock(pp, SE_EXCL)) {
3142 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3143 			while (--i != (pgcnt_t)-1) {
3144 				pp = &spp[i];
3145 				ASSERT(PAGE_EXCL(pp));
3146 				page_unlock_nocapture(pp);
3147 			}
3148 			return (0);
3149 		}
3150 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3151 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3152 		    !PP_ISFREE(pp)) {
3153 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3154 			ASSERT(i == 0);
3155 			page_unlock_nocapture(pp);
3156 			return (0);
3157 		}
3158 		if (PP_ISNORELOC(pp)) {
3159 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3160 			while (i != (pgcnt_t)-1) {
3161 				pp = &spp[i];
3162 				ASSERT(PAGE_EXCL(pp));
3163 				page_unlock_nocapture(pp);
3164 				i--;
3165 			}
3166 			return (0);
3167 		}
3168 	}
3169 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3170 	return (1);
3171 }
3172 
3173 /*
3174  * Claim the large page pointed to by 'pp'. 'pp' is the start of the set
3175  * of 'szc' constituent pages that were previously locked exclusively.
3176  * Will attempt to relocate constituent pages in use.
3177  */
3178 static page_t *
3179 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3180 {
3181 	spgcnt_t pgcnt, npgs, i;
3182 	page_t *targpp, *rpp, *hpp;
3183 	page_t *replpp = NULL;
3184 	page_t *pplist = NULL;
3185 
3186 	ASSERT(pp != NULL);
3187 
3188 	pgcnt = page_get_pagecnt(szc);
3189 	while (pgcnt) {
3190 		ASSERT(PAGE_EXCL(pp));
3191 		ASSERT(!PP_ISNORELOC(pp));
3192 		if (PP_ISFREE(pp)) {
3193 			/*
3194 			 * If this is a PG_FREE_LIST page then its
3195 			 * size code can change underneath us due to
3196 			 * page promotion or demotion. As an optimization
3197 			 * use page_list_sub_pages() instead of
3198 			 * page_list_sub().
3199 			 */
3200 			if (PP_ISAGED(pp)) {
3201 				page_list_sub_pages(pp, szc);
3202 				if (pp->p_szc == szc) {
3203 					return (pp);
3204 				}
3205 				ASSERT(pp->p_szc < szc);
3206 				npgs = page_get_pagecnt(pp->p_szc);
3207 				hpp = pp;
3208 				for (i = 0; i < npgs; i++, pp++) {
3209 					pp->p_szc = szc;
3210 				}
3211 				page_list_concat(&pplist, &hpp);
3212 				pgcnt -= npgs;
3213 				continue;
3214 			}
3215 			ASSERT(!PP_ISAGED(pp));
3216 			ASSERT(pp->p_szc == 0);
3217 			page_list_sub(pp, PG_CACHE_LIST);
3218 			page_hashout(pp, NULL);
3219 			PP_SETAGED(pp);
3220 			pp->p_szc = szc;
3221 			page_list_concat(&pplist, &pp);
3222 			pp++;
3223 			pgcnt--;
3224 			continue;
3225 		}
3226 		npgs = page_get_pagecnt(pp->p_szc);
3227 
3228 		/*
3229 		 * page_create_wait freemem accounting is done by the caller
3230 		 * of page_get_freelist, so it is not necessary to call it
3231 		 * prior to calling page_get_replacement_page.
3232 		 *
3233 		 * page_get_replacement_page can call page_get_contig_pages
3234 		 * to acquire a large page (szc > 0); to avoid looping, only
3235 		 * relocate here when the page in use is smaller than the
3236 		 * contig page size, or when szc == 0 and PGI_PGCPSZC0 is set.
3237 		 */
3238 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3239 			replpp = page_get_replacement_page(pp, NULL, 0);
3240 			if (replpp) {
3241 				npgs = page_get_pagecnt(pp->p_szc);
3242 				ASSERT(npgs <= pgcnt);
3243 				targpp = pp;
3244 			}
3245 		}
3246 
3247 		/*
3248 		 * If replacement is NULL or do_page_relocate fails, fail
3249 		 * coalescing of pages.
3250 		 */
3251 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3252 		    &npgs, NULL) != 0)) {
3253 			/*
3254 			 * Unlock un-processed target list
3255 			 */
3256 			while (pgcnt--) {
3257 				ASSERT(PAGE_EXCL(pp));
3258 				page_unlock_nocapture(pp);
3259 				pp++;
3260 			}
3261 			/*
3262 			 * Free the processed target list.
3263 			 */
3264 			while (pplist) {
3265 				pp = pplist;
3266 				page_sub(&pplist, pp);
3267 				ASSERT(PAGE_EXCL(pp));
3268 				ASSERT(pp->p_szc == szc);
3269 				ASSERT(PP_ISFREE(pp));
3270 				ASSERT(PP_ISAGED(pp));
3271 				pp->p_szc = 0;
3272 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3273 				page_unlock_nocapture(pp);
3274 			}
3275 
3276 			if (replpp != NULL)
3277 				page_free_replacement_page(replpp);
3278 
3279 			return (NULL);
3280 		}
3281 		ASSERT(pp == targpp);
3282 
3283 		/* LINTED */
3284 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3285 
3286 		pp += npgs;
3287 		pgcnt -= npgs;
3288 
3289 		while (npgs--) {
3290 			ASSERT(PAGE_EXCL(targpp));
3291 			ASSERT(!PP_ISFREE(targpp));
3292 			ASSERT(!PP_ISNORELOC(targpp));
3293 			PP_SETFREE(targpp);
3294 			ASSERT(PP_ISAGED(targpp));
3295 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3296 			    (flags & PGI_PGCPSZC0)));
3297 			targpp->p_szc = szc;
3298 			targpp = targpp->p_next;
3299 
3300 			rpp = replpp;
3301 			ASSERT(rpp != NULL);
3302 			page_sub(&replpp, rpp);
3303 			ASSERT(PAGE_EXCL(rpp));
3304 			ASSERT(!PP_ISFREE(rpp));
3305 			page_unlock_nocapture(rpp);
3306 		}
3307 		ASSERT(targpp == hpp);
3308 		ASSERT(replpp == NULL);
3309 		page_list_concat(&pplist, &targpp);
3310 	}
3311 	CHK_LPG(pplist, szc);
3312 	return (pplist);
3313 }
3314 
3315 /*
3316  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3317  * of 0 means nothing left after trim.
3318  */
3319 int
3320 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3321 {
3322 	pfn_t	kcagepfn;
3323 	int	decr;
3324 	int	rc = 0;
3325 
3326 	if (PP_ISNORELOC(mseg->pages)) {
3327 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3328 
3329 			/* lower part of this mseg inside kernel cage */
3330 			decr = kcage_current_pfn(&kcagepfn);
3331 
3332 			/* kernel cage may have transitioned past mseg */
3333 			if (kcagepfn >= mseg->pages_base &&
3334 			    kcagepfn < mseg->pages_end) {
3335 				ASSERT(decr == 0);
3336 				*lo = MAX(kcagepfn, pfnlo);
3337 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
3338 				rc = 1;
3339 			}
3340 		}
3341 		/* else entire mseg in the cage */
3342 	} else {
3343 		if (PP_ISNORELOC(mseg->epages - 1)) {
3344 
3345 			/* upper part of this mseg inside kernel cage */
3346 			decr = kcage_current_pfn(&kcagepfn);
3347 
3348 			/* kernel cage may have transitioned past mseg */
3349 			if (kcagepfn >= mseg->pages_base &&
3350 			    kcagepfn < mseg->pages_end) {
3351 				ASSERT(decr);
3352 				*hi = MIN(kcagepfn, pfnhi);
3353 				*lo = MAX(pfnlo, mseg->pages_base);
3354 				rc = 1;
3355 			}
3356 		} else {
3357 			/* entire mseg outside of kernel cage */
3358 			*lo = MAX(pfnlo, mseg->pages_base);
3359 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3360 			rc = 1;
3361 		}
3362 	}
3363 	return (rc);
3364 }
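
/*
 * Worked example for trimkcage() (illustrative, not in the original
 * source), using hypothetical pfn values: suppose the mseg spans pfns
 * [0x1000, 0x2000), its low pages are PP_ISNORELOC (inside the kernel
 * cage) and the cage currently ends at kcagepfn == 0x1400.  For a
 * request of pfnlo == 0x1100 and pfnhi == 0x3000, trimkcage() returns 1
 * with *lo == 0x1400 (MAX(kcagepfn, pfnlo)) and *hi == 0x1fff
 * (MIN(pfnhi, pages_end - 1)), i.e. only the uncaged tail of the mseg
 * remains eligible.
 */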
3365 
3366 /*
3367  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3368  * page with size code 'szc'. Claiming such a page requires acquiring
3369  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3370  * relocating pages in use and concatenating these constituent pages into a
3371  * large page.
3372  *
3373  * The page lists do not have such a large page and page_freelist_split has
3374  * already failed to demote larger pages and/or coalesce smaller free pages.
3375  *
3376  * 'flags' may specify PG_MATCH_COLOR which would limit the search to large
3377  * pages with the same color as 'bin'.
3378  *
3379  * 'pfnflag' specifies the subset of the pfn range to search.
3380  */
3381 
3382 static page_t *
3383 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3384     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3385 {
3386 	struct memseg *mseg;
3387 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3388 	pgcnt_t szcpgmask = szcpgcnt - 1;
3389 	pfn_t	randpfn;
3390 	page_t *pp, *randpp, *endpp;
3391 	uint_t colors, ceq_mask;
3392 	/* LINTED : set but not used in function */
3393 	uint_t color_mask;
3394 	pfn_t hi, lo;
3395 	uint_t skip;
3396 	MEM_NODE_ITERATOR_DECL(it);
3397 
3398 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3399 
3400 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3401 
3402 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3403 		return (NULL);
3404 
3405 	ASSERT(szc < mmu_page_sizes);
3406 
3407 	colors = PAGE_GET_PAGECOLORS(szc);
3408 	color_mask = colors - 1;
3409 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3410 		uchar_t ceq = colorequivszc[szc];
3411 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3412 
3413 		ASSERT(ceq_dif > 0);
3414 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3415 	} else {
3416 		ceq_mask = 0;
3417 	}
3418 
3419 	ASSERT(bin < colors);
3420 
3421 	/* clear "non-significant" color bits */
3422 	bin &= ceq_mask;
3423 
3424 	/*
3425 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3426 	 * when there have been previous page_get_contig_pages failures, to
3427 	 * limit the search.
3428 	 *
3429 	 * The high bit in pfnflag specifies the number of 'slots' in the
3430 	 * pfn range and the remainder of pfnflag specifies which slot.
3431 	 * For example, a value of 1010b selects slot 2 of a pfn range that
3432 	 * has been divided into 8 slots.
3433 	 */
3434 	if (pfnflag > 1) {
3435 		int	slots = 1 << (highbit(pfnflag) - 1);
3436 		int	slotid = pfnflag & (slots - 1);
3437 		pgcnt_t	szcpages;
3438 		int	slotlen;
3439 
3440 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3441 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3442 		slotlen = howmany(szcpages, slots);
3443 		/* skip if 'slotid' slot is empty */
3444 		if (slotid * slotlen >= szcpages)
3445 			return (NULL);
3446 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3447 		ASSERT(pfnlo < pfnhi);
3448 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3449 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3450 	}
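
	/*
	 * Worked example of the slot arithmetic above (illustrative, not
	 * in the original source): pfnflag == 1010b gives slots == 8 and
	 * slotid == 2.  With szcpages == 80 szc-sized pages in
	 * [pfnlo, pfnhi], slotlen == 10, so pfnlo is advanced by
	 * 20 * szcpgcnt pfns and pfnhi is pulled in to
	 * pfnlo + 10 * szcpgcnt - 1, narrowing the search to slot 2.
	 */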
3451 
3452 	memsegs_lock(0);
3453 
3454 	/*
3455 	 * loop through memsegs to look for contig page candidates
3456 	 */
3457 
3458 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3459 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3460 			/* no overlap */
3461 			continue;
3462 		}
3463 
3464 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3465 			/* mseg too small */
3466 			continue;
3467 
3468 		/*
3469 		 * trim off kernel cage pages from pfn range and check for
3470 		 * a trimmed pfn range returned that does not span the
3471 		 * desired large page size.
3472 		 */
3473 		if (kcage_on) {
3474 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3475 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3476 				continue;
3477 		} else {
3478 			lo = MAX(pfnlo, mseg->pages_base);
3479 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3480 		}
3481 
3482 		/* round to szcpgcnt boundaries */
3483 		lo = P2ROUNDUP(lo, szcpgcnt);
3484 
3485 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3486 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3487 
3488 		if (hi <= lo)
3489 			continue;
3490 
3491 		/*
3492 		 * set lo to point to the pfn for the desired bin. Large
3493 		 * page sizes may only have a single page color
3494 		 */
3495 		skip = szcpgcnt;
3496 		if (ceq_mask > 0 || interleaved_mnodes) {
3497 			/* set lo to point at appropriate color */
3498 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3499 			    (interleaved_mnodes &&
3500 			    PFN_2_MEM_NODE(lo) != mnode)) {
3501 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3502 				    color_mask, &it);
3503 			}
3504 			if (hi <= lo)
3505 				/* mseg cannot satisfy color request */
3506 				continue;
3507 		}
3508 
3509 		/* randomly choose a point between lo and hi to begin search */
3510 
3511 		randpfn = (pfn_t)GETTICK();
3512 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3513 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3514 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3515 			if (randpfn != (pfn_t)-1) {
3516 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3517 				    ceq_mask, color_mask, &it);
3518 			}
3519 			if (randpfn >= hi) {
3520 				randpfn = lo;
3521 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3522 				    &it);
3523 			}
3524 		}
3525 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3526 
3527 		ASSERT(randpp->p_pagenum == randpfn);
3528 
3529 		pp = randpp;
3530 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3531 
3532 		ASSERT(randpp + szcpgcnt <= endpp);
3533 
3534 		do {
3535 			ASSERT(!(pp->p_pagenum & szcpgmask));
3536 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3537 
3538 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3539 				/* pages unlocked by page_claim on failure */
3540 				if (page_claim_contig_pages(pp, szc, flags)) {
3541 					memsegs_unlock(0);
3542 					return (pp);
3543 				}
3544 			}
3545 
3546 			if (ceq_mask == 0 && !interleaved_mnodes) {
3547 				pp += skip;
3548 			} else {
3549 				pfn_t pfn = pp->p_pagenum;
3550 
3551 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3552 				    ceq_mask, color_mask, &it);
3553 				if (pfn == (pfn_t)-1) {
3554 					pp = endpp;
3555 				} else {
3556 					pp = mseg->pages +
3557 					    (pfn - mseg->pages_base);
3558 				}
3559 			}
3560 			if (pp >= endpp) {
3561 				/* start from the beginning */
3562 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3563 				pp = mseg->pages + (lo - mseg->pages_base);
3564 				ASSERT(pp->p_pagenum == lo);
3565 				ASSERT(pp + szcpgcnt <= endpp);
3566 			}
3567 		} while (pp != randpp);
3568 	}
3569 	memsegs_unlock(0);
3570 	return (NULL);
3571 }
3572 
3573 
3574 /*
3575  * controlling routine that searches through physical memory in an attempt to
3576  * claim a large page based on the input parameters when one is not available
3577  * on the page free lists.
3578  *
3579  * calls page_geti_contig_pages with an initial pfn range from the mnode
3580  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3581  * that overlaps with the kernel cage or does not match the requested page
3582  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3583  * page_geti_contig_pages may further limit the search range based on
3584  * previous failure counts (pgcpfailcnt[]).
3585  *
3586  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3587  * pagesize page that satisfies mtype.
3588  */
3589 page_t *
3590 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3591     uint_t flags)
3592 {
3593 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3594 	page_t		*pp;
3595 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3596 
3597 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3598 
3599 	/* no allocations from cage */
3600 	flags |= PGI_NOCAGE;
3601 
3602 	/* LINTED */
3603 	MTYPE_START(mnode, mtype, flags);
3604 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3605 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3606 		return (NULL);
3607 	}
3608 
3609 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3610 
3611 	/* do not limit search and ignore color if hi pri */
3612 
3613 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3614 		pfnflag = pgcpfailcnt[szc];
3615 
3616 	/* remove color match to improve chances */
3617 
3618 	if (flags & PGI_PGCPHIPRI || pfnflag)
3619 		flags &= ~PG_MATCH_COLOR;
3620 
3621 	do {
3622 		/* get pfn range based on mnode and mtype */
3623 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3624 
3625 		ASSERT(pfnhi >= pfnlo);
3626 
3627 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3628 		    pfnlo, pfnhi, pfnflag);
3629 
3630 		if (pp != NULL) {
3631 			pfnflag = pgcpfailcnt[szc];
3632 			if (pfnflag) {
3633 				/* double the search size */
3634 				pgcpfailcnt[szc] = pfnflag >> 1;
3635 			}
3636 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3637 			return (pp);
3638 		}
3639 		MTYPE_NEXT(mnode, mtype, flags);
3640 	} while (mtype >= 0);
3641 
3642 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3643 	return (NULL);
3644 }
3645 
3646 #if defined(__i386) || defined(__amd64)
3647 /*
3648  * Determine the likelihood of finding/coalescing a szc page.
3649  * Return 0 if the likelihood is small otherwise return 1.
3650  *
3651  * For now, be conservative and check only 1g pages and return 0
3652  * if there had been previous coalescing failures and the szc pages
3653  * needed to satisfy request would exhaust most of freemem.
3654  */
3655 int
3656 page_chk_freelist(uint_t szc)
3657 {
3658 	pgcnt_t		pgcnt;
3659 
3660 	if (szc <= 1)
3661 		return (1);
3662 
3663 	pgcnt = page_get_pagecnt(szc);
3664 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3665 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3666 		return (0);
3667 	}
3668 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3669 	return (1);
3670 }
3671 #endif
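/*
 * page_chk_freelist() serves as a cheap likelihood check before an expensive
 * large-page attempt.  A hypothetical caller (szc assumed to be in scope)
 * might fall back to smaller pages when the check fails:
 */
#if 0	/* illustrative sketch only; not compiled (x86 only, like the check above) */
	if (!page_chk_freelist(szc)) {
		/*
		 * Prior coalescing failures plus low freemem make a szc page
		 * unlikely; don't pay for the search, use smaller pages.
		 */
		szc = 0;
	}
#endif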
3672 
3673 /*
3674  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3675  *
3676  * Does its own locking and accounting.
3677  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3678  * pages of the proper color even if there are pages of a different color.
3679  *
3680  * Finds a page, removes it, THEN locks it.
3681  */
3682 
3683 /*ARGSUSED*/
3684 page_t *
3685 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3686 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3687 {
3688 	struct as	*as = seg->s_as;
3689 	page_t		*pp = NULL;
3690 	ulong_t		bin;
3691 	uchar_t		szc;
3692 	int		mnode;
3693 	int		mtype;
3694 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3695 	lgrp_mnode_cookie_t	lgrp_cookie;
3696 
3697 	page_get_func = page_get_mnode_freelist;
3698 
3699 	/*
3700 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3701 	 * assume we wish to allocate near to the current thread's home.
3702 	 */
3703 	if (!LGRP_EXISTS(lgrp))
3704 		lgrp = lgrp_home_lgrp();
3705 
3706 	if (kcage_on) {
3707 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3708 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3709 		    curthread != kcage_cageout_thread) {
3710 			/*
3711 			 * Set a "reserve" of kcage_throttlefree pages for
3712 			 * PG_PANIC and cageout thread allocations.
3713 			 *
3714 			 * Everybody else has to serialize in
3715 			 * page_create_get_something() to get a cage page, so
3716 			 * that we don't deadlock cageout!
3717 			 */
3718 			return (NULL);
3719 		}
3720 	} else {
3721 		flags &= ~PG_NORELOC;
3722 		flags |= PGI_NOCAGE;
3723 	}
3724 
3725 	/* LINTED */
3726 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3727 
3728 	/*
3729 	 * Convert size to page size code.
3730 	 */
3731 	if ((szc = page_szc(size)) == (uchar_t)-1)
3732 		panic("page_get_freelist: illegal page size request");
3733 	ASSERT(szc < mmu_page_sizes);
3734 
3735 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3736 
3737 	/* LINTED */
3738 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3739 
3740 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3741 
3742 	/*
3743 	 * Try to get a local page first, but try remote if we can't
3744 	 * get a page of the right color.
3745 	 */
3746 pgretry:
3747 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3748 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3749 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3750 		if (pp != NULL) {
3751 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3752 			DTRACE_PROBE4(page__get,
3753 			    lgrp_t *, lgrp,
3754 			    int, mnode,
3755 			    ulong_t, bin,
3756 			    uint_t, flags);
3757 			return (pp);
3758 		}
3759 	}
3760 	ASSERT(pp == NULL);
3761 
3762 	/*
3763 	 * For PAGESIZE requests that are not PGI_PGCPSZC0, check the cachelist
3764 	 * before checking remote free lists.  The caller is expected to call
3765 	 * page_get_cachelist to check local cache lists and remote free lists.
3766 	 */
3767 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3768 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3769 		return (NULL);
3770 	}
3771 
3772 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3773 
3774 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3775 
3776 	if (!(flags & PG_LOCAL)) {
3777 		/*
3778 		 * Try to get a non-local freelist page.
3779 		 */
3780 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3781 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3782 			pp = page_get_func(mnode, bin, mtype, szc, flags);
3783 			if (pp != NULL) {
3784 				DTRACE_PROBE4(page__get,
3785 				    lgrp_t *, lgrp,
3786 				    int, mnode,
3787 				    ulong_t, bin,
3788 				    uint_t, flags);
3789 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3790 				return (pp);
3791 			}
3792 		}
3793 		ASSERT(pp == NULL);
3794 	}
3795 
3796 	/*
3797 	 * When the cage is off, chances are page_get_contig_pages() will fail
3798 	 * to lock a large page chunk, so it is not called by default in that
3799 	 * case.  This can be changed via /etc/system.
3800 	 *
3801 	 * page_get_contig_pages() is also called to get a base pagesize page
3802 	 * for page_create_get_something().
3803 	 */
3804 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3805 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3806 	    (page_get_func != page_get_contig_pages)) {
3807 
3808 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3809 		page_get_func = page_get_contig_pages;
3810 		goto pgretry;
3811 	}
3812 
3813 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3814 	    page_get_func == page_get_contig_pages)
3815 		SETPGCPFAILCNT(szc);
3816 
3817 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3818 	return (NULL);
3819 }
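/*
 * A hypothetical PAGESIZE caller of page_get_freelist(), showing the
 * fallback described above: when the freelist code defers, the caller is
 * expected to try page_get_cachelist() next (vp, off, seg, vaddr, flags and
 * lgrp are assumed to be in scope):
 */
#if 0	/* illustrative sketch only; not compiled */
	page_t *pp;

	pp = page_get_freelist(vp, off, seg, vaddr, MMU_PAGESIZE, flags, lgrp);
	if (pp == NULL)
		pp = page_get_cachelist(vp, off, seg, vaddr, flags, lgrp);
#endif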
3820 
3821 /*
3822  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3823  *
3824  * Does its own locking.
3825  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3826  * pages of the proper color even if there are pages of a different color.
3827  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3828  * try to lock one of them.  If no page can be locked, try the
3829  * next bin.  Return NULL if a page cannot be found and locked.
3830  *
3831  * Finds a page, tries to lock it, then removes it.
3832  */
3833 
3834 /*ARGSUSED*/
3835 page_t *
3836 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3837     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3838 {
3839 	page_t		*pp;
3840 	struct as	*as = seg->s_as;
3841 	ulong_t		bin;
3842 	/*LINTED*/
3843 	int		mnode;
3844 	int		mtype;
3845 	lgrp_mnode_cookie_t	lgrp_cookie;
3846 
3847 	/*
3848 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3849 	 * assume we wish to allocate near to the current thread's home.
3850 	 */
3851 	if (!LGRP_EXISTS(lgrp))
3852 		lgrp = lgrp_home_lgrp();
3853 
3854 	if (!kcage_on) {
3855 		flags &= ~PG_NORELOC;
3856 		flags |= PGI_NOCAGE;
3857 	}
3858 
3859 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3860 	    kcage_freemem <= kcage_throttlefree) {
3861 		/*
3862 		 * Reserve kcage_throttlefree pages for critical kernel
3863 		 * threads.
3864 		 *
3865 		 * Everybody else has to go to page_create_get_something()
3866 		 * to get a cage page, so we don't deadlock cageout.
3867 		 */
3868 		return (NULL);
3869 	}
3870 
3871 	/* LINTED */
3872 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3873 
3874 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3875 
3876 	/* LINTED */
3877 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3878 
3879 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3880 
3881 	/*
3882 	 * Try local cachelists first
3883 	 */
3884 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3885 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3886 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3887 		if (pp != NULL) {
3888 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3889 			DTRACE_PROBE4(page__get,
3890 			    lgrp_t *, lgrp,
3891 			    int, mnode,
3892 			    ulong_t, bin,
3893 			    uint_t, flags);
3894 			return (pp);
3895 		}
3896 	}
3897 
3898 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3899 
3900 	/*
3901 	 * Try freelists/cachelists that are farther away.
3902 	 * This is our only chance to allocate remote pages for PAGESIZE
3903 	 * requests.
3904 	 */
3905 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3906 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3907 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3908 		    0, flags);
3909 		if (pp != NULL) {
3910 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3911 			DTRACE_PROBE4(page__get,
3912 			    lgrp_t *, lgrp,
3913 			    int, mnode,
3914 			    ulong_t, bin,
3915 			    uint_t, flags);
3916 			return (pp);
3917 		}
3918 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3919 		if (pp != NULL) {
3920 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3921 			DTRACE_PROBE4(page__get,
3922 			    lgrp_t *, lgrp,
3923 			    int, mnode,
3924 			    ulong_t, bin,
3925 			    uint_t, flags);
3926 			return (pp);
3927 		}
3928 	}
3929 
3930 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3931 	return (NULL);
3932 }
3933 
3934 page_t *
3935 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3936 {
3937 	kmutex_t		*pcm;
3938 	page_t			*pp, *first_pp;
3939 	uint_t			sbin;
3940 	int			plw_initialized;
3941 	page_list_walker_t	plw;
3942 
3943 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3944 
3945 	/* LINTED */
3946 	MTYPE_START(mnode, mtype, flags);
3947 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3948 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3949 		return (NULL);
3950 	}
3951 
3952 try_again:
3953 
3954 	plw_initialized = 0;
3955 	plw.plw_ceq_dif = 1;
3956 
3957 	/*
3958 	 * Only hold one cachelist lock at a time; that way we
3959 	 * can start anywhere and not have to worry about lock
3960 	 * ordering.
3961 	 */
3962 
3963 	for (plw.plw_count = 0;
3964 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3965 		sbin = bin;
3966 		do {
3967 
3968 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
3969 				goto bin_empty_1;
3970 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3971 			mutex_enter(pcm);
3972 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3973 			if (pp == NULL)
3974 				goto bin_empty_0;
3975 
3976 			first_pp = pp;
3977 			ASSERT(pp->p_vnode);
3978 			ASSERT(PP_ISAGED(pp) == 0);
3979 			ASSERT(pp->p_szc == 0);
3980 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3981 			while (!page_trylock(pp, SE_EXCL)) {
3982 				pp = pp->p_next;
3983 				ASSERT(pp->p_szc == 0);
3984 				if (pp == first_pp) {
3985 					/*
3986 					 * We have searched the complete list!
3987 					 * And all of them (might only be one)
3988 					 * are locked. This can happen since
3989 					 * these pages can also be found via
3990 					 * the hash list. When found via the
3991 					 * hash list, they are locked first,
3992 					 * then removed. We give up to let the
3993 					 * other thread run.
3994 					 */
3995 					pp = NULL;
3996 					break;
3997 				}
3998 				ASSERT(pp->p_vnode);
3999 				ASSERT(PP_ISFREE(pp));
4000 				ASSERT(PP_ISAGED(pp) == 0);
4001 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4002 				    mnode);
4003 			}
4004 
4005 			if (pp) {
4006 				page_t	**ppp;
4007 				/*
4008 				 * Found and locked a page.
4009 				 * Pull it off the list.
4010 				 */
4011 				ASSERT(mtype == PP_2_MTYPE(pp));
4012 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4013 				page_sub(ppp, pp);
4014 				/*
4015 				 * Subtract counters before releasing pcm mutex
4016 				 * to avoid a race with page_freelist_coalesce
4017 				 * and page_freelist_split.
4018 				 */
4019 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4020 				mutex_exit(pcm);
4021 				ASSERT(pp->p_vnode);
4022 				ASSERT(PP_ISAGED(pp) == 0);
4023 #if defined(__sparc)
4024 				ASSERT(!kcage_on ||
4025 				    (flags & PG_NORELOC) == 0 ||
4026 				    PP_ISNORELOC(pp));
4027 				if (PP_ISNORELOC(pp)) {
4028 					kcage_freemem_sub(1);
4029 				}
4030 #endif
4031 				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
4032 				return (pp);
4033 			}
4034 bin_empty_0:
4035 			mutex_exit(pcm);
4036 bin_empty_1:
4037 			if (plw_initialized == 0) {
4038 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
4039 				plw_initialized = 1;
4040 			}
4041 			/* calculate the next bin with equivalent color */
4042 			bin = ADD_MASKED(bin, plw.plw_bin_step,
4043 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
4044 		} while (sbin != bin);
4045 
4046 		if (plw.plw_ceq_dif > 1)
4047 			bin = page_list_walk_next_bin(0, bin, &plw);
4048 	}
4049 
4050 	MTYPE_NEXT(mnode, mtype, flags);
4051 	if (mtype >= 0)
4052 		goto try_again;
4053 
4054 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4055 	return (NULL);
4056 }
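/*
 * The inner do/while above visits the bins in one color-equivalence class
 * exactly once: ADD_MASKED() steps to the next equivalent bin and wraps, so
 * the walk terminates when it returns to the saved starting bin.  Reduced to
 * its skeleton (start_bin, step, ceq_mask, color_mask and visit_bin() are
 * hypothetical stand-ins):
 */
#if 0	/* illustrative sketch only; not compiled */
	uint_t bin = start_bin;
	uint_t sbin = start_bin;

	do {
		visit_bin(bin);			/* one equivalent-color bin */
		bin = ADD_MASKED(bin, step, ceq_mask, color_mask);
	} while (bin != sbin);
#endif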
4057 
4058 #ifdef DEBUG
4059 #define	REPL_PAGE_STATS
4060 #endif /* DEBUG */
4061 
4062 #ifdef REPL_PAGE_STATS
4063 struct repl_page_stats {
4064 	uint_t	ngets;
4065 	uint_t	ngets_noreloc;
4066 	uint_t	npgr_noreloc;
4067 	uint_t	nnopage_first;
4068 	uint_t	nnopage;
4069 	uint_t	nhashout;
4070 	uint_t	nnofree;
4071 	uint_t	nnext_pp;
4072 } repl_page_stats;
4073 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
4074 #else /* REPL_PAGE_STATS */
4075 #define	REPL_STAT_INCR(v)
4076 #endif /* REPL_PAGE_STATS */
4077 
4078 int	pgrppgcp;
4079 
4080 /*
4081  * The freemem accounting must be done by the caller.
4082  * First we try to get a replacement page of the same size as like_pp;
4083  * if that is not possible, then we just get a set of discontiguous
4084  * PAGESIZE pages.
4085  */
4086 page_t *
4087 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4088     uint_t pgrflags)
4089 {
4090 	page_t		*like_pp;
4091 	page_t		*pp, *pplist;
4092 	page_t		*pl = NULL;
4093 	ulong_t		bin;
4094 	int		mnode, page_mnode;
4095 	int		szc;
4096 	spgcnt_t	npgs, pg_cnt;
4097 	pfn_t		pfnum;
4098 	int		mtype;
4099 	int		flags = 0;
4100 	lgrp_mnode_cookie_t	lgrp_cookie;
4101 	lgrp_t		*lgrp;
4102 
4103 	REPL_STAT_INCR(ngets);
4104 	like_pp = orig_like_pp;
4105 	ASSERT(PAGE_EXCL(like_pp));
4106 
4107 	szc = like_pp->p_szc;
4108 	npgs = page_get_pagecnt(szc);
4109 	/*
4110 	 * Now we reset like_pp to the base page_t.
4111 	 * That way, we won't walk past the end of this 'szc' page.
4112 	 */
4113 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4114 	like_pp = page_numtopp_nolock(pfnum);
4115 	ASSERT(like_pp->p_szc == szc);
4116 
4117 	if (PP_ISNORELOC(like_pp)) {
4118 		ASSERT(kcage_on);
4119 		REPL_STAT_INCR(ngets_noreloc);
4120 		flags = PGI_RELOCONLY;
4121 	} else if (pgrflags & PGR_NORELOC) {
4122 		ASSERT(kcage_on);
4123 		REPL_STAT_INCR(npgr_noreloc);
4124 		flags = PG_NORELOC;
4125 	}
4126 
4127 	/*
4128 	 * Kernel pages must always be replaced with the same size
4129 	 * pages, since we cannot properly handle demotion of kernel
4130 	 * pages.
4131 	 */
4132 	if (PP_ISKAS(like_pp))
4133 		pgrflags |= PGR_SAMESZC;
4134 
4135 	/* LINTED */
4136 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4137 
4138 	while (npgs) {
4139 		pplist = NULL;
4140 		for (;;) {
4141 			pg_cnt = page_get_pagecnt(szc);
4142 			bin = PP_2_BIN(like_pp);
4143 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4144 			ASSERT(pg_cnt <= npgs);
4145 
4146 			/*
4147 			 * If an lgroup was specified, try to get the
4148 			 * page from that lgroup.
4149 			 * NOTE: Must be careful with code below because
4150 			 *	 lgroup may disappear and reappear since there
4151 			 *	 is no locking for lgroup here.
4152 			 */
4153 			if (LGRP_EXISTS(lgrp_target)) {
4154 				/*
4155 				 * Keep local variable for lgroup separate
4156 				 * from lgroup argument since this code should
4157 				 * only be exercised when lgroup argument
4158 				 * exists....
4159 				 */
4160 				lgrp = lgrp_target;
4161 
4162 				/* Try the lgroup's freelists first */
4163 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4164 				    LGRP_SRCH_LOCAL);
4165 				while ((pplist == NULL) &&
4166 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4167 				    != -1) {
4168 					pplist =
4169 					    page_get_mnode_freelist(mnode, bin,
4170 					    mtype, szc, flags);
4171 				}
4172 
4173 				/*
4174 				 * Now try its cachelists if this is a
4175 				 * small page. Don't need to do it for
4176 				 * larger ones since page_freelist_coalesce()
4177 				 * already failed.
4178 				 */
4179 				if (pplist != NULL || szc != 0)
4180 					break;
4181 
4182 				/* Now try its cachelists */
4183 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4184 				    LGRP_SRCH_LOCAL);
4185 
4186 				while ((pplist == NULL) &&
4187 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4188 				    != -1) {
4189 					pplist =
4190 					    page_get_mnode_cachelist(bin, flags,
4191 					    mnode, mtype);
4192 				}
4193 				if (pplist != NULL) {
4194 					page_hashout(pplist, NULL);
4195 					PP_SETAGED(pplist);
4196 					REPL_STAT_INCR(nhashout);
4197 					break;
4198 				}
4199 				/* Done looking in this lgroup. Bail out. */
4200 				break;
4201 			}
4202 
4203 			/*
4204 			 * No lgroup was specified (or the lgroup was removed
4205 			 * by DR), so just try to get the page as close to
4206 			 * like_pp's mnode as possible.
4207 			 * First try the local freelist...
4208 			 */
4209 			mnode = PP_2_MEM_NODE(like_pp);
4210 			pplist = page_get_mnode_freelist(mnode, bin,
4211 			    mtype, szc, flags);
4212 			if (pplist != NULL)
4213 				break;
4214 
4215 			REPL_STAT_INCR(nnofree);
4216 
4217 			/*
4218 			 * ...then the local cachelist. Don't need to do it for
4219 			 * larger pages because page_freelist_coalesce() already
4220 			 * failed there anyway.
4221 			 */
4222 			if (szc == 0) {
4223 				pplist = page_get_mnode_cachelist(bin, flags,
4224 				    mnode, mtype);
4225 				if (pplist != NULL) {
4226 					page_hashout(pplist, NULL);
4227 					PP_SETAGED(pplist);
4228 					REPL_STAT_INCR(nhashout);
4229 					break;
4230 				}
4231 			}
4232 
4233 			/* Now try remote freelists */
4234 			page_mnode = mnode;
4235 			lgrp =
4236 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4237 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4238 			    LGRP_SRCH_HIER);
4239 			while (pplist == NULL &&
4240 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4241 			    != -1) {
4242 				/*
4243 				 * Skip local mnode.
4244 				 */
4245 				if ((mnode == page_mnode) ||
4246 				    (mem_node_config[mnode].exists == 0))
4247 					continue;
4248 
4249 				pplist = page_get_mnode_freelist(mnode,
4250 				    bin, mtype, szc, flags);
4251 			}
4252 
4253 			if (pplist != NULL)
4254 				break;
4255 
4256 
4257 			/* Now try remote cachelists */
4258 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4259 			    LGRP_SRCH_HIER);
4260 			while (pplist == NULL && szc == 0) {
4261 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4262 				if (mnode == -1)
4263 					break;
4264 				/*
4265 				 * Skip local mnode.
4266 				 */
4267 				if ((mnode == page_mnode) ||
4268 				    (mem_node_config[mnode].exists == 0))
4269 					continue;
4270 
4271 				pplist = page_get_mnode_cachelist(bin,
4272 				    flags, mnode, mtype);
4273 
4274 				if (pplist != NULL) {
4275 					page_hashout(pplist, NULL);
4276 					PP_SETAGED(pplist);
4277 					REPL_STAT_INCR(nhashout);
4278 					break;
4279 				}
4280 			}
4281 
4282 			/*
4283 			 * Break out of while loop under the following cases:
4284 			 * - If we successfully got a page.
4285 			 * - If pgrflags specified only returning a specific
4286 			 *   page size and we could not find that page size.
4287 			 * - If we could not satisfy the request with PAGESIZE
4288 			 *   or larger pages.
4289 			 */
4290 			if (pplist != NULL || szc == 0)
4291 				break;
4292 
4293 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4294 				/* try to find contig page */
4295 
4296 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4297 				    LGRP_SRCH_HIER);
4298 
4299 				while ((pplist == NULL) &&
4300 				    (mnode =
4301 				    lgrp_memnode_choose(&lgrp_cookie))
4302 				    != -1) {
4303 					pplist = page_get_contig_pages(
4304 					    mnode, bin, mtype, szc,
4305 					    flags | PGI_PGCPHIPRI);
4306 				}
4307 				break;
4308 			}
4309 
4310 			/*
4311 			 * The correct thing to do here is try the next
4312 			 * page size down using szc--. Due to a bug
4313 			 * with the processing of HAT_RELOAD_SHARE
4314 			 * where the sfmmu_ttecnt arrays of all
4315 			 * hats sharing an ISM segment don't get updated,
4316 			 * using intermediate size pages for relocation
4317 			 * can lead to continuous page faults.
4318 			 */
4319 			szc = 0;
4320 		}
4321 
4322 		if (pplist != NULL) {
4323 			DTRACE_PROBE4(page__get,
4324 			    lgrp_t *, lgrp,
4325 			    int, mnode,
4326 			    ulong_t, bin,
4327 			    uint_t, flags);
4328 
4329 			while (pplist != NULL && pg_cnt--) {
4330 				ASSERT(pplist != NULL);
4331 				pp = pplist;
4332 				page_sub(&pplist, pp);
4333 				PP_CLRFREE(pp);
4334 				PP_CLRAGED(pp);
4335 				page_list_concat(&pl, &pp);
4336 				npgs--;
4337 				like_pp = like_pp + 1;
4338 				REPL_STAT_INCR(nnext_pp);
4339 			}
4340 			ASSERT(pg_cnt == 0);
4341 		} else {
4342 			break;
4343 		}
4344 	}
4345 
4346 	if (npgs) {
4347 		/*
4348 		 * We were unable to allocate the necessary number
4349 		 * of pages.
4350 		 * We need to free any pages already collected on pl.
4351 		 */
4352 		REPL_STAT_INCR(nnopage);
4353 		page_free_replacement_page(pl);
4354 		return (NULL);
4355 	} else {
4356 		return (pl);
4357 	}
4358 }
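/*
 * A hypothetical caller of page_get_replacement_page(): the target page must
 * already be held SE_EXCL, freemem accounting stays with the caller, and the
 * returned list is freed with page_free_replacement_page() if the later
 * relocation step fails (targ and do_relocation() are assumed names):
 */
#if 0	/* illustrative sketch only; not compiled */
	page_t *repl;

	ASSERT(PAGE_EXCL(targ));	/* required by the routine above */
	/* NULL lgroup: no placement preference, allocate near targ's mnode */
	repl = page_get_replacement_page(targ, NULL, PGR_SAMESZC);
	if (repl == NULL)
		return (ENOMEM);	/* couldn't assemble replacement pages */
	if (do_relocation(targ, repl) != 0)
		page_free_replacement_page(repl);
#endif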
4359 
4360 /*
4361  * demote a free large page to its constituent pages
4362  */
4363 void
4364 page_demote_free_pages(page_t *pp)
4365 {
4366 
4367 	int mnode;
4368 
4369 	ASSERT(pp != NULL);
4370 	ASSERT(PAGE_LOCKED(pp));
4371 	ASSERT(PP_ISFREE(pp));
4372 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4373 
4374 	mnode = PP_2_MEM_NODE(pp);
4375 	page_freelist_lock(mnode);
4376 	if (pp->p_szc != 0) {
4377 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4378 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4379 	}
4380 	page_freelist_unlock(mnode);
4381 	ASSERT(pp->p_szc == 0);
4382 }
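/*
 * A hypothetical use of page_demote_free_pages(): the caller holds the page
 * locked and free, and afterwards may treat it as a PAGESIZE page (pp is an
 * assumed name):
 */
#if 0	/* illustrative sketch only; not compiled */
	ASSERT(PAGE_LOCKED(pp) && PP_ISFREE(pp) && pp->p_szc != 0);
	page_demote_free_pages(pp);
	ASSERT(pp->p_szc == 0);		/* now a set of PAGESIZE free pages */
#endif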
4383 
4384 /*
4385  * Factor in colorequiv to check additional 'equivalent' bins.
4386  * colorequiv may be set in /etc/system
4387  */
4388 void
4389 page_set_colorequiv_arr(void)
4390 {
4391 	if (colorequiv > 1) {
4392 		int i;
4393 		uint_t sv_a = lowbit(colorequiv) - 1;
4394 
4395 		if (sv_a > 15)
4396 			sv_a = 15;
4397 
4398 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4399 			uint_t colors;
4400 			uint_t a = sv_a;
4401 
4402 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4403 				continue;
4404 			}
4405 			while ((colors >> a) == 0)
4406 				a--;
4407 			if ((a << 4) > colorequivszc[i]) {
4408 				colorequivszc[i] = (a << 4);
4409 			}
4410 		}
4411 	}
4412 }
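/*
 * The per-pagesize computation above, pulled out as a pure function for
 * illustration (colorequiv_encode() is a hypothetical name, not part of this
 * file).  For example, colorequiv = 4 with 32 hardware colors gives
 * a = lowbit(4) - 1 = 2; since 32 >> 2 != 0, the stored value is 2 << 4 = 0x20.
 */
#if 0	/* illustrative sketch only; not compiled */
static uchar_t
colorequiv_encode(uint_t equiv, uint_t colors)
{
	/* assumes equiv > 1 and colors > 1, as in the loop above */
	uint_t a = lowbit(equiv) - 1;

	if (a > 15)
		a = 15;
	while ((colors >> a) == 0)
		a--;
	return ((uchar_t)(a << 4));
}
#endif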
4413