xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision b60f2a0b921611326383e4789e0874e9e8a2e708)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * This file contains common functions to access and manage the page lists.
38  * Many of these routines originated from platform dependent modules
39  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
40  * a platform independent manner.
41  *
42  * vm/vm_dep.h provides for platform specific support.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/atomic.h>
50 #include <sys/sysmacros.h>
51 #include <vm/as.h>
52 #include <vm/page.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/seg_vn.h>
55 #include <sys/vmsystm.h>
56 #include <sys/memnode.h>
57 #include <vm/vm_dep.h>
58 #include <sys/lgrp.h>
59 #include <sys/mem_config.h>
60 #include <sys/callb.h>
61 #include <sys/mem_cage.h>
62 #include <sys/sdt.h>
63 
64 extern uint_t	vac_colors;
65 
66 #define	MAX_PRAGMA_ALIGN	128
67 
68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
69 
70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
71 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
72 #else
73 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
74 #endif
75 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
76 
77 /*
78  * Number of page colors treated as equivalent to the requested color in the
79  * page_get routines.  If set, this keeps large pages intact longer and favors
80  * MPO allocation from the local mnode over acquiring the 'correct' page color
81  * from a demoted large page or from a remote mnode.
82  */
83 uint_t	colorequiv;
84 
85 /*
86  * color equivalency mask for each page size.
87  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
88  * High 4 bits determine the number of high order bits of the color to ignore.
89  * Low 4 bits determine the number of low order bits of the color to ignore
90  * (only relevant for hashed index based page coloring).
91  */
92 uchar_t colorequivszc[MMU_PAGE_SIZES];
93 
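/*
 * For illustration (hypothetical value): an entry of 0x21 would mean
 * "ignore the 2 high order bits and the 1 low order bit of the color"
 * when testing color equivalence for that page size:
 *
 *	uchar_t ceq = 0x21;
 *	uint_t  high_ignored = ceq >> 4;	(2)
 *	uint_t  low_ignored  = ceq & 0xf;	(1)
 */
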
94 /*
95  * If set, specifies the percentage of pages that must be free within a
96  * large page region before attempting to lock those pages for
97  * page_get_contig_pages processing.
98  *
99  * Should be turned on when kpr is available, since page_trylock_contig_pages
100  * can then be more selective.
101  */
102 
103 int	ptcpthreshold;
104 
105 /*
106  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
107  * Enabled by default via pgcplimitsearch.
108  *
109  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
110  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
111  * bound. This upper bound range guarantees:
112  *    - all large page 'slots' will be searched over time
113  *    - at least one large page candidate is considered on each pgcp call
114  *    - count doesn't wrap around to 0
115  */
116 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
117 int	pgcplimitsearch = 1;
118 
119 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
120 #define	SETPGCPFAILCNT(szc)						\
121 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
122 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
123 
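/*
 * For illustration (hypothetical value): with physinstalled = 0x180000
 * pages, highbit(0x180000) is 21, so PGCPFAILMAX is 1 << 20 = 0x100000,
 * which is >= 1/2 of physinstalled.  Once pgcpfailcnt[szc] reaches that
 * value, SETPGCPFAILCNT(szc) resets it to 0x80000 instead of letting it
 * grow without bound or wrap to 0.
 */
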
124 #ifdef VM_STATS
125 struct vmm_vmstats_str  vmm_vmstats;
126 
127 #endif /* VM_STATS */
128 
129 #if defined(__sparc)
130 #define	LPGCREATE	0
131 #else
132 /* enable page_get_contig_pages */
133 #define	LPGCREATE	1
134 #endif
135 
136 int pg_contig_disable;
137 int pg_lpgcreate_nocage = LPGCREATE;
138 
139 /*
140  * page_freelist_split pfn flag to signify no hi pfn requirement.
141  */
142 #define	PFNNULL		0
143 
144 /* Flags involved in promotion and demotion routines */
145 #define	PC_FREE		0x1	/* put page on freelist */
146 #define	PC_ALLOC	0x2	/* return page for allocation */
147 
148 /*
149  * Flag for page_demote to be used with PC_FREE to denote that we don't care
150  * what the color is, as the color parameter to the function is ignored.
151  */
152 #define	PC_NO_COLOR	(-1)
153 
154 /* mtype value for page_promote to use when mtype does not matter */
155 #define	PC_MTYPE_ANY	(-1)
156 
157 /*
158  * page counters candidates info
159  * See page_ctrs_cands comment below for more details.
160  * fields are as follows:
161  *	pcc_pages_free:		# pages which freelist coalesce can create
162  *	pcc_color_free:		pointer to page free counts per color
163  */
164 typedef struct pcc_info {
165 	pgcnt_t	pcc_pages_free;
166 	pgcnt_t	*pcc_color_free;
167 } pcc_info_t;
168 
169 /*
170  * On big machines it can take a long time to check page_counters
171  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
172  * updated sum of all elements of the corresponding page_counters arrays.
173  * page_freelist_coalesce() searches page_counters only if an appropriate
174  * element of page_ctrs_cands array is greater than 0.
175  *
176  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
177  */
178 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
179 
180 /*
181  * Return in val the total number of free pages which can be created
182  * for the given mnode (m), mrange (g), and region size (r)
183  */
184 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
185 	int i;								\
186 	val = 0;							\
187 	for (i = 0; i < NPC_MUTEX; i++) {				\
188 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
189 	}								\
190 }
191 
192 /*
193  * Return in val the total number of free pages which can be created
194  * for the given mnode (m), mrange (g), region size (r), and color (c)
195  */
196 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
197 	int i;								\
198 	val = 0;							\
199 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
200 	for (i = 0; i < NPC_MUTEX; i++) {				\
201 	    val +=							\
202 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
203 	}								\
204 }
205 
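/*
 * A minimal usage sketch (assuming the caller already knows mnode,
 * mrange, and region size r): consult the summary before paying for a
 * full page_counters scan, as page_freelist_coalesce() does.
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
 *	if (cands == 0)
 *		return (NULL);		(nothing to coalesce at size r)
 */
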
206 /*
207  * We can only allow a single thread to update a counter within the physical
208  * range of the largest supported page size. That is the finest granularity
209  * possible since the counter values are dependent on each other
210  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
211  * ctr_mutex lock index for a particular physical range.
212  */
213 static kmutex_t	*ctr_mutex[NPC_MUTEX];
214 
215 #define	PP_CTR_LOCK_INDX(pp)						\
216 	(((pp)->p_pagenum >>						\
217 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
218 
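/*
 * A minimal usage sketch, mirroring page_ctr_add()/page_ctr_sub() below:
 *
 *	int		lckidx = PP_CTR_LOCK_INDX(pp);
 *	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
 *
 *	mutex_enter(lock);
 *	(update PAGE_COUNTERS() and page_ctrs_cands for pp's range)
 *	mutex_exit(lock);
 */
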
219 #define	INVALID_COLOR 0xffffffff
220 #define	INVALID_MASK  0xffffffff
221 
222 /*
223  * Local function prototypes.
224  */
225 
226 void page_ctr_add(int, int, page_t *, int);
227 void page_ctr_add_internal(int, int, page_t *, int);
228 void page_ctr_sub(int, int, page_t *, int);
229 void page_ctr_sub_internal(int, int, page_t *, int);
230 void page_freelist_lock(int);
231 void page_freelist_unlock(int);
232 page_t *page_promote(int, pfn_t, uchar_t, int, int);
233 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
234 page_t *page_freelist_split(uchar_t,
235     uint_t, int, int, pfn_t, page_list_walker_t *);
236 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
237 static int page_trylock_cons(page_t *pp, se_t se);
238 
239 /*
240  * The page_counters array below is used to keep track of free contiguous
241  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
242  * This contains an array of counters, the size of the array, a shift value
243  * used to convert a pagenum into a counter array index or vice versa, as
244  * well as a cache of the last successful index to be promoted to a larger
245  * page size.  As an optimization, we keep track of the last successful index
246  * to be promoted per page color for the given size region, and this is
247  * allocated dynamically based upon the number of colors for a given
248  * region size.
249  *
250  * Conceptually, the page counters are represented as:
251  *
252  *	page_counters[region_size][mnode]
253  *
254  *	region_size:	size code of a candidate larger page made up
255  *			of contiguous free smaller pages.
256  *
257  *	page_counters[region_size][mnode].hpm_counters[index]:
258  *		represents how many (region_size - 1) pages either
259  *		exist or can be created within the given index range.
260  *
261  * Let's look at a sparc example:
262  *	If we want to create a free 512k page, we look at region_size 2
263  *	for the mnode we want.  We calculate the index and look at a specific
264  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
265  *	this location, it means that 8 64k pages either exist or can be created
266  *	from 8K pages in order to make a single free 512k page at the given
267  *	index.  Note that when a region is full, it will contribute to the
268  *	counts in the region above it.  Thus, unless we look at all regions
269  *	below the current one, we will not know what size the free pages
270  *	that can be promoted into this new free page will be.
271  */
272 
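/*
 * The 512k sparc example above, sketched with the macros defined later
 * in this file (page_promote_size() is the real idiom; the freelist
 * must be locked around the check and promote):
 *
 *	int	r = 2;				(512k region size code)
 *	size_t	idx = PNUM_TO_IDX(mnode, r, pfn);
 *
 *	if (PAGE_COUNTERS(mnode, r, idx) == FULL_REGION_CNT(r))
 *		(void) page_promote(mnode, pfn, r, PC_FREE, PC_MTYPE_ANY);
 */
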
273 /*
274  * Note: hpmctr_t is defined in platform vm_dep.h
275  * hw_page_map_t contains all the information needed for the page_counters
276  * logic. The fields are as follows:
277  *
278  *	hpm_counters:	dynamically allocated array to hold counter data
279  *	hpm_entries:	entries in hpm_counters
280  *	hpm_shift:	shift for pnum/array index conv
281  *	hpm_base:	PFN mapped to counter index 0
282  *	hpm_color_current:	last index in counter array for this color at
283  *				which we successfully created a large page
284  */
285 typedef struct hw_page_map {
286 	hpmctr_t	*hpm_counters;
287 	size_t		hpm_entries;
288 	int		hpm_shift;
289 	pfn_t		hpm_base;
290 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
291 } hw_page_map_t;
292 
293 /*
294  * Element zero is not used, but is allocated for convenience.
295  */
296 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
297 
298 /*
299  * Cached value of MNODE_RANGE_CNT(mnode).
300  * This is a function call in x86.
301  * This is a function call on x86.
302 static int mnode_nranges[MAX_MEM_NODES];
303 static int mnode_maxmrange[MAX_MEM_NODES];
304 
305 /*
306  * The following macros are convenient ways to get access to the individual
307  * elements of the page_counters arrays.  They can be used on both
308  * the left side and the right side of assignments.
309  */
310 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
311 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
312 
313 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
314 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
315 
316 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
317 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
318 
319 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
320 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
321 
322 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
323 	(page_counters[(rg_szc)][(mnode)].hpm_base)
324 
325 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
326 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
327 
328 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
329 	(page_counters[(rg_szc)][(mnode)].				\
330 	hpm_color_current[(mrange)][(color)])
331 
332 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
333 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
334 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
335 
336 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
337 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
338 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
339 
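/*
 * For illustration (hypothetical values): with hpm_base = 0x1000 and
 * hpm_shift = 3 (eight base pages per region), PNUM_TO_IDX(mnode, r,
 * 0x105a) is (0x5a >> 3) = 11, and IDX_TO_PNUM(mnode, r, 11) gives back
 * that region's starting pfn, 0x1000 + (11 << 3) = 0x1058.
 */
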
340 /*
341  * Protects the hpm_counters and hpm_color_current memory from changing while
342  * looking at page counters information.
343  * Grab the write lock to modify what these fields point at.
344  * Grab the read lock to prevent any pointers from changing.
345  * The write lock cannot be held during memory allocation due to a possible
346  * recursion deadlock: the allocation path may try to grab the read lock
347  * while the write lock is already held.
348  */
349 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
350 
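/*
 * Reader-side sketch: code that walks hpm_counters or hpm_color_current
 * takes the per-mnode lock as a reader, e.g.
 *
 *	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
 *	(read PAGE_COUNTERS() / PAGE_COUNTERS_CURRENT_COLOR())
 *	rw_exit(&page_ctrs_rwlock[mnode]);
 *
 * page_ctrs_adjust() takes it as a writer (via PAGE_CTRS_WRITE_LOCK)
 * when it swaps the underlying arrays.
 */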
351 
352 /*
353  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
354  */
355 void
356 cpu_vm_data_init(struct cpu *cp)
357 {
358 	if (cp == CPU0) {
359 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
360 	} else {
361 		void	*kmptr;
362 		int	align;
363 		size_t	sz;
364 
365 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
366 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
367 		kmptr = kmem_zalloc(sz, KM_SLEEP);
368 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
369 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
370 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
371 	}
372 }
373 
374 /*
375  * free cpu_vm_data
376  */
377 void
378 cpu_vm_data_destroy(struct cpu *cp)
379 {
380 	if (cp->cpu_seqid && cp->cpu_vm_data) {
381 		ASSERT(cp != CPU0);
382 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
383 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
384 	}
385 	cp->cpu_vm_data = NULL;
386 }
387 
388 
389 /*
390  * page size to page size code
391  */
392 int
393 page_szc(size_t pagesize)
394 {
395 	int	i = 0;
396 
397 	while (hw_page_array[i].hp_size) {
398 		if (pagesize == hw_page_array[i].hp_size)
399 			return (i);
400 		i++;
401 	}
402 	return (-1);
403 }
404 
405 /*
406  * page size to page size code with the restriction that it be a supported
407  * user page size.  If it's not a supported user page size, -1 will be returned.
408  */
409 int
410 page_szc_user_filtered(size_t pagesize)
411 {
412 	int szc = page_szc(pagesize);
413 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
414 		return (szc);
415 	}
416 	return (-1);
417 }
418 
419 /*
420  * Return how many page sizes are available for the user to use.  This is
421  * what the hardware supports and not based upon how the OS implements the
422  * support of different page sizes.
423  *
424  * If legacy is non-zero, return the number of pagesizes available to legacy
425  * applications. The number of legacy page sizes might be less than the
426  * exported user page sizes. This is to prevent legacy applications that
427  * use the largest page size returned from getpagesizes(3c) from inadvertently
428  * using the 'new' large pagesizes.
429  */
430 uint_t
431 page_num_user_pagesizes(int legacy)
432 {
433 	if (legacy)
434 		return (mmu_legacy_page_sizes);
435 	return (mmu_exported_page_sizes);
436 }
437 
438 uint_t
439 page_num_pagesizes(void)
440 {
441 	return (mmu_page_sizes);
442 }
443 
444 /*
445  * returns the count of the number of base pagesize pages associated with szc
446  */
447 pgcnt_t
448 page_get_pagecnt(uint_t szc)
449 {
450 	if (szc >= mmu_page_sizes)
451 		panic("page_get_pagecnt: out of range %d", szc);
452 	return (hw_page_array[szc].hp_pgcnt);
453 }
454 
455 size_t
456 page_get_pagesize(uint_t szc)
457 {
458 	if (szc >= mmu_page_sizes)
459 		panic("page_get_pagesize: out of range %d", szc);
460 	return (hw_page_array[szc].hp_size);
461 }
462 
463 /*
464  * Return the size of a page based upon the index passed in.  An index of
465  * zero refers to the smallest page size in the system, and as the index
466  * increases it refers to the next larger supported page size in the system.
467  * Note that szc and userszc may not be the same due to unsupported szc's on
468  * some systems.
469  */
470 size_t
471 page_get_user_pagesize(uint_t userszc)
472 {
473 	uint_t szc = USERSZC_2_SZC(userszc);
474 
475 	if (szc >= mmu_page_sizes)
476 		panic("page_get_user_pagesize: out of range %d", szc);
477 	return (hw_page_array[szc].hp_size);
478 }
479 
480 uint_t
481 page_get_shift(uint_t szc)
482 {
483 	if (szc >= mmu_page_sizes)
484 		panic("page_get_shift: out of range %d", szc);
485 	return (PAGE_GET_SHIFT(szc));
486 }
487 
488 uint_t
489 page_get_pagecolors(uint_t szc)
490 {
491 	if (szc >= mmu_page_sizes)
492 		panic("page_get_pagecolors: out of range %d", szc);
493 	return (PAGE_GET_PAGECOLORS(szc));
494 }
495 
496 /*
497  * this assigns the desired equivalent color after a split
498  */
499 uint_t
500 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
501     uint_t ncolor, uint_t ceq_mask)
502 {
503 	ASSERT(nszc > szc);
504 	ASSERT(szc < mmu_page_sizes);
505 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
506 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
507 
508 	color &= ceq_mask;
509 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
510 	return (color | (ncolor & ~ceq_mask));
511 }
512 
513 /*
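/*
 * For illustration (hypothetical values): if the requested color is 0x5,
 * ceq_mask is 0x3, and the converted parent color is 0xa, the result is
 * (0x5 & 0x3) | (0xa & ~0x3) = 0x1 | 0x8 = 0x9; the equivalence bits
 * come from the request and the remaining bits from the parent page.
 */
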
514  * The interleaved_mnodes flag is set when mnodes overlap in
515  * the physbase..physmax range, but have disjoint slices.
516  * In this case hpm_counters is shared by all mnodes.
517  * This flag is set dynamically by the platform.
518  */
519 int interleaved_mnodes = 0;
520 
521 /*
522  * Called by startup().
523  * Size up the per page size free list counters based on physmax
524  * of each node and max_mem_nodes.
525  *
526  * If interleaved_mnodes is set we need to find the first mnode that
527  * exists. hpm_counters for the first mnode will then be shared by
528  * all other mnodes. If interleaved_mnodes is not set, just set
529  * first=mnode each time. That means there will be no sharing.
530  */
531 size_t
532 page_ctrs_sz(void)
533 {
534 	int	r;		/* region size */
535 	int	mnode;
536 	int	firstmn;	/* first mnode that exists */
537 	int	nranges;
538 	pfn_t	physbase;
539 	pfn_t	physmax;
540 	uint_t	ctrs_sz = 0;
541 	int 	i;
542 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
543 
544 	/*
545 	 * We need to determine how many page colors there are for each
546 	 * page size in order to allocate memory for any color specific
547 	 * arrays.
548 	 */
549 	for (i = 0; i < mmu_page_sizes; i++) {
550 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
551 	}
552 
553 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
554 
555 		pgcnt_t r_pgcnt;
556 		pfn_t   r_base;
557 		pgcnt_t r_align;
558 
559 		if (mem_node_config[mnode].exists == 0)
560 			continue;
561 
562 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
563 		nranges = MNODE_RANGE_CNT(mnode);
564 		mnode_nranges[mnode] = nranges;
565 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
566 
567 		/*
568 		 * determine size needed for page counter arrays with
569 		 * base aligned to large page size.
570 		 */
571 		for (r = 1; r < mmu_page_sizes; r++) {
572 			/* add in space for hpm_color_current */
573 			ctrs_sz += sizeof (size_t) *
574 			    colors_per_szc[r] * nranges;
575 
576 			if (firstmn != mnode)
577 				continue;
578 
579 			/* add in space for hpm_counters */
580 			r_align = page_get_pagecnt(r);
581 			r_base = physbase;
582 			r_base &= ~(r_align - 1);
583 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
584 
585 			/*
586 			 * Round up to always allocate on pointer sized
587 			 * boundaries.
588 			 */
589 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
590 			    sizeof (hpmctr_t *));
591 		}
592 	}
593 
594 	for (r = 1; r < mmu_page_sizes; r++) {
595 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
596 	}
597 
598 	/* add in space for page_ctrs_cands and pcc_color_free */
599 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
600 	    mmu_page_sizes * NPC_MUTEX;
601 
602 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
603 
604 		if (mem_node_config[mnode].exists == 0)
605 			continue;
606 
607 		nranges = mnode_nranges[mnode];
608 		ctrs_sz += sizeof (pcc_info_t) * nranges *
609 		    mmu_page_sizes * NPC_MUTEX;
610 		for (r = 1; r < mmu_page_sizes; r++) {
611 			ctrs_sz += sizeof (pgcnt_t) * nranges *
612 			    colors_per_szc[r] * NPC_MUTEX;
613 		}
614 	}
615 
616 	/* ctr_mutex */
617 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
618 
619 	/* size for page list counts */
620 	PLCNT_SZ(ctrs_sz);
621 
622 	/*
623 	 * Add some slop for roundups. page_ctrs_alloc will round up the start
624 	 * address of the counters to an ecache_alignsize boundary for every
625 	 * memory node.
626 	 */
627 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
628 }
629 
630 caddr_t
631 page_ctrs_alloc(caddr_t alloc_base)
632 {
633 	int	mnode;
634 	int	mrange, nranges;
635 	int	r;		/* region size */
636 	int	i;
637 	int	firstmn;	/* first mnode that exists */
638 	pfn_t	physbase;
639 	pfn_t	physmax;
640 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
641 
642 	/*
643 	 * We need to determine how many page colors there are for each
644 	 * page size in order to allocate memory for any color specific
645 	 * arrays.
646 	 */
647 	for (i = 0; i < mmu_page_sizes; i++) {
648 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
649 	}
650 
651 	for (r = 1; r < mmu_page_sizes; r++) {
652 		page_counters[r] = (hw_page_map_t *)alloc_base;
653 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
654 	}
655 
656 	/* page_ctrs_cands and pcc_color_free array */
657 	for (i = 0; i < NPC_MUTEX; i++) {
658 		for (r = 1; r < mmu_page_sizes; r++) {
659 
660 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
661 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
662 
663 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
664 				pcc_info_t *pi;
665 
666 				if (mem_node_config[mnode].exists == 0)
667 					continue;
668 
669 				nranges = mnode_nranges[mnode];
670 
671 				pi = (pcc_info_t *)alloc_base;
672 				alloc_base += sizeof (pcc_info_t) * nranges;
673 				page_ctrs_cands[i][r][mnode] = pi;
674 
675 				for (mrange = 0; mrange < nranges; mrange++) {
676 					pi->pcc_color_free =
677 					    (pgcnt_t *)alloc_base;
678 					alloc_base += sizeof (pgcnt_t) *
679 					    colors_per_szc[r];
680 					pi++;
681 				}
682 			}
683 		}
684 	}
685 
686 	/* ctr_mutex */
687 	for (i = 0; i < NPC_MUTEX; i++) {
688 		ctr_mutex[i] = (kmutex_t *)alloc_base;
689 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
690 	}
691 
692 	/* initialize page list counts */
693 	PLCNT_INIT(alloc_base);
694 
695 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
696 
697 		pgcnt_t r_pgcnt;
698 		pfn_t	r_base;
699 		pgcnt_t r_align;
700 		int	r_shift;
701 		int	nranges = mnode_nranges[mnode];
702 
703 		if (mem_node_config[mnode].exists == 0)
704 			continue;
705 
706 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
707 
708 		for (r = 1; r < mmu_page_sizes; r++) {
709 			/*
710 			 * the page_counters base has to be aligned to the
711 			 * page count of page size code r otherwise the counts
712 			 * will cross large page boundaries.
713 			 */
714 			r_align = page_get_pagecnt(r);
715 			r_base = physbase;
716 			/* base needs to be aligned - lower to aligned value */
717 			r_base &= ~(r_align - 1);
718 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
719 			r_shift = PAGE_BSZS_SHIFT(r);
720 
721 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
722 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
723 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
724 			for (mrange = 0; mrange < nranges; mrange++) {
725 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
726 				    r, mrange) = (size_t *)alloc_base;
727 				alloc_base += sizeof (size_t) *
728 				    colors_per_szc[r];
729 			}
730 			for (i = 0; i < colors_per_szc[r]; i++) {
731 				uint_t color_mask = colors_per_szc[r] - 1;
732 				pfn_t  pfnum = r_base;
733 				size_t idx;
734 				int mrange;
735 				MEM_NODE_ITERATOR_DECL(it);
736 
737 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
738 				if (pfnum == (pfn_t)-1) {
739 					idx = 0;
740 				} else {
741 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
742 					    color_mask, color_mask, &it);
743 					idx = PNUM_TO_IDX(mnode, r, pfnum);
744 					idx = (idx >= r_pgcnt) ? 0 : idx;
745 				}
746 				for (mrange = 0; mrange < nranges; mrange++) {
747 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
748 					    r, i, mrange) = idx;
749 				}
750 			}
751 
752 			/* hpm_counters may be shared by all mnodes */
753 			if (firstmn == mnode) {
754 				PAGE_COUNTERS_COUNTERS(mnode, r) =
755 				    (hpmctr_t *)alloc_base;
756 				alloc_base +=
757 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
758 				    sizeof (hpmctr_t *));
759 			} else {
760 				PAGE_COUNTERS_COUNTERS(mnode, r) =
761 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
762 			}
763 
764 			/*
765 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
766 			 * satisfy the identity requirement.
767 			 * We should be able to go from one to the other
768 			 * and get consistent values.
769 			 */
770 			ASSERT(PNUM_TO_IDX(mnode, r,
771 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
772 			ASSERT(IDX_TO_PNUM(mnode, r,
773 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
774 		}
775 		/*
776 		 * Round up the start address of the page_counters to a
777 		 * cache aligned boundary for every memory node.
778 		 * page_ctrs_sz() has added some slop for these roundups.
779 		 */
780 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
781 		    L2CACHE_ALIGN);
782 	}
783 
784 	/* Initialize other page counter specific data structures. */
785 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
786 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
787 	}
788 
789 	return (alloc_base);
790 }
791 
792 /*
793  * Functions to adjust region counters for each size free list.
794  * The caller is responsible for acquiring the ctr_mutex lock if necessary;
795  * thus these routines can be called during startup without locks.
796  */
797 /* ARGSUSED */
798 void
799 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
800 {
801 	ssize_t		r;	/* region size */
802 	ssize_t		idx;
803 	pfn_t		pfnum;
804 	int		lckidx;
805 
806 	ASSERT(mnode == PP_2_MEM_NODE(pp));
807 	ASSERT(mtype == PP_2_MTYPE(pp));
808 
809 	ASSERT(pp->p_szc < mmu_page_sizes);
810 
811 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
812 
813 	/* no counter update needed for largest page size */
814 	if (pp->p_szc >= mmu_page_sizes - 1) {
815 		return;
816 	}
817 
818 	r = pp->p_szc + 1;
819 	pfnum = pp->p_pagenum;
820 	lckidx = PP_CTR_LOCK_INDX(pp);
821 
822 	/*
823 	 * Increment the count of free pages for the current
824 	 * region. Continue looping up in region size incrementing
825 	 * count if the preceding region is full.
826 	 */
827 	while (r < mmu_page_sizes) {
828 		idx = PNUM_TO_IDX(mnode, r, pfnum);
829 
830 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
831 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
832 
833 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
834 			break;
835 		} else {
836 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
837 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
838 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
839 
840 			cand->pcc_pages_free++;
841 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
842 		}
843 		r++;
844 	}
845 }
846 
847 void
848 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
849 {
850 	int		lckidx = PP_CTR_LOCK_INDX(pp);
851 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
852 
853 	mutex_enter(lock);
854 	page_ctr_add_internal(mnode, mtype, pp, flags);
855 	mutex_exit(lock);
856 }
857 
858 void
859 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
860 {
861 	int		lckidx;
862 	ssize_t		r;	/* region size */
863 	ssize_t		idx;
864 	pfn_t		pfnum;
865 
866 	ASSERT(mnode == PP_2_MEM_NODE(pp));
867 	ASSERT(mtype == PP_2_MTYPE(pp));
868 
869 	ASSERT(pp->p_szc < mmu_page_sizes);
870 
871 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
872 
873 	/* no counter update needed for largest page size */
874 	if (pp->p_szc >= mmu_page_sizes - 1) {
875 		return;
876 	}
877 
878 	r = pp->p_szc + 1;
879 	pfnum = pp->p_pagenum;
880 	lckidx = PP_CTR_LOCK_INDX(pp);
881 
882 	/*
883 	 * Decrement the count of free pages for the current
884 	 * region. Continue looping up in region size decrementing
885 	 * count if the preceding region was full.
886 	 */
887 	while (r < mmu_page_sizes) {
888 		idx = PNUM_TO_IDX(mnode, r, pfnum);
889 
890 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
891 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
892 
893 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
894 			break;
895 		} else {
896 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
897 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
898 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
899 
900 			ASSERT(cand->pcc_pages_free != 0);
901 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
902 
903 			cand->pcc_pages_free--;
904 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
905 		}
906 		r++;
907 	}
908 }
909 
910 void
911 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
912 {
913 	int		lckidx = PP_CTR_LOCK_INDX(pp);
914 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
915 
916 	mutex_enter(lock);
917 	page_ctr_sub_internal(mnode, mtype, pp, flags);
918 	mutex_exit(lock);
919 }
920 
921 /*
922  * Adjust page counters following a memory attach, since typically the
923  * size of the array needs to change, and the PFN to counter index
924  * mapping needs to change.
925  *
926  * It is possible this mnode did not exist at startup. In that case
927  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
928  * to change (a theoretical possibility on x86), which means pcc_color_free
929  * arrays must be extended.
930  */
931 uint_t
932 page_ctrs_adjust(int mnode)
933 {
934 	pgcnt_t npgs;
935 	int	r;		/* region size */
936 	int	i;
937 	size_t	pcsz, old_csz;
938 	hpmctr_t *new_ctr, *old_ctr;
939 	pfn_t	oldbase, newbase;
940 	pfn_t	physbase, physmax;
941 	size_t	old_npgs;
942 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
943 	size_t	size_cache[MMU_PAGE_SIZES];
944 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
945 	size_t	*old_color_array[MAX_MNODE_MRANGES];
946 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
947 	pcc_info_t **cands_cache;
948 	pcc_info_t *old_pi, *pi;
949 	pgcnt_t *pgcntp;
950 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
951 	int cands_cache_nranges;
952 	int old_maxmrange, new_maxmrange;
953 	int rc = 0;
954 
955 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
956 	    MMU_PAGE_SIZES, KM_NOSLEEP);
957 	if (cands_cache == NULL)
958 		return (ENOMEM);
959 
960 	i = -1;
961 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
962 
963 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
964 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
965 
966 	/* prepare to free non-null pointers on the way out */
967 	cands_cache_nranges = nranges;
968 	bzero(ctr_cache, sizeof (ctr_cache));
969 	bzero(color_cache, sizeof (color_cache));
970 
971 	/*
972 	 * We need to determine how many page colors there are for each
973 	 * page size in order to allocate memory for any color specific
974 	 * arrays.
975 	 */
976 	for (r = 0; r < mmu_page_sizes; r++) {
977 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
978 	}
979 
980 	/*
981 	 * Preallocate all of the new hpm_counters arrays as we can't
982 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
983 	 * If we can't allocate all of the arrays, undo our work so far
984 	 * and return failure.
985 	 */
986 	for (r = 1; r < mmu_page_sizes; r++) {
987 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
988 		size_cache[r] = pcsz;
989 		ctr_cache[r] = kmem_zalloc(pcsz *
990 		    sizeof (hpmctr_t), KM_NOSLEEP);
991 		if (ctr_cache[r] == NULL) {
992 			rc = ENOMEM;
993 			goto cleanup;
994 		}
995 	}
996 
997 	/*
998 	 * Preallocate all of the new color current arrays as we can't
999 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1000 	 * If we can't allocate all of the arrays, undo our work so far
1001 	 * and return failure.
1002 	 */
1003 	for (r = 1; r < mmu_page_sizes; r++) {
1004 		for (mrange = 0; mrange < nranges; mrange++) {
1005 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1006 			    colors_per_szc[r], KM_NOSLEEP);
1007 			if (color_cache[r][mrange] == NULL) {
1008 				rc = ENOMEM;
1009 				goto cleanup;
1010 			}
1011 		}
1012 	}
1013 
1014 	/*
1015 	 * Preallocate all of the new pcc_info_t arrays as we can't
1016 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1017 	 * If we can't allocate all of the arrays, undo our work so far
1018 	 * and return failure.
1019 	 */
1020 	for (r = 1; r < mmu_page_sizes; r++) {
1021 		for (i = 0; i < NPC_MUTEX; i++) {
1022 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1023 			    KM_NOSLEEP);
1024 			if (pi == NULL) {
1025 				rc = ENOMEM;
1026 				goto cleanup;
1027 			}
1028 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1029 
1030 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
1031 				pgcntp = kmem_zalloc(colors_per_szc[r] *
1032 				    sizeof (pgcnt_t), KM_NOSLEEP);
1033 				if (pgcntp == NULL) {
1034 					rc = ENOMEM;
1035 					goto cleanup;
1036 				}
1037 				pi->pcc_color_free = pgcntp;
1038 			}
1039 		}
1040 	}
1041 
1042 	/*
1043 	 * Grab the write lock to prevent others from walking these arrays
1044 	 * while we are modifying them.
1045 	 */
1046 	PAGE_CTRS_WRITE_LOCK(mnode);
1047 
1048 	old_nranges = mnode_nranges[mnode];
1049 	cands_cache_nranges = old_nranges;
1050 	mnode_nranges[mnode] = nranges;
1051 	old_maxmrange = mnode_maxmrange[mnode];
1052 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1053 	new_maxmrange = mnode_maxmrange[mnode];
1054 
1055 	for (r = 1; r < mmu_page_sizes; r++) {
1056 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1057 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
1058 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
1059 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
1060 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
1061 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1062 			old_color_array[mrange] =
1063 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1064 			    r, mrange);
1065 		}
1066 
1067 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1068 		new_ctr = ctr_cache[r];
1069 		ctr_cache[r] = NULL;
1070 		if (old_ctr != NULL &&
1071 		    (oldbase + old_npgs > newbase) &&
1072 		    (newbase + npgs > oldbase)) {
1073 			/*
1074 			 * Map the intersection of the old and new
1075 			 * counters into the new array.
1076 			 */
1077 			size_t offset;
1078 			if (newbase > oldbase) {
1079 				offset = (newbase - oldbase) >>
1080 				    PAGE_COUNTERS_SHIFT(mnode, r);
1081 				bcopy(old_ctr + offset, new_ctr,
1082 				    MIN(pcsz, (old_csz - offset)) *
1083 				    sizeof (hpmctr_t));
1084 			} else {
1085 				offset = (oldbase - newbase) >>
1086 				    PAGE_COUNTERS_SHIFT(mnode, r);
1087 				bcopy(old_ctr, new_ctr + offset,
1088 				    MIN(pcsz - offset, old_csz) *
1089 				    sizeof (hpmctr_t));
1090 			}
1091 		}
1092 
1093 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1094 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1095 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1096 
1097 		/* update shared hpm_counters in other mnodes */
1098 		if (interleaved_mnodes) {
1099 			for (i = 0; i < max_mem_nodes; i++) {
1100 				if (i == mnode)
1101 					continue;
1102 				if (mem_node_config[i].exists == 0)
1103 					continue;
1104 				ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
1105 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1106 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1107 				PAGE_COUNTERS_BASE(i, r) = newbase;
1108 			}
1109 		}
1110 
1111 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1112 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1113 			    color_cache[r][mrange];
1114 			color_cache[r][mrange] = NULL;
1115 		}
1116 		/*
1117 		 * for now, just reset on these events as it's probably
1118 		 * not worthwhile to try and optimize this.
1119 		 */
1120 		for (i = 0; i < colors_per_szc[r]; i++) {
1121 			uint_t color_mask = colors_per_szc[r] - 1;
1122 			int mlo = interleaved_mnodes ? 0 : mnode;
1123 			int mhi = interleaved_mnodes ? max_mem_nodes :
1124 			    (mnode + 1);
1125 			int m;
1126 			pfn_t  pfnum = newbase;
1127 			size_t idx;
1128 			MEM_NODE_ITERATOR_DECL(it);
1129 
1130 			for (m = mlo; m < mhi; m++) {
1131 				if (mem_node_config[m].exists == 0)
1132 					continue;
1133 				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1134 				if (pfnum == (pfn_t)-1) {
1135 					idx = 0;
1136 				} else {
1137 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1138 					    color_mask, color_mask, &it);
1139 					idx = PNUM_TO_IDX(m, r, pfnum);
1140 					idx = (idx < pcsz) ? idx : 0;
1141 				}
1142 				for (mrange = 0; mrange < nranges; mrange++) {
1143 					PAGE_COUNTERS_CURRENT_COLOR(m,
1144 					    r, i, mrange) = idx;
1145 				}
1146 			}
1147 		}
1148 
1149 		/* cache info for freeing out of the critical path */
1150 		if ((caddr_t)old_ctr >= kernelheap &&
1151 		    (caddr_t)old_ctr < ekernelheap) {
1152 			ctr_cache[r] = old_ctr;
1153 			size_cache[r] = old_csz;
1154 		}
1155 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1156 			size_t *tmp = old_color_array[mrange];
1157 			if ((caddr_t)tmp >= kernelheap &&
1158 			    (caddr_t)tmp < ekernelheap) {
1159 				color_cache[r][mrange] = tmp;
1160 			}
1161 		}
1162 		/*
1163 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1164 		 * satisfy the identity requirement.
1165 		 * We should be able to go from one to the other
1166 		 * and get consistent values.
1167 		 */
1168 		ASSERT(PNUM_TO_IDX(mnode, r,
1169 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
1170 		ASSERT(IDX_TO_PNUM(mnode, r,
1171 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1172 
1173 		/* pcc_info_t and pcc_color_free */
1174 		for (i = 0; i < NPC_MUTEX; i++) {
1175 			pcc_info_t *epi;
1176 			pcc_info_t *eold_pi;
1177 
1178 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1179 			old_pi = page_ctrs_cands[i][r][mnode];
1180 			page_ctrs_cands[i][r][mnode] = pi;
1181 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1182 
1183 			/* preserve old pcc_color_free values, if any */
1184 			if (old_pi == NULL)
1185 				continue;
1186 
1187 			/*
1188 			 * when/if x86 does DR, must account for
1189 			 * possible change in range index when
1190 			 * preserving pcc_info
1191 			 */
1192 			epi = &pi[nranges];
1193 			eold_pi = &old_pi[old_nranges];
1194 			if (new_maxmrange > old_maxmrange) {
1195 				pi += new_maxmrange - old_maxmrange;
1196 			} else if (new_maxmrange < old_maxmrange) {
1197 				old_pi += old_maxmrange - new_maxmrange;
1198 			}
1199 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1200 				pcc_info_t tmp = *pi;
1201 				*pi = *old_pi;
1202 				*old_pi = tmp;
1203 			}
1204 		}
1205 	}
1206 	PAGE_CTRS_WRITE_UNLOCK(mnode);
1207 
1208 	/*
1209 	 * Now that we have dropped the write lock, it is safe to free all
1210 	 * of the memory we have cached above.
1211 	 * We come through here to free memory when pre-alloc fails, and also to
1212 	 * free old pointers which were recorded while locked.
1213 	 */
1214 cleanup:
1215 	for (r = 1; r < mmu_page_sizes; r++) {
1216 		if (ctr_cache[r] != NULL) {
1217 			kmem_free(ctr_cache[r],
1218 			    size_cache[r] * sizeof (hpmctr_t));
1219 		}
1220 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1221 			if (color_cache[r][mrange] != NULL) {
1222 				kmem_free(color_cache[r][mrange],
1223 				    colors_per_szc[r] * sizeof (size_t));
1224 			}
1225 		}
1226 		for (i = 0; i < NPC_MUTEX; i++) {
1227 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1228 			if (pi == NULL)
1229 				continue;
1230 			nr = cands_cache_nranges;
1231 			for (mrange = 0; mrange < nr; mrange++, pi++) {
1232 				pgcntp = pi->pcc_color_free;
1233 				if (pgcntp == NULL)
1234 					continue;
1235 				if ((caddr_t)pgcntp >= kernelheap &&
1236 				    (caddr_t)pgcntp < ekernelheap) {
1237 					kmem_free(pgcntp,
1238 					    colors_per_szc[r] *
1239 					    sizeof (pgcnt_t));
1240 				}
1241 			}
1242 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1243 			if ((caddr_t)pi >= kernelheap &&
1244 			    (caddr_t)pi < ekernelheap) {
1245 				kmem_free(pi, nr * sizeof (pcc_info_t));
1246 			}
1247 		}
1248 	}
1249 
1250 	kmem_free(cands_cache,
1251 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1252 	return (rc);
1253 }
1254 
1255 
1256 #ifdef DEBUG
1257 
1258 /*
1259  * confirm pp is a large page corresponding to szc
1260  */
1261 void
1262 chk_lpg(page_t *pp, uchar_t szc)
1263 {
1264 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1265 	uint_t noreloc;
1266 
1267 	if (npgs == 1) {
1268 		ASSERT(pp->p_szc == 0);
1269 		ASSERT(pp->p_next == pp);
1270 		ASSERT(pp->p_prev == pp);
1271 		return;
1272 	}
1273 
1274 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1275 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1276 
1277 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1278 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1279 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1280 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1281 
1282 	/*
1283 	 * Check list of pages.
1284 	 */
1285 	noreloc = PP_ISNORELOC(pp);
1286 	while (npgs--) {
1287 		if (npgs != 0) {
1288 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1289 			ASSERT(pp->p_next == (pp + 1));
1290 		}
1291 		ASSERT(pp->p_szc == szc);
1292 		ASSERT(PP_ISFREE(pp));
1293 		ASSERT(PP_ISAGED(pp));
1294 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1295 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1296 		ASSERT(pp->p_vnode  == NULL);
1297 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1298 
1299 		pp = pp->p_next;
1300 	}
1301 }
1302 #endif /* DEBUG */
1303 
1304 void
1305 page_freelist_lock(int mnode)
1306 {
1307 	int i;
1308 	for (i = 0; i < NPC_MUTEX; i++) {
1309 		mutex_enter(FPC_MUTEX(mnode, i));
1310 		mutex_enter(CPC_MUTEX(mnode, i));
1311 	}
1312 }
1313 
1314 void
1315 page_freelist_unlock(int mnode)
1316 {
1317 	int i;
1318 	for (i = 0; i < NPC_MUTEX; i++) {
1319 		mutex_exit(FPC_MUTEX(mnode, i));
1320 		mutex_exit(CPC_MUTEX(mnode, i));
1321 	}
1322 }
1323 
1324 /*
1325  * add pp to the specified page list. Defaults to head of the page list
1326  * unless PG_LIST_TAIL is specified.
1327  */
1328 void
1329 page_list_add(page_t *pp, int flags)
1330 {
1331 	page_t		**ppp;
1332 	kmutex_t	*pcm;
1333 	uint_t		bin, mtype;
1334 	int		mnode;
1335 
1336 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1337 	ASSERT(PP_ISFREE(pp));
1338 	ASSERT(!hat_page_is_mapped(pp));
1339 	ASSERT(hat_page_getshare(pp) == 0);
1340 
1341 	/*
1342 	 * Large pages should be freed via page_list_add_pages().
1343 	 */
1344 	ASSERT(pp->p_szc == 0);
1345 
1346 	/*
1347 	 * Don't need to lock the freelist first here
1348 	 * because the page isn't on the freelist yet.
1349 	 * This means p_szc can't change on us.
1350 	 */
1351 
1352 	bin = PP_2_BIN(pp);
1353 	mnode = PP_2_MEM_NODE(pp);
1354 	mtype = PP_2_MTYPE(pp);
1355 
1356 	if (flags & PG_LIST_ISINIT) {
1357 		/*
1358 		 * PG_LIST_ISINIT is set during system startup (i.e. single
1359 		 * threaded), so add the page to the free list and to the
1360 		 * free region counters w/o any locking.
1361 		 */
1362 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1363 
1364 		/* inline version of page_add() */
1365 		if (*ppp != NULL) {
1366 			pp->p_next = *ppp;
1367 			pp->p_prev = (*ppp)->p_prev;
1368 			(*ppp)->p_prev = pp;
1369 			pp->p_prev->p_next = pp;
1370 		} else
1371 			*ppp = pp;
1372 
1373 		page_ctr_add_internal(mnode, mtype, pp, flags);
1374 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1375 	} else {
1376 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1377 
1378 		if (flags & PG_FREE_LIST) {
1379 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1380 			ASSERT(PP_ISAGED(pp));
1381 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1382 
1383 		} else {
1384 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1385 			ASSERT(pp->p_vnode);
1386 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1387 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1388 		}
1389 		mutex_enter(pcm);
1390 		page_add(ppp, pp);
1391 
1392 		if (flags & PG_LIST_TAIL)
1393 			*ppp = (*ppp)->p_next;
1394 		/*
1395 		 * Add counters before releasing pcm mutex to avoid a race with
1396 		 * page_freelist_coalesce and page_freelist_split.
1397 		 */
1398 		page_ctr_add(mnode, mtype, pp, flags);
1399 		mutex_exit(pcm);
1400 	}
1401 
1402 
1403 #if defined(__sparc)
1404 	if (PP_ISNORELOC(pp)) {
1405 		kcage_freemem_add(1);
1406 	}
1407 #endif
1408 	/*
1409 	 * It is up to the caller to unlock the page!
1410 	 */
1411 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1412 }
1413 
1414 
1415 #ifdef __sparc
1416 /*
1417  * This routine is only used by kcage_init during system startup.
1418  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1419  * without the overhead of taking locks and updating counters.
1420  */
1421 void
1422 page_list_noreloc_startup(page_t *pp)
1423 {
1424 	page_t		**ppp;
1425 	uint_t		bin;
1426 	int		mnode;
1427 	int		mtype;
1428 	int		flags = 0;
1429 
1430 	/*
1431 	 * If this is a large page on the freelist then
1432 	 * break it up into smaller pages.
1433 	 */
1434 	if (pp->p_szc != 0)
1435 		page_boot_demote(pp);
1436 
1437 	/*
1438 	 * Get list page is currently on.
1439 	 */
1440 	bin = PP_2_BIN(pp);
1441 	mnode = PP_2_MEM_NODE(pp);
1442 	mtype = PP_2_MTYPE(pp);
1443 	ASSERT(mtype == MTYPE_RELOC);
1444 	ASSERT(pp->p_szc == 0);
1445 
1446 	if (PP_ISAGED(pp)) {
1447 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1448 		flags |= PG_FREE_LIST;
1449 	} else {
1450 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1451 		flags |= PG_CACHE_LIST;
1452 	}
1453 
1454 	ASSERT(*ppp != NULL);
1455 
1456 	/*
1457 	 * Delete page from current list.
1458 	 */
1459 	if (*ppp == pp)
1460 		*ppp = pp->p_next;		/* go to next page */
1461 	if (*ppp == pp) {
1462 		*ppp = NULL;			/* page list is gone */
1463 	} else {
1464 		pp->p_prev->p_next = pp->p_next;
1465 		pp->p_next->p_prev = pp->p_prev;
1466 	}
1467 
1468 	/*
1469 	 * Decrement page counters
1470 	 */
1471 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1472 
1473 	/*
1474 	 * Set no reloc for cage initted pages.
1475 	 */
1476 	PP_SETNORELOC(pp);
1477 
1478 	mtype = PP_2_MTYPE(pp);
1479 	ASSERT(mtype == MTYPE_NORELOC);
1480 
1481 	/*
1482 	 * Get new list for page.
1483 	 */
1484 	if (PP_ISAGED(pp)) {
1485 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1486 	} else {
1487 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1488 	}
1489 
1490 	/*
1491 	 * Insert page on new list.
1492 	 */
1493 	if (*ppp == NULL) {
1494 		*ppp = pp;
1495 		pp->p_next = pp->p_prev = pp;
1496 	} else {
1497 		pp->p_next = *ppp;
1498 		pp->p_prev = (*ppp)->p_prev;
1499 		(*ppp)->p_prev = pp;
1500 		pp->p_prev->p_next = pp;
1501 	}
1502 
1503 	/*
1504 	 * Increment page counters
1505 	 */
1506 	page_ctr_add_internal(mnode, mtype, pp, flags);
1507 
1508 	/*
1509 	 * Update cage freemem counter
1510 	 */
1511 	atomic_add_long(&kcage_freemem, 1);
1512 }
1513 #else	/* __sparc */
1514 
1515 /* ARGSUSED */
1516 void
1517 page_list_noreloc_startup(page_t *pp)
1518 {
1519 	panic("page_list_noreloc_startup: should be here only for sparc");
1520 }
1521 #endif
1522 
1523 void
1524 page_list_add_pages(page_t *pp, int flags)
1525 {
1526 	kmutex_t *pcm;
1527 	pgcnt_t	pgcnt;
1528 	uint_t	bin, mtype, i;
1529 	int	mnode;
1530 
1531 	/* default to freelist/head */
1532 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1533 
1534 	CHK_LPG(pp, pp->p_szc);
1535 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1536 
1537 	bin = PP_2_BIN(pp);
1538 	mnode = PP_2_MEM_NODE(pp);
1539 	mtype = PP_2_MTYPE(pp);
1540 
1541 	if (flags & PG_LIST_ISINIT) {
1542 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1543 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1544 		ASSERT(!PP_ISNORELOC(pp));
1545 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1546 	} else {
1547 
1548 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1549 
1550 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1551 
1552 		mutex_enter(pcm);
1553 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1554 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1555 		mutex_exit(pcm);
1556 
1557 		pgcnt = page_get_pagecnt(pp->p_szc);
1558 #if defined(__sparc)
1559 		if (PP_ISNORELOC(pp))
1560 			kcage_freemem_add(pgcnt);
1561 #endif
1562 		for (i = 0; i < pgcnt; i++, pp++)
1563 			page_unlock_nocapture(pp);
1564 	}
1565 }
1566 
1567 /*
1568  * During boot, need to demote a large page to base
1569  * pagesize pages for seg_kmem for use in boot_alloc()
1570  */
1571 void
1572 page_boot_demote(page_t *pp)
1573 {
1574 	ASSERT(pp->p_szc != 0);
1575 	ASSERT(PP_ISFREE(pp));
1576 	ASSERT(PP_ISAGED(pp));
1577 
1578 	(void) page_demote(PP_2_MEM_NODE(pp),
1579 	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
1580 	    PC_FREE);
1581 
1582 	ASSERT(PP_ISFREE(pp));
1583 	ASSERT(PP_ISAGED(pp));
1584 	ASSERT(pp->p_szc == 0);
1585 }
1586 
1587 /*
1588  * Take a particular page off of whatever freelist the page
1589  * is claimed to be on.
1590  *
1591  * NOTE: Only used for PAGESIZE pages.
1592  */
1593 void
1594 page_list_sub(page_t *pp, int flags)
1595 {
1596 	int		bin;
1597 	uint_t		mtype;
1598 	int		mnode;
1599 	kmutex_t	*pcm;
1600 	page_t		**ppp;
1601 
1602 	ASSERT(PAGE_EXCL(pp));
1603 	ASSERT(PP_ISFREE(pp));
1604 
1605 	/*
1606 	 * The p_szc field can only be changed by page_promote()
1607 	 * and page_demote(). Only free pages can be promoted and
1608 	 * demoted and the free list MUST be locked during these
1609 	 * operations. So to prevent a race in page_list_sub()
1610 	 * between computing which bin of the freelist lock to
1611 	 * grab and actually grabbing the lock, we check again that
1612 	 * the bin we locked is still the correct one. Notice that
1613 	 * the p_szc field could have actually changed on us but
1614 	 * if the bin happens to still be the same we are safe.
1615 	 */
1616 try_again:
1617 	bin = PP_2_BIN(pp);
1618 	mnode = PP_2_MEM_NODE(pp);
1619 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1620 	mutex_enter(pcm);
1621 	if (PP_2_BIN(pp) != bin) {
1622 		mutex_exit(pcm);
1623 		goto try_again;
1624 	}
1625 	mtype = PP_2_MTYPE(pp);
1626 
1627 	if (flags & PG_FREE_LIST) {
1628 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1629 		ASSERT(PP_ISAGED(pp));
1630 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1631 	} else {
1632 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1633 		ASSERT(!PP_ISAGED(pp));
1634 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1635 	}
1636 
1637 	/*
1638 	 * Common PAGESIZE case.
1639 	 *
1640 	 * Note that we locked the freelist. This prevents
1641 	 * any page promotion/demotion operations. Therefore
1642 	 * the p_szc will not change until we drop pcm mutex.
1643 	 */
1644 	if (pp->p_szc == 0) {
1645 		page_sub(ppp, pp);
1646 		/*
1647 		 * Subtract counters before releasing pcm mutex
1648 		 * to avoid race with page_freelist_coalesce.
1649 		 */
1650 		page_ctr_sub(mnode, mtype, pp, flags);
1651 		mutex_exit(pcm);
1652 
1653 #if defined(__sparc)
1654 		if (PP_ISNORELOC(pp)) {
1655 			kcage_freemem_sub(1);
1656 		}
1657 #endif
1658 		return;
1659 	}
1660 
1661 	/*
1662 	 * Large pages on the cache list are not supported.
1663 	 */
1664 	if (flags & PG_CACHE_LIST)
1665 		panic("page_list_sub: large page on cachelist");
1666 
1667 	/*
1668 	 * Slow but rare.
1669 	 *
1670 	 * Somebody wants this particular page which is part
1671 	 * of a large page. In this case we just demote the page
1672 	 * if it's on the freelist.
1673 	 *
1674 	 * We have to drop pcm before locking the entire freelist.
1675 	 * Once we have re-locked the freelist check to make sure
1676 	 * the page hasn't already been demoted or completely
1677 	 * freed.
1678 	 */
1679 	mutex_exit(pcm);
1680 	page_freelist_lock(mnode);
1681 	if (pp->p_szc != 0) {
1682 		/*
1683 		 * Large page is on freelist.
1684 		 */
1685 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1686 		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1687 	}
1688 	ASSERT(PP_ISFREE(pp));
1689 	ASSERT(PP_ISAGED(pp));
1690 	ASSERT(pp->p_szc == 0);
1691 
1692 	/*
1693 	 * Subtract counters before releasing pcm mutex
1694 	 * to avoid race with page_freelist_coalesce.
1695 	 */
1696 	bin = PP_2_BIN(pp);
1697 	mtype = PP_2_MTYPE(pp);
1698 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1699 
1700 	page_sub(ppp, pp);
1701 	page_ctr_sub(mnode, mtype, pp, flags);
1702 	page_freelist_unlock(mnode);
1703 
1704 #if defined(__sparc)
1705 	if (PP_ISNORELOC(pp)) {
1706 		kcage_freemem_sub(1);
1707 	}
1708 #endif
1709 }
1710 
1711 void
1712 page_list_sub_pages(page_t *pp, uint_t szc)
1713 {
1714 	kmutex_t *pcm;
1715 	uint_t	bin, mtype;
1716 	int	mnode;
1717 
1718 	ASSERT(PAGE_EXCL(pp));
1719 	ASSERT(PP_ISFREE(pp));
1720 	ASSERT(PP_ISAGED(pp));
1721 
1722 	/*
1723 	 * See comment in page_list_sub().
1724 	 */
1725 try_again:
1726 	bin = PP_2_BIN(pp);
1727 	mnode = PP_2_MEM_NODE(pp);
1728 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1729 	mutex_enter(pcm);
1730 	if (PP_2_BIN(pp) != bin) {
1731 		mutex_exit(pcm);
1732 		goto	try_again;
1733 	}
1734 
1735 	/*
1736 	 * If we're called with a page larger than szc or it got
1737 	 * promoted above szc before we locked the freelist, then
1738 	 * drop pcm and re-lock the entire freelist. If the page is still
1739 	 * larger than szc, demote it.
1740 	 */
1741 	if (pp->p_szc > szc) {
1742 		mutex_exit(pcm);
1743 		pcm = NULL;
1744 		page_freelist_lock(mnode);
1745 		if (pp->p_szc > szc) {
1746 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1747 			(void) page_demote(mnode,
1748 			    PFN_BASE(pp->p_pagenum, pp->p_szc),
1749 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1750 		}
1751 		bin = PP_2_BIN(pp);
1752 	}
1753 	ASSERT(PP_ISFREE(pp));
1754 	ASSERT(PP_ISAGED(pp));
1755 	ASSERT(pp->p_szc <= szc);
1756 	ASSERT(pp == PP_PAGEROOT(pp));
1757 
1758 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1759 
1760 	mtype = PP_2_MTYPE(pp);
1761 	if (pp->p_szc != 0) {
1762 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1763 		CHK_LPG(pp, pp->p_szc);
1764 	} else {
1765 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1766 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1767 	}
1768 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1769 
1770 	if (pcm != NULL) {
1771 		mutex_exit(pcm);
1772 	} else {
1773 		page_freelist_unlock(mnode);
1774 	}
1775 
1776 #if defined(__sparc)
1777 	if (PP_ISNORELOC(pp)) {
1778 		pgcnt_t	pgcnt;
1779 
1780 		pgcnt = page_get_pagecnt(pp->p_szc);
1781 		kcage_freemem_sub(pgcnt);
1782 	}
1783 #endif
1784 }
1785 
1786 /*
1787  * Add the page to the front of a linked list of pages
1788  * using the p_next & p_prev pointers for the list.
1789  * The caller is responsible for protecting the list pointers.
1790  */
1791 void
1792 mach_page_add(page_t **ppp, page_t *pp)
1793 {
1794 	if (*ppp == NULL) {
1795 		pp->p_next = pp->p_prev = pp;
1796 	} else {
1797 		pp->p_next = *ppp;
1798 		pp->p_prev = (*ppp)->p_prev;
1799 		(*ppp)->p_prev = pp;
1800 		pp->p_prev->p_next = pp;
1801 	}
1802 	*ppp = pp;
1803 }
1804 
1805 /*
1806  * Remove this page from a linked list of pages
1807  * using the p_next & p_prev pointers for the list.
1808  *
1809  * The caller is responsible for protecting the list pointers.
1810  */
1811 void
1812 mach_page_sub(page_t **ppp, page_t *pp)
1813 {
1814 	ASSERT(PP_ISFREE(pp));
1815 
1816 	if (*ppp == NULL || pp == NULL)
1817 		panic("mach_page_sub");
1818 
1819 	if (*ppp == pp)
1820 		*ppp = pp->p_next;		/* go to next page */
1821 
1822 	if (*ppp == pp)
1823 		*ppp = NULL;			/* page list is gone */
1824 	else {
1825 		pp->p_prev->p_next = pp->p_next;
1826 		pp->p_next->p_prev = pp->p_prev;
1827 	}
1828 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1829 }
1830 
1831 /*
1832  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1833  */
1834 void
1835 page_promote_size(page_t *pp, uint_t cur_szc)
1836 {
1837 	pfn_t pfn;
1838 	int mnode;
1839 	int idx;
1840 	int new_szc = cur_szc + 1;
1841 	int full = FULL_REGION_CNT(new_szc);
1842 
1843 	pfn = page_pptonum(pp);
1844 	mnode = PFN_2_MEM_NODE(pfn);
1845 
1846 	page_freelist_lock(mnode);
1847 
1848 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1849 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1850 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1851 
1852 	page_freelist_unlock(mnode);
1853 }
1854 
1855 static uint_t page_promote_err;
1856 static uint_t page_promote_noreloc_err;
1857 
1858 /*
1859  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1860  * for the given mnode starting at pfnum. Pages involved are on the freelist
1861  * before the call and may be returned to the caller if requested, otherwise
1862  * they will be placed back on the freelist.
1863  * If flags is PC_ALLOC, then the large page will be returned to the user in
1864  * a state which is consistent with a page being taken off the freelist.  If
1865  * we failed to lock the new large page, then we will return NULL to the
1866  * caller and put the large page on the freelist instead.
1867  * If flags is PC_FREE, then the large page will be placed on the freelist,
1868  * and NULL will be returned.
1869  * The caller is responsible for locking the freelist as well as any other
1870  * accounting which needs to be done for a returned page.
1871  *
1872  * RFE: For performance pass in pp instead of pfnum so
1873  * 	we can avoid excessive calls to page_numtopp_nolock().
1874  *	This would depend on an assumption that all contiguous
1875  *	pages are in the same memseg so we can just add/dec
1876  *	our pp.
1877  *
1878  * Lock ordering:
1879  *
1880  *	There is a potential but rare deadlock situation
1881  *	for page promotion and demotion operations. The problem
1882  *	is there are two paths into the freelist manager and
1883  *	they have different lock orders:
1884  *
1885  *	page_create()
1886  *		lock freelist
1887  *		page_lock(EXCL)
1888  *		unlock freelist
1889  *		return
1890  *		caller drops page_lock
1891  *
1892  *	page_free() and page_reclaim()
1893  *		caller grabs page_lock(EXCL)
1894  *
1895  *		lock freelist
1896  *		unlock freelist
1897  *		drop page_lock
1898  *
1899  *	What prevents a thread in page_create() from deadlocking
1900  *	with a thread freeing or reclaiming the same page is the
1901  *	page_trylock() in page_get_freelist(). If the trylock fails
1902  *	it skips the page.
1903  *
1904  *	The lock ordering for promotion and demotion is the same as
1905  *	for page_create(). Since the same deadlock could occur during
1906  *	page promotion and freeing or reclaiming of a page on the
1907  *	cache list, we might have to fail the operation and undo what
1908  *	we have done so far. Again, this is rare.
1909  */
1910 page_t *
1911 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1912 {
1913 	page_t		*pp, *pplist, *tpp, *start_pp;
1914 	pgcnt_t		new_npgs, npgs;
1915 	uint_t		bin;
1916 	pgcnt_t		tmpnpgs, pages_left;
1917 	uint_t		noreloc;
1918 	int 		which_list;
1919 	ulong_t		index;
1920 	kmutex_t	*phm;
1921 
1922 	/*
1923 	 * General algorithm:
1924 	 * Find the starting page
1925 	 * Walk each page struct removing it from the freelist,
1926 	 * and linking it to all the other pages removed.
1927 	 * Once all pages are off the freelist,
1928 	 * walk the list, modifying p_szc to new_szc and setting
1929 	 * whatever other info is needed to create a large free page.
1930 	 * According to the flags, either return the page or put it
1931 	 * on the freelist.
1932 	 */
1933 
1934 	start_pp = page_numtopp_nolock(pfnum);
1935 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1936 	new_npgs = page_get_pagecnt(new_szc);
1937 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1938 
1939 	/* don't return page of the wrong mtype */
1940 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1941 			return (NULL);
1942 
1943 	/*
1944 	 * Loop through smaller pages to confirm that all pages
1945 	 * give the same result for PP_ISNORELOC().
1946 	 * We can check this reliably here as the protocol for setting
1947 	 * P_NORELOC requires pages to be taken off the free list first.
1948 	 */
1949 	noreloc = PP_ISNORELOC(start_pp);
1950 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1951 		if (noreloc != PP_ISNORELOC(pp)) {
1952 			page_promote_noreloc_err++;
1953 			page_promote_err++;
1954 			return (NULL);
1955 		}
1956 	}
1957 
1958 	pages_left = new_npgs;
1959 	pplist = NULL;
1960 	pp = start_pp;
1961 
1962 	/* Loop around coalescing the smaller pages into a big page. */
1963 	while (pages_left) {
1964 		/*
1965 		 * Remove from the freelist.
1966 		 */
1967 		ASSERT(PP_ISFREE(pp));
1968 		bin = PP_2_BIN(pp);
1969 		ASSERT(mnode == PP_2_MEM_NODE(pp));
1970 		mtype = PP_2_MTYPE(pp);
1971 		if (PP_ISAGED(pp)) {
1972 
1973 			/*
1974 			 * PG_FREE_LIST
1975 			 */
1976 			if (pp->p_szc) {
1977 				page_vpsub(&PAGE_FREELISTS(mnode,
1978 				    pp->p_szc, bin, mtype), pp);
1979 			} else {
1980 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1981 				    bin, mtype), pp);
1982 			}
1983 			which_list = PG_FREE_LIST;
1984 		} else {
1985 			ASSERT(pp->p_szc == 0);
1986 
1987 			/*
1988 			 * PG_CACHE_LIST
1989 			 *
1990 			 * Since this page comes from the
1991 			 * cachelist, we must destroy the
1992 			 * vnode association.
1993 			 */
1994 			if (!page_trylock(pp, SE_EXCL)) {
1995 				goto fail_promote;
1996 			}
1997 
1998 			/*
1999 			 * We need to be careful not to deadlock
2000 			 * with another thread in page_lookup().
2001 			 * The page_lookup() thread could be holding
2002 			 * the same phm that we need if the two
2003 			 * pages happen to hash to the same phm lock.
2004 			 * At this point we have locked the entire
2005 			 * freelist and page_lookup() could be trying
2006 			 * to grab a freelist lock.
2007 			 */
2008 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2009 			phm = PAGE_HASH_MUTEX(index);
2010 			if (!mutex_tryenter(phm)) {
2011 				page_unlock_nocapture(pp);
2012 				goto fail_promote;
2013 			}
2014 
2015 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2016 			page_hashout(pp, phm);
2017 			mutex_exit(phm);
2018 			PP_SETAGED(pp);
2019 			page_unlock_nocapture(pp);
2020 			which_list = PG_CACHE_LIST;
2021 		}
2022 		page_ctr_sub(mnode, mtype, pp, which_list);
2023 
2024 		/*
2025 		 * Concatenate the smaller page(s) onto
2026 		 * the large page list.
2027 		 */
2028 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2029 		pages_left -= npgs;
2030 		tpp = pp;
2031 		while (npgs--) {
2032 			tpp->p_szc = new_szc;
2033 			tpp = tpp->p_next;
2034 		}
2035 		page_list_concat(&pplist, &pp);
2036 		pp += tmpnpgs;
2037 	}
2038 	CHK_LPG(pplist, new_szc);
2039 
2040 	/*
2041 	 * If requested, return the page to the user in the
2042 	 * properly locked state.
2043 	 */
2044 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2045 		return (pplist);
2046 	}
2047 
2048 	/*
2049 	 * Otherwise place the new large page on the freelist
2050 	 */
2051 	bin = PP_2_BIN(pplist);
2052 	mnode = PP_2_MEM_NODE(pplist);
2053 	mtype = PP_2_MTYPE(pplist);
2054 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2055 
2056 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2057 	return (NULL);
2058 
2059 fail_promote:
2060 	/*
2061 	 * A thread must have still been freeing or
2062 	 * reclaiming the page on the cachelist.
2063 	 * To prevent a deadlock, undo what we have
2064 	 * done so far and return failure. This
2065 	 * situation can only happen while promoting
2066 	 * PAGESIZE pages.
2067 	 */
2068 	page_promote_err++;
2069 	while (pplist) {
2070 		pp = pplist;
2071 		mach_page_sub(&pplist, pp);
2072 		pp->p_szc = 0;
2073 		bin = PP_2_BIN(pp);
2074 		mtype = PP_2_MTYPE(pp);
2075 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2076 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2077 	}
2078 	return (NULL);
2079 
2080 }
2081 
2082 /*
2083  * Break up a large page into smaller size pages.
2084  * Pages involved are on the freelist before the call and may
2085  * be returned to the caller if requested, otherwise they will
2086  * be placed back on the freelist.
2087  * The caller is responsible for locking the freelist as well as any other
2088  * accounting which needs to be done for a returned page.
2089  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2090  * technically, any value may be passed in but PC_NO_COLOR is the standard
2091  * which should be followed for clarity's sake.
2092  */
2093 page_t *
2094 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
2095     int color, int flags)
2096 {
2097 	page_t	*pp, *pplist, *npplist;
2098 	pgcnt_t	npgs, n;
2099 	uint_t	bin;
2100 	uint_t	mtype;
2101 	page_t	*ret_pp = NULL;
2102 
2103 	ASSERT(cur_szc != 0);
2104 	ASSERT(new_szc < cur_szc);
2105 
2106 	pplist = page_numtopp_nolock(pfnum);
2107 	ASSERT(pplist != NULL);
2108 
2109 	ASSERT(pplist->p_szc == cur_szc);
2110 
2111 	bin = PP_2_BIN(pplist);
2112 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2113 	mtype = PP_2_MTYPE(pplist);
2114 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2115 
2116 	CHK_LPG(pplist, cur_szc);
2117 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2118 
2119 	/*
2120 	 * Number of PAGESIZE pages for smaller new_szc
2121 	 * page.
2122 	 */
2123 	npgs = page_get_pagecnt(new_szc);
2124 
2125 	while (pplist) {
2126 		pp = pplist;
2127 
2128 		ASSERT(pp->p_szc == cur_szc);
2129 
2130 		/*
2131 		 * We either break it up into PAGESIZE pages or larger.
2132 		 */
2133 		if (npgs == 1) {	/* PAGESIZE case */
2134 			mach_page_sub(&pplist, pp);
2135 			ASSERT(pp->p_szc == cur_szc);
2136 			ASSERT(new_szc == 0);
2137 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2138 			pp->p_szc = new_szc;
2139 			bin = PP_2_BIN(pp);
2140 			if ((bin == color) && (flags == PC_ALLOC) &&
2141 			    (ret_pp == NULL) &&
2142 			    page_trylock_cons(pp, SE_EXCL)) {
2143 				ret_pp = pp;
2144 			} else {
2145 				mtype = PP_2_MTYPE(pp);
2146 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2147 				    mtype), pp);
2148 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2149 			}
2150 		} else {
2151 
2152 			/*
2153 			 * Break down into smaller lists of pages.
2154 			 */
2155 			page_list_break(&pplist, &npplist, npgs);
2156 
2157 			pp = pplist;
2158 			n = npgs;
2159 			while (n--) {
2160 				ASSERT(pp->p_szc == cur_szc);
2161 				pp->p_szc = new_szc;
2162 				pp = pp->p_next;
2163 			}
2164 
2165 			CHK_LPG(pplist, new_szc);
2166 
2167 			bin = PP_2_BIN(pplist);
2168 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2169 			if ((bin == color) && (flags == PC_ALLOC) &&
2170 			    (ret_pp == NULL) &&
2171 			    page_trylock_cons(pp, SE_EXCL)) {
2172 				ret_pp = pp;
2173 			} else {
2174 				mtype = PP_2_MTYPE(pp);
2175 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2176 				    bin, mtype), pplist);
2177 
2178 				page_ctr_add(mnode, mtype, pplist,
2179 				    PG_FREE_LIST);
2180 			}
2181 			pplist = npplist;
2182 		}
2183 	}
2184 	return (ret_pp);
2185 }
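
/*
 * Worked example for page_demote() (illustrative, hypothetical page sizes,
 * e.g. an 8K base page size with 64K and 4M large pages): demoting a 4M page
 * (cur_szc) into 64K pages (new_szc) gives npgs = page_get_pagecnt(new_szc)
 * = 8 base pages per new piece.  page_list_break() peels 8 constituent pages
 * off the front of pplist on each iteration, so the loop above runs
 * 4M / 64K = 64 times, setting p_szc on each group and returning at most one
 * group (ret_pp) to the caller when flags == PC_ALLOC and the color matches.
 */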
2186 
2187 int mpss_coalesce_disable = 0;
2188 
2189 /*
2190  * Coalesce free pages into a page of the given szc and color if possible.
2191  * Return the pointer to the page created, otherwise, return NULL.
2192  *
2193  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2194  */
2195 page_t *
2196 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2197     int mtype, pfn_t pfnhi)
2198 {
2199 	int 	r = szc;		/* region size */
2200 	int	mrange;
2201 	uint_t 	full, bin, color_mask, wrap = 0;
2202 	pfn_t	pfnum, lo, hi;
2203 	size_t	len, idx, idx0;
2204 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
2205 	page_t	*ret_pp;
2206 	MEM_NODE_ITERATOR_DECL(it);
2207 #if defined(__sparc)
2208 	pfn_t pfnum0, nlo, nhi;
2209 #endif
2210 
2211 	if (mpss_coalesce_disable) {
2212 		ASSERT(szc < MMU_PAGE_SIZES);
2213 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2214 		return (NULL);
2215 	}
2216 
2217 	ASSERT(szc < mmu_page_sizes);
2218 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2219 	ASSERT(ceq_mask <= color_mask);
2220 	ASSERT(color <= color_mask);
2221 	color &= ceq_mask;
2222 
2223 	/* Prevent page_counters dynamic memory from being freed */
2224 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2225 
2226 	mrange = MTYPE_2_MRANGE(mnode, mtype);
2227 	ASSERT(mrange < mnode_nranges[mnode]);
2228 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2229 
2230 	/* get pfn range for mtype */
2231 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
2232 #if defined(__sparc)
2233 	lo = PAGE_COUNTERS_BASE(mnode, r);
2234 	hi = IDX_TO_PNUM(mnode, r, len);
2235 #else
2236 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2237 	hi++;
2238 #endif
2239 
2240 	/* use lower limit if given */
2241 	if (pfnhi != PFNNULL && pfnhi < hi)
2242 		hi = pfnhi;
2243 
2244 	/* round to szcpgcnt boundaries */
2245 	lo = P2ROUNDUP(lo, szcpgcnt);
2246 	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2247 	if (lo == (pfn_t)-1) {
2248 		rw_exit(&page_ctrs_rwlock[mnode]);
2249 		return (NULL);
2250 	}
2251 	hi = hi & ~(szcpgcnt - 1);
2252 
2253 	/* set lo to the closest pfn of the right color */
2254 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2255 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2256 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2257 		    &it);
2258 	}
2259 
2260 	if (hi <= lo) {
2261 		rw_exit(&page_ctrs_rwlock[mnode]);
2262 		return (NULL);
2263 	}
2264 
2265 	full = FULL_REGION_CNT(r);
2266 
2267 	/* calculate the number of page candidates and initial search index */
2268 	bin = color;
2269 	idx0 = (size_t)(-1);
2270 	do {
2271 		pgcnt_t acand;
2272 
2273 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2274 		if (acand) {
2275 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2276 			    r, bin, mrange);
2277 			idx0 = MIN(idx0, idx);
2278 			cands += acand;
2279 		}
2280 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2281 	} while (bin != color);
2282 
2283 	if (cands == 0) {
2284 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2285 		rw_exit(&page_ctrs_rwlock[mnode]);
2286 		return (NULL);
2287 	}
2288 
2289 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
2290 	if (pfnum < lo || pfnum >= hi) {
2291 		pfnum = lo;
2292 	} else {
2293 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2294 		if (pfnum == (pfn_t)-1) {
2295 			pfnum = lo;
2296 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2297 			ASSERT(pfnum != (pfn_t)-1);
2298 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2299 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2300 			/* invalid color, get the closest correct pfn */
2301 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2302 			    color_mask, &it);
2303 			if (pfnum >= hi) {
2304 				pfnum = lo;
2305 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2306 			}
2307 		}
2308 	}
2309 
2310 	/* set starting index */
2311 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2312 	ASSERT(idx0 < len);
2313 
2314 #if defined(__sparc)
2315 	pfnum0 = pfnum;		/* page corresponding to idx0 */
2316 	nhi = 0;		/* search kcage ranges */
2317 #endif
2318 
2319 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2320 
2321 #if defined(__sparc)
2322 		/*
2323 		 * Find lowest intersection of kcage ranges and mnode.
2324 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
2325 		 */
2326 		if (nhi <= pfnum) {
2327 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2328 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2329 				goto wrapit;
2330 
2331 			/* jump to the next page in the range */
2332 			if (pfnum < nlo) {
2333 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2334 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2335 				idx = PNUM_TO_IDX(mnode, r, pfnum);
2336 				if (idx >= len || pfnum >= hi)
2337 					goto wrapit;
2338 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2339 				    ceq_mask)
2340 					goto next;
2341 				if (interleaved_mnodes &&
2342 				    PFN_2_MEM_NODE(pfnum) != mnode)
2343 					goto next;
2344 			}
2345 		}
2346 #endif
2347 
2348 		if (PAGE_COUNTERS(mnode, r, idx) != full)
2349 			goto next;
2350 
2351 		/*
2352 		 * RFE: For performance maybe we can do something less
2353 		 *	brutal than locking the entire freelist. So far
2354 		 * 	this doesn't seem to be a performance problem?
2355 		 */
2356 		page_freelist_lock(mnode);
2357 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
2358 			ret_pp =
2359 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2360 			if (ret_pp != NULL) {
2361 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2362 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2363 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2364 				page_freelist_unlock(mnode);
2365 				rw_exit(&page_ctrs_rwlock[mnode]);
2366 #if defined(__sparc)
2367 				if (PP_ISNORELOC(ret_pp)) {
2368 					pgcnt_t npgs;
2369 
2370 					npgs = page_get_pagecnt(ret_pp->p_szc);
2371 					kcage_freemem_sub(npgs);
2372 				}
2373 #endif
2374 				return (ret_pp);
2375 			}
2376 		} else {
2377 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2378 		}
2379 
2380 		page_freelist_unlock(mnode);
2381 		/*
2382 		 * No point looking for another page if we've
2383 		 * already tried all of the ones that
2384 		 * page_ctr_cands indicated.  Stash off where we left
2385 		 * off.
2386 		 * Note: this is not exact since we don't hold the
2387 		 * page_freelist_locks before we initially get the
2388 		 * value of cands for performance reasons, but should
2389 		 * be a decent approximation.
2390 		 */
2391 		if (--cands == 0) {
2392 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2393 			    idx;
2394 			break;
2395 		}
2396 next:
2397 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2398 		    color_mask, &it);
2399 		idx = PNUM_TO_IDX(mnode, r, pfnum);
2400 		if (idx >= len || pfnum >= hi) {
2401 wrapit:
2402 			pfnum = lo;
2403 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2404 			idx = PNUM_TO_IDX(mnode, r, pfnum);
2405 			wrap++;
2406 #if defined(__sparc)
2407 			nhi = 0;	/* search kcage ranges */
2408 #endif
2409 		}
2410 	}
2411 
2412 	rw_exit(&page_ctrs_rwlock[mnode]);
2413 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2414 	return (NULL);
2415 }
2416 
2417 /*
2418  * For the given mnode, promote as many small pages to large pages as possible.
2419  * mnode can be -1, which means do them all
2420  */
2421 void
2422 page_freelist_coalesce_all(int mnode)
2423 {
2424 	int 	r;		/* region size */
2425 	int 	idx, full;
2426 	size_t	len;
2427 	int doall = interleaved_mnodes || mnode < 0;
2428 	int mlo = doall ? 0 : mnode;
2429 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2430 
2431 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2432 
2433 	if (mpss_coalesce_disable) {
2434 		return;
2435 	}
2436 
2437 	/*
2438 	 * Lock the entire freelist and coalesce what we can.
2439 	 *
2440 	 * Always promote to the largest page possible
2441 	 * first to reduce the number of page promotions.
2442 	 */
2443 	for (mnode = mlo; mnode < mhi; mnode++) {
2444 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2445 		page_freelist_lock(mnode);
2446 	}
2447 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2448 		for (mnode = mlo; mnode < mhi; mnode++) {
2449 			pgcnt_t cands = 0;
2450 			int mrange, nranges = mnode_nranges[mnode];
2451 
2452 			for (mrange = 0; mrange < nranges; mrange++) {
2453 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2454 				if (cands != 0)
2455 					break;
2456 			}
2457 			if (cands == 0) {
2458 				VM_STAT_ADD(vmm_vmstats.
2459 				    page_ctrs_cands_skip_all);
2460 				continue;
2461 			}
2462 
2463 			full = FULL_REGION_CNT(r);
2464 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2465 
2466 			for (idx = 0; idx < len; idx++) {
2467 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2468 					pfn_t pfnum =
2469 					    IDX_TO_PNUM(mnode, r, idx);
2470 					int tmnode = interleaved_mnodes ?
2471 					    PFN_2_MEM_NODE(pfnum) : mnode;
2472 
2473 					ASSERT(pfnum >=
2474 					    mem_node_config[tmnode].physbase &&
2475 					    pfnum <
2476 					    mem_node_config[tmnode].physmax);
2477 
2478 					(void) page_promote(tmnode,
2479 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2480 				}
2481 			}
2482 			/* shared hpm_counters covers all mnodes, so we quit */
2483 			if (interleaved_mnodes)
2484 				break;
2485 		}
2486 	}
2487 	for (mnode = mlo; mnode < mhi; mnode++) {
2488 		page_freelist_unlock(mnode);
2489 		rw_exit(&page_ctrs_rwlock[mnode]);
2490 	}
2491 }
2492 
2493 /*
2494  * This is where all policies for moving pages around
2495  * to different page size free lists are implemented.
2496  * Returns 1 on success, 0 on failure.
2497  *
2498  * So far these are the priorities for this algorithm in descending
2499  * order:
2500  *
2501  *	1) When servicing a request try to do so with a free page
2502  *	   from next size up. Helps defer fragmentation as long
2503  *	   as possible.
2504  *
2505  *	2) Page coalesce on demand. Only when a freelist
2506  *	   larger than PAGESIZE is empty and step 1
2507  *	   will not work since all larger size lists are
2508  *	   also empty.
2509  *
2510  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2511  */
2512 
2513 page_t *
2514 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2515     pfn_t pfnhi, page_list_walker_t *plw)
2516 {
2517 	uchar_t nszc = szc + 1;
2518 	uint_t 	bin, sbin, bin_prev;
2519 	page_t	*pp, *firstpp;
2520 	page_t	*ret_pp = NULL;
2521 	uint_t  color_mask;
2522 
2523 	if (nszc == mmu_page_sizes)
2524 		return (NULL);
2525 
2526 	ASSERT(nszc < mmu_page_sizes);
2527 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2528 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2529 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2530 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2531 
2532 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2533 	/*
2534 	 * First try to break up a larger page to fill current size freelist.
2535 	 */
2536 	while (plw->plw_bins[nszc] != 0) {
2537 
2538 		ASSERT(nszc < mmu_page_sizes);
2539 
2540 		/*
2541 		 * If page found then demote it.
2542 		 */
2543 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2544 			page_freelist_lock(mnode);
2545 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2546 
2547 			/*
2548 			 * If pfnhi is not PFNNULL, look for large page below
2549 			 * pfnhi. PFNNULL signifies no pfn requirement.
2550 			 */
2551 			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
2552 				do {
2553 					pp = pp->p_vpnext;
2554 					if (pp == firstpp) {
2555 						pp = NULL;
2556 						break;
2557 					}
2558 				} while (pp->p_pagenum >= pfnhi);
2559 			}
2560 			if (pp) {
2561 				uint_t ccolor = page_correct_color(szc, nszc,
2562 				    color, bin, plw->plw_ceq_mask[szc]);
2563 
2564 				ASSERT(pp->p_szc == nszc);
2565 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2566 				ret_pp = page_demote(mnode, pp->p_pagenum,
2567 				    pp->p_szc, szc, ccolor, PC_ALLOC);
2568 				if (ret_pp) {
2569 					page_freelist_unlock(mnode);
2570 #if defined(__sparc)
2571 					if (PP_ISNORELOC(ret_pp)) {
2572 						pgcnt_t npgs;
2573 
2574 						npgs = page_get_pagecnt(
2575 						    ret_pp->p_szc);
2576 						kcage_freemem_sub(npgs);
2577 					}
2578 #endif
2579 					return (ret_pp);
2580 				}
2581 			}
2582 			page_freelist_unlock(mnode);
2583 		}
2584 
2585 		/* loop through next size bins */
2586 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2587 		plw->plw_bins[nszc]--;
2588 
2589 		if (bin == sbin) {
2590 			uchar_t nnszc = nszc + 1;
2591 
2592 			/* we are done with this page size - check next */
2593 			if (plw->plw_bins[nnszc] == 0)
2594 				/* we have already checked next size bins */
2595 				break;
2596 
2597 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2598 			if (bin_prev != INVALID_COLOR) {
2599 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2600 				if (!((bin ^ bin_prev) &
2601 				    plw->plw_ceq_mask[nnszc]))
2602 					break;
2603 			}
2604 			ASSERT(nnszc < mmu_page_sizes);
2605 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2606 			nszc = nnszc;
2607 			ASSERT(nszc < mmu_page_sizes);
2608 		}
2609 	}
2610 
2611 	return (ret_pp);
2612 }
2613 
2614 /*
2615  * Helper routine used only by the freelist code to lock
2616  * a page. If the page is a large page then it succeeds in
2617  * locking all the constituent pages or none at all.
2618  * Returns 1 on success, 0 on failure.
2619  */
2620 static int
2621 page_trylock_cons(page_t *pp, se_t se)
2622 {
2623 	page_t	*tpp, *first_pp = pp;
2624 
2625 	/*
2626 	 * Fail if can't lock first or only page.
2627 	 */
2628 	if (!page_trylock(pp, se)) {
2629 		return (0);
2630 	}
2631 
2632 	/*
2633 	 * PAGESIZE: common case.
2634 	 */
2635 	if (pp->p_szc == 0) {
2636 		return (1);
2637 	}
2638 
2639 	/*
2640 	 * Large page case.
2641 	 */
2642 	tpp = pp->p_next;
2643 	while (tpp != pp) {
2644 		if (!page_trylock(tpp, se)) {
2645 			/*
2646 			 * On failure unlock what we have locked so far.
2647 			 * We want to avoid attempting to capture these
2648 			 * pages as the pcm mutex may be held which could
2649 			 * lead to a recursive mutex panic.
2650 			 */
2651 			while (first_pp != tpp) {
2652 				page_unlock_nocapture(first_pp);
2653 				first_pp = first_pp->p_next;
2654 			}
2655 			return (0);
2656 		}
2657 		tpp = tpp->p_next;
2658 	}
2659 	return (1);
2660 }
2661 
2662 /*
2663  * Initialize the context for walking page lists.
2664  * Called when a page of the given szc is unavailable. Sets markers
2665  * for the beginning of the search to detect when the search has
2666  * completed a full cycle. Sets flags for splitting larger pages
2667  * and coalescing smaller pages. Page walking proceeds until a page
2668  * of the desired equivalent color is found.
2669  */
2670 void
2671 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2672     int use_ceq, page_list_walker_t *plw)
2673 {
2674 	uint_t  nszc, ceq_mask, colors;
2675 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2676 
2677 	ASSERT(szc < mmu_page_sizes);
2678 	colors = PAGE_GET_PAGECOLORS(szc);
2679 
2680 	plw->plw_colors = colors;
2681 	plw->plw_color_mask = colors - 1;
2682 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2683 	plw->plw_bin_split_prev = bin;
2684 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2685 
2686 	/*
2687 	 * if vac aliasing is possible make sure lower order color
2688 	 * bits are never ignored
2689 	 */
2690 	if (vac_colors > 1)
2691 		ceq &= 0xf0;
2692 
2693 	/*
2694 	 * calculate the number of non-equivalent colors and
2695 	 * color equivalency mask
2696 	 */
2697 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2698 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2699 	ASSERT(plw->plw_ceq_dif > 0);
2700 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
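	/*
	 * Worked example (illustrative, not from the original source):
	 * with colors == 32 and ceq == 0x12 (ignore 1 high-order and 2
	 * low-order color bits, per the colorequivszc encoding), we get
	 * plw_ceq_dif = 32 >> (1 + 2) = 4 equivalence classes and
	 * plw_ceq_mask[szc] = (4 - 1) << 2 = 0xc, i.e. only color bits
	 * 2 and 3 are significant when matching equivalent colors.
	 */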
2701 
2702 	if (flags & PG_MATCH_COLOR) {
2703 		if (cpu_page_colors <  0) {
2704 			/*
2705 			 * this is a heterogeneous machine with different CPUs
2706 			 * having different size e$ (not supported for ni2/rock)
2707 			 */
2708 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2709 			cpucolors = MAX(cpucolors, 1);
2710 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2711 			plw->plw_ceq_mask[szc] =
2712 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2713 		}
2714 		plw->plw_ceq_dif = 1;
2715 	}
2716 
2717 	/* we can split pages in the freelist, but not the cachelist */
2718 	if (can_split) {
2719 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2720 
2721 		/* set next szc color masks and number of free list bins */
2722 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2723 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2724 			    plw->plw_ceq_mask[szc]);
2725 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2726 		}
2727 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2728 		plw->plw_bins[nszc] = 0;
2729 
2730 	} else {
2731 		ASSERT(szc == 0);
2732 		plw->plw_do_split = 0;
2733 		plw->plw_bins[1] = 0;
2734 		plw->plw_ceq_mask[1] = INVALID_MASK;
2735 	}
2736 }
2737 
2738 /*
2739  * set mark to flag where next split should occur
2740  */
2741 #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
2742 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
2743 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
2744 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
2745 	plw->plw_split_next =						     \
2746 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
2747 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2748 		plw->plw_split_next =					     \
2749 		INC_MASKED(plw->plw_split_next,				     \
2750 		    neq_mask, plw->plw_color_mask);			     \
2751 	}								     \
2752 }
2753 
2754 uint_t
2755 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2756 {
2757 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2758 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2759 	uchar_t nszc = szc + 1;
2760 
2761 	nbin = ADD_MASKED(bin,
2762 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2763 
2764 	if (plw->plw_do_split) {
2765 		plw->plw_bin_split_prev = bin;
2766 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2767 		plw->plw_do_split = 0;
2768 	}
2769 
2770 	if (szc == 0) {
2771 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2772 			if (nbin == plw->plw_bin0 &&
2773 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2774 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2775 				    neq_mask, plw->plw_color_mask);
2776 				plw->plw_bin_split_prev = plw->plw_bin0;
2777 			}
2778 
2779 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2780 				plw->plw_bin_marker =
2781 				    nbin = INC_MASKED(nbin, neq_mask,
2782 				    plw->plw_color_mask);
2783 				plw->plw_bin_split_prev = plw->plw_bin0;
2784 				/*
2785 				 * large pages all have the same vac color
2786 				 * so by now we should be done with next
2787 				 * size page splitting process
2788 				 */
2789 				ASSERT(plw->plw_bins[1] == 0);
2790 				plw->plw_do_split = 0;
2791 				return (nbin);
2792 			}
2793 
2794 		} else {
2795 			uint_t bin_jump = (vac_colors == 1) ?
2796 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2797 
2798 			bin_jump &= ~(vac_colors - 1);
2799 
2800 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2801 			    plw->plw_color_mask);
2802 
2803 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2804 
2805 				plw->plw_bin_marker = nbin = nbin0;
2806 
2807 				if (plw->plw_bins[nszc] != 0) {
2808 					/*
2809 					 * check if next page size bin is the
2810 					 * same as the next page size bin for
2811 					 * bin0
2812 					 */
2813 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2814 					    nbin);
2815 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2816 					    plw->plw_bin0);
2817 
2818 					if ((bin0_nsz ^ nbin_nsz) &
2819 					    plw->plw_ceq_mask[nszc])
2820 						plw->plw_do_split = 1;
2821 				}
2822 				return (nbin);
2823 			}
2824 		}
2825 	}
2826 
2827 	if (plw->plw_bins[nszc] != 0) {
2828 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2829 		if (!((plw->plw_split_next ^ nbin_nsz) &
2830 		    plw->plw_ceq_mask[nszc]))
2831 			plw->plw_do_split = 1;
2832 	}
2833 
2834 	return (nbin);
2835 }
2836 
2837 page_t *
2838 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2839     uint_t flags)
2840 {
2841 	kmutex_t		*pcm;
2842 	page_t			*pp, *first_pp;
2843 	uint_t			sbin;
2844 	int			plw_initialized;
2845 	page_list_walker_t	plw;
2846 
2847 	ASSERT(szc < mmu_page_sizes);
2848 
2849 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2850 
2851 	MTYPE_START(mnode, mtype, flags);
2852 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2853 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2854 		return (NULL);
2855 	}
2856 try_again:
2857 
2858 	plw_initialized = 0;
2859 	plw.plw_ceq_dif = 1;
2860 
2861 	/*
2862 	 * Only hold one freelist lock at a time, that way we
2863 	 * can start anywhere and not have to worry about lock
2864 	 * ordering.
2865 	 */
2866 	for (plw.plw_count = 0;
2867 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2868 		sbin = bin;
2869 		do {
2870 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2871 				goto bin_empty_1;
2872 
2873 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2874 			mutex_enter(pcm);
2875 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2876 			if (pp == NULL)
2877 				goto bin_empty_0;
2878 
2879 			/*
2880 			 * These were set before the page
2881 			 * was put on the free list,
2882 			 * they must still be set.
2883 			 */
2884 			ASSERT(PP_ISFREE(pp));
2885 			ASSERT(PP_ISAGED(pp));
2886 			ASSERT(pp->p_vnode == NULL);
2887 			ASSERT(pp->p_hash == NULL);
2888 			ASSERT(pp->p_offset == (u_offset_t)-1);
2889 			ASSERT(pp->p_szc == szc);
2890 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2891 
2892 			/*
2893 			 * Walk down the hash chain.
2894 			 * 8k pages are linked on p_next
2895 			 * and p_prev fields. Large pages
2896 			 * are a contiguous group of
2897 			 * constituent pages linked together
2898 			 * on their p_next and p_prev fields.
2899 			 * The large pages are linked together
2900 			 * on the hash chain using the p_vpnext and
2901 			 * p_vpprev fields of the base constituent
2902 			 * page of each large page.
2903 			 */
2904 			first_pp = pp;
2905 			while (!page_trylock_cons(pp, SE_EXCL)) {
2906 				if (szc == 0) {
2907 					pp = pp->p_next;
2908 				} else {
2909 					pp = pp->p_vpnext;
2910 				}
2911 
2912 				ASSERT(PP_ISFREE(pp));
2913 				ASSERT(PP_ISAGED(pp));
2914 				ASSERT(pp->p_vnode == NULL);
2915 				ASSERT(pp->p_hash == NULL);
2916 				ASSERT(pp->p_offset == (u_offset_t)-1);
2917 				ASSERT(pp->p_szc == szc);
2918 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2919 
2920 				if (pp == first_pp)
2921 					goto bin_empty_0;
2922 			}
2923 
2924 			ASSERT(pp != NULL);
2925 			ASSERT(mtype == PP_2_MTYPE(pp));
2926 			ASSERT(pp->p_szc == szc);
2927 			if (szc == 0) {
2928 				page_sub(&PAGE_FREELISTS(mnode,
2929 				    szc, bin, mtype), pp);
2930 			} else {
2931 				page_vpsub(&PAGE_FREELISTS(mnode,
2932 				    szc, bin, mtype), pp);
2933 				CHK_LPG(pp, szc);
2934 			}
2935 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2936 
2937 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2938 				panic("free page is not. pp %p", (void *)pp);
2939 			mutex_exit(pcm);
2940 
2941 #if defined(__sparc)
2942 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2943 			    (flags & PG_NORELOC) == 0);
2944 
2945 			if (PP_ISNORELOC(pp))
2946 				kcage_freemem_sub(page_get_pagecnt(szc));
2947 #endif
2948 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2949 			return (pp);
2950 
2951 bin_empty_0:
2952 			mutex_exit(pcm);
2953 bin_empty_1:
2954 			if (plw_initialized == 0) {
2955 				page_list_walk_init(szc, flags, bin, 1, 1,
2956 				    &plw);
2957 				plw_initialized = 1;
2958 				ASSERT(plw.plw_colors <=
2959 				    PAGE_GET_PAGECOLORS(szc));
2960 				ASSERT(plw.plw_colors > 0);
2961 				ASSERT((plw.plw_colors &
2962 				    (plw.plw_colors - 1)) == 0);
2963 				ASSERT(bin < plw.plw_colors);
2964 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
2965 			}
2966 			/* calculate the next bin with equivalent color */
2967 			bin = ADD_MASKED(bin, plw.plw_bin_step,
2968 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
2969 		} while (sbin != bin);
2970 
2971 		/*
2972 		 * All of the equivalent color bins are empty. Try to
2973 		 * satisfy the request by breaking up or coalescing
2974 		 * pages from a different size freelist of the correct
2975 		 * color that satisfies the ORIGINAL color requested.
2976 		 * If that fails then try pages of the same size but
2977 		 * different colors assuming we are not called with
2978 		 * PG_MATCH_COLOR.
2979 		 */
2980 		if (plw.plw_do_split &&
2981 		    (pp = page_freelist_split(szc, bin, mnode,
2982 		    mtype, PFNNULL, &plw)) != NULL)
2983 			return (pp);
2984 
2985 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
2986 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
2987 			return (pp);
2988 
2989 		if (plw.plw_ceq_dif > 1)
2990 			bin = page_list_walk_next_bin(szc, bin, &plw);
2991 	}
2992 
2993 	/* if allowed, cycle through additional mtypes */
2994 	MTYPE_NEXT(mnode, mtype, flags);
2995 	if (mtype >= 0)
2996 		goto try_again;
2997 
2998 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2999 
3000 	return (NULL);
3001 }
3002 
3003 /*
3004  * Returns the count of free pages for 'pp' with size code 'szc'.
3005  * Note: This function does not return an exact value as the page freelist
3006  * locks are not held and thus the values in the page_counters may be
3007  * changing as we walk through the data.
3008  */
3009 static int
3010 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3011 {
3012 	pgcnt_t	pgfree;
3013 	pgcnt_t cnt;
3014 	ssize_t	r = szc;	/* region size */
3015 	ssize_t	idx;
3016 	int	i;
3017 	int	full, range;
3018 
3019 	/* Make sure pagenum passed in is aligned properly */
3020 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3021 	ASSERT(szc > 0);
3022 
3023 	/* Prevent page_counters dynamic memory from being freed */
3024 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3025 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3026 	cnt = PAGE_COUNTERS(mnode, r, idx);
3027 	pgfree = cnt << PNUM_SHIFT(r - 1);
3028 	range = FULL_REGION_CNT(szc);
3029 
3030 	/* Check for completely full region */
3031 	if (cnt == range) {
3032 		rw_exit(&page_ctrs_rwlock[mnode]);
3033 		return (pgfree);
3034 	}
3035 
3036 	while (--r > 0) {
3037 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3038 		full = FULL_REGION_CNT(r);
3039 		for (i = 0; i < range; i++, idx++) {
3040 			cnt = PAGE_COUNTERS(mnode, r, idx);
3041 			/*
3042 			 * If cnt here is full, that means we have already
3043 			 * accounted for these pages earlier.
3044 			 */
3045 			if (cnt != full) {
3046 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3047 			}
3048 		}
3049 		range *= full;
3050 	}
3051 	rw_exit(&page_ctrs_rwlock[mnode]);
3052 	return (pgfree);
3053 }
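
/*
 * Worked example for page_freecnt() (illustrative; assumes a hypothetical
 * FULL_REGION_CNT() of 8 at each level): if the counter for the szc-level
 * region is 3, then 3 of its 8 next-smaller subregions are completely free,
 * contributing 3 << PNUM_SHIFT(szc - 1) base pages.  The loop then descends
 * a level and adds the counters of the subregions that are not completely
 * full, since a full counter means those pages were already accounted for
 * above; this repeats down to the PAGESIZE level.
 */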
3054 
3055 /*
3056  * Called from page_geti_contig_pages to exclusively lock constituent pages
3057  * starting from 'spp' for page size code 'szc'.
3058  *
3059  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3060  * region needs to be greater than or equal to the threshold.
3061  */
3062 static int
3063 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3064 {
3065 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3066 	pgcnt_t pgfree, i;
3067 	page_t *pp;
3068 
3069 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3070 
3071 
3072 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3073 		goto skipptcpcheck;
3074 	/*
3075 	 * check if there are sufficient free pages available before attempting
3076 	 * to trylock. Count is approximate as page counters can change.
3077 	 */
3078 	pgfree = page_freecnt(mnode, spp, szc);
3079 
3080 	/* attempt to trylock if there are sufficient already free pages */
3081 	if (pgfree < pgcnt/ptcpthreshold) {
3082 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3083 		return (0);
3084 	}
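	/*
	 * Illustrative arithmetic (hypothetical values): with pgcnt == 512
	 * constituent pages and ptcpthreshold == 4, at least 512 / 4 = 128
	 * of the constituent pages must already be free; otherwise the
	 * function returns 0 here without attempting the per-page trylock
	 * pass below.
	 */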
3085 
3086 skipptcpcheck:
3087 
3088 	for (i = 0; i < pgcnt; i++) {
3089 		pp = &spp[i];
3090 		if (!page_trylock(pp, SE_EXCL)) {
3091 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3092 			while (--i != (pgcnt_t)-1) {
3093 				pp = &spp[i];
3094 				ASSERT(PAGE_EXCL(pp));
3095 				page_unlock_nocapture(pp);
3096 			}
3097 			return (0);
3098 		}
3099 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3100 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3101 		    !PP_ISFREE(pp)) {
3102 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3103 			ASSERT(i == 0);
3104 			page_unlock_nocapture(pp);
3105 			return (0);
3106 		}
3107 		if (PP_ISNORELOC(pp)) {
3108 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3109 			while (i != (pgcnt_t)-1) {
3110 				pp = &spp[i];
3111 				ASSERT(PAGE_EXCL(pp));
3112 				page_unlock_nocapture(pp);
3113 				i--;
3114 			}
3115 			return (0);
3116 		}
3117 	}
3118 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3119 	return (1);
3120 }
3121 
3122 /*
3123  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3124  * of 'szc' constituent pages that had been locked exclusively previously.
3125  * Will attempt to relocate constituent pages in use.
3126  */
3127 static page_t *
3128 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3129 {
3130 	spgcnt_t pgcnt, npgs, i;
3131 	page_t *targpp, *rpp, *hpp;
3132 	page_t *replpp = NULL;
3133 	page_t *pplist = NULL;
3134 
3135 	ASSERT(pp != NULL);
3136 
3137 	pgcnt = page_get_pagecnt(szc);
3138 	while (pgcnt) {
3139 		ASSERT(PAGE_EXCL(pp));
3140 		ASSERT(!PP_ISNORELOC(pp));
3141 		if (PP_ISFREE(pp)) {
3142 			/*
3143 			 * If this is a PG_FREE_LIST page then its
3144 			 * size code can change underneath us due to
3145 			 * page promotion or demotion. As an optimization,
3146 			 * use page_list_sub_pages() instead of
3147 			 * page_list_sub().
3148 			 */
3149 			if (PP_ISAGED(pp)) {
3150 				page_list_sub_pages(pp, szc);
3151 				if (pp->p_szc == szc) {
3152 					return (pp);
3153 				}
3154 				ASSERT(pp->p_szc < szc);
3155 				npgs = page_get_pagecnt(pp->p_szc);
3156 				hpp = pp;
3157 				for (i = 0; i < npgs; i++, pp++) {
3158 					pp->p_szc = szc;
3159 				}
3160 				page_list_concat(&pplist, &hpp);
3161 				pgcnt -= npgs;
3162 				continue;
3163 			}
3164 			ASSERT(!PP_ISAGED(pp));
3165 			ASSERT(pp->p_szc == 0);
3166 			page_list_sub(pp, PG_CACHE_LIST);
3167 			page_hashout(pp, NULL);
3168 			PP_SETAGED(pp);
3169 			pp->p_szc = szc;
3170 			page_list_concat(&pplist, &pp);
3171 			pp++;
3172 			pgcnt--;
3173 			continue;
3174 		}
3175 		npgs = page_get_pagecnt(pp->p_szc);
3176 
3177 		/*
3178 		 * page_create_wait freemem accounting is done by the caller
3179 		 * of page_get_freelist, so it is not necessary to call it
3180 		 * prior to calling page_get_replacement_page.
3181 		 *
3182 		 * page_get_replacement_page can call page_get_contig_pages
3183 		 * to acquire a large page (szc > 0); the replacement must be
3184 		 * smaller than the contig page size to avoid looping or
3185 		 * szc == 0 and PGI_PGCPSZC0 is set.
3186 		 */
3187 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3188 			replpp = page_get_replacement_page(pp, NULL, 0);
3189 			if (replpp) {
3190 				npgs = page_get_pagecnt(pp->p_szc);
3191 				ASSERT(npgs <= pgcnt);
3192 				targpp = pp;
3193 			}
3194 		}
3195 
3196 		/*
3197 		 * If replacement is NULL or do_page_relocate fails, fail
3198 		 * coalescing of pages.
3199 		 */
3200 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3201 		    &npgs, NULL) != 0)) {
3202 			/*
3203 			 * Unlock un-processed target list
3204 			 */
3205 			while (pgcnt--) {
3206 				ASSERT(PAGE_EXCL(pp));
3207 				page_unlock_nocapture(pp);
3208 				pp++;
3209 			}
3210 			/*
3211 			 * Free the processed target list.
3212 			 */
3213 			while (pplist) {
3214 				pp = pplist;
3215 				page_sub(&pplist, pp);
3216 				ASSERT(PAGE_EXCL(pp));
3217 				ASSERT(pp->p_szc == szc);
3218 				ASSERT(PP_ISFREE(pp));
3219 				ASSERT(PP_ISAGED(pp));
3220 				pp->p_szc = 0;
3221 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3222 				page_unlock_nocapture(pp);
3223 			}
3224 
3225 			if (replpp != NULL)
3226 				page_free_replacement_page(replpp);
3227 
3228 			return (NULL);
3229 		}
3230 		ASSERT(pp == targpp);
3231 
3232 		/* LINTED */
3233 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3234 
3235 		pp += npgs;
3236 		pgcnt -= npgs;
3237 
3238 		while (npgs--) {
3239 			ASSERT(PAGE_EXCL(targpp));
3240 			ASSERT(!PP_ISFREE(targpp));
3241 			ASSERT(!PP_ISNORELOC(targpp));
3242 			PP_SETFREE(targpp);
3243 			ASSERT(PP_ISAGED(targpp));
3244 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3245 			    (flags & PGI_PGCPSZC0)));
3246 			targpp->p_szc = szc;
3247 			targpp = targpp->p_next;
3248 
3249 			rpp = replpp;
3250 			ASSERT(rpp != NULL);
3251 			page_sub(&replpp, rpp);
3252 			ASSERT(PAGE_EXCL(rpp));
3253 			ASSERT(!PP_ISFREE(rpp));
3254 			page_unlock_nocapture(rpp);
3255 		}
3256 		ASSERT(targpp == hpp);
3257 		ASSERT(replpp == NULL);
3258 		page_list_concat(&pplist, &targpp);
3259 	}
3260 	CHK_LPG(pplist, szc);
3261 	return (pplist);
3262 }
3263 
3264 /*
3265  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3266  * of 0 means nothing left after trim.
3267  */
3268 int
3269 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3270 {
3271 	pfn_t	kcagepfn;
3272 	int	decr;
3273 	int	rc = 0;
3274 
3275 	if (PP_ISNORELOC(mseg->pages)) {
3276 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3277 
3278 			/* lower part of this mseg inside kernel cage */
3279 			decr = kcage_current_pfn(&kcagepfn);
3280 
3281 			/* kernel cage may have transitioned past mseg */
3282 			if (kcagepfn >= mseg->pages_base &&
3283 			    kcagepfn < mseg->pages_end) {
3284 				ASSERT(decr == 0);
3285 				*lo = MAX(kcagepfn, pfnlo);
3286 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
3287 				rc = 1;
3288 			}
3289 		}
3290 		/* else entire mseg in the cage */
3291 	} else {
3292 		if (PP_ISNORELOC(mseg->epages - 1)) {
3293 
3294 			/* upper part of this mseg inside kernel cage */
3295 			decr = kcage_current_pfn(&kcagepfn);
3296 
3297 			/* kernel cage may have transitioned past mseg */
3298 			if (kcagepfn >= mseg->pages_base &&
3299 			    kcagepfn < mseg->pages_end) {
3300 				ASSERT(decr);
3301 				*hi = MIN(kcagepfn, pfnhi);
3302 				*lo = MAX(pfnlo, mseg->pages_base);
3303 				rc = 1;
3304 			}
3305 		} else {
3306 			/* entire mseg outside of kernel cage */
3307 			*lo = MAX(pfnlo, mseg->pages_base);
3308 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3309 			rc = 1;
3310 		}
3311 	}
3312 	return (rc);
3313 }
3314 
3315 /*
3316  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3317  * page with size code 'szc'. Claiming such a page requires acquiring
3318  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3319  * relocating pages in use and concatenating these constituent pages into a
3320  * large page.
3321  *
3322  * The page lists do not have such a large page and page_freelist_split has
3323  * already failed to demote larger pages and/or coalesce smaller free pages.
3324  *
3325  * 'flags' may specify PG_MATCH_COLOR, which would limit the search to large
3326  * pages with the same color as 'bin'.
3327  *
3328  * 'pfnflag' specifies the subset of the pfn range to search.
3329  */
3330 
3331 static page_t *
3332 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3333     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3334 {
3335 	struct memseg *mseg;
3336 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3337 	pgcnt_t szcpgmask = szcpgcnt - 1;
3338 	pfn_t	randpfn;
3339 	page_t *pp, *randpp, *endpp;
3340 	uint_t colors, ceq_mask;
3341 	/* LINTED : set but not used in function */
3342 	uint_t color_mask;
3343 	pfn_t hi, lo;
3344 	uint_t skip;
3345 	MEM_NODE_ITERATOR_DECL(it);
3346 
3347 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3348 
3349 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3350 
3351 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3352 		return (NULL);
3353 
3354 	ASSERT(szc < mmu_page_sizes);
3355 
3356 	colors = PAGE_GET_PAGECOLORS(szc);
3357 	color_mask = colors - 1;
3358 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3359 		uchar_t ceq = colorequivszc[szc];
3360 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3361 
3362 		ASSERT(ceq_dif > 0);
3363 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3364 	} else {
3365 		ceq_mask = 0;
3366 	}
3367 
3368 	ASSERT(bin < colors);
3369 
3370 	/* clear "non-significant" color bits */
3371 	bin &= ceq_mask;
3372 
3373 	/*
3374 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3375 	 * when there have been previous page_get_contig_page failures to
3376 	 * limit the search.
3377 	 *
3378 	 * The high bit in pfnflag specifies the number of 'slots' in the
3379 	 * pfn range and the remainder of pfnflag specifies which slot.
3380 	 * For example, a value of 1010b would mean the second slot of
3381 	 * the pfn range that has been divided into 8 slots.
3382 	 */
3383 	if (pfnflag > 1) {
3384 		int	slots = 1 << (highbit(pfnflag) - 1);
3385 		int	slotid = pfnflag & (slots - 1);
3386 		pgcnt_t	szcpages;
3387 		int	slotlen;
3388 
3389 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3390 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3391 		slotlen = howmany(szcpages, slots);
3392 		/* skip if 'slotid' slot is empty */
3393 		if (slotid * slotlen >= szcpages)
3394 			return (NULL);
3395 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3396 		ASSERT(pfnlo < pfnhi);
3397 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3398 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3399 	}
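
	/*
	 * Worked example (illustrative, hypothetical numbers): with
	 * pfnflag == 0xa (1010b), slots = 8 and slotid = 2.  If the
	 * range holds szcpages = 1600 szc-sized candidates, then
	 * slotlen = howmany(1600, 8) = 200, pfnlo is advanced by
	 * (2 * 200) * szcpgcnt pfns and pfnhi is capped 200 candidates
	 * later, so only 1/8th of the original range is searched.
	 */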
3400 
3401 	memsegs_lock(0);
3402 
3403 	/*
3404 	 * loop through memsegs to look for contig page candidates
3405 	 */
3406 
3407 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3408 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3409 			/* no overlap */
3410 			continue;
3411 		}
3412 
3413 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3414 			/* mseg too small */
3415 			continue;
3416 
3417 		/*
3418 		 * trim off kernel cage pages from pfn range and check for
3419 		 * a trimmed pfn range returned that does not span the
3420 		 * desired large page size.
3421 		 */
3422 		if (kcage_on) {
3423 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3424 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3425 				continue;
3426 		} else {
3427 			lo = MAX(pfnlo, mseg->pages_base);
3428 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3429 		}
3430 
3431 		/* round to szcpgcnt boundaries */
3432 		lo = P2ROUNDUP(lo, szcpgcnt);
3433 
3434 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3435 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3436 
3437 		if (hi <= lo)
3438 			continue;
3439 
3440 		/*
3441 		 * set lo to point to the pfn for the desired bin. Large
3442 		 * page sizes may only have a single page color
3443 		 */
3444 		skip = szcpgcnt;
3445 		if (ceq_mask > 0 || interleaved_mnodes) {
3446 			/* set lo to point at appropriate color */
3447 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3448 			    (interleaved_mnodes &&
3449 			    PFN_2_MEM_NODE(lo) != mnode)) {
3450 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3451 				    color_mask, &it);
3452 			}
3453 			if (hi <= lo)
3454 				/* mseg cannot satisfy color request */
3455 				continue;
3456 		}
3457 
3458 		/* randomly choose a point between lo and hi to begin search */
3459 
3460 		randpfn = (pfn_t)GETTICK();
3461 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3462 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3463 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3464 			if (randpfn != (pfn_t)-1) {
3465 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3466 				    ceq_mask, color_mask, &it);
3467 			}
3468 			if (randpfn >= hi) {
3469 				randpfn = lo;
3470 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3471 				    &it);
3472 			}
3473 		}
3474 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3475 
3476 		ASSERT(randpp->p_pagenum == randpfn);
3477 
3478 		pp = randpp;
3479 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3480 
3481 		ASSERT(randpp + szcpgcnt <= endpp);
3482 
3483 		do {
3484 			ASSERT(!(pp->p_pagenum & szcpgmask));
3485 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3486 
3487 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3488 				/* pages unlocked by page_claim on failure */
3489 				if (page_claim_contig_pages(pp, szc, flags)) {
3490 					memsegs_unlock(0);
3491 					return (pp);
3492 				}
3493 			}
3494 
3495 			if (ceq_mask == 0 && !interleaved_mnodes) {
3496 				pp += skip;
3497 			} else {
3498 				pfn_t pfn = pp->p_pagenum;
3499 
3500 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3501 				    ceq_mask, color_mask, &it);
3502 				if (pfn == (pfn_t)-1) {
3503 					pp = endpp;
3504 				} else {
3505 					pp = mseg->pages +
3506 					    (pfn - mseg->pages_base);
3507 				}
3508 			}
3509 			if (pp >= endpp) {
3510 				/* start from the beginning */
3511 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3512 				pp = mseg->pages + (lo - mseg->pages_base);
3513 				ASSERT(pp->p_pagenum == lo);
3514 				ASSERT(pp + szcpgcnt <= endpp);
3515 			}
3516 		} while (pp != randpp);
3517 	}
3518 	memsegs_unlock(0);
3519 	return (NULL);
3520 }
3521 
3522 
3523 /*
3524  * Controlling routine that searches through physical memory in an attempt
3525  * to claim a large page, based on the input parameters, when no such page
3526  * is available on the page free lists.
3527  *
3528  * calls page_geti_contig_pages with an initial pfn range from the mnode
3529  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3530  * that overlaps with the kernel cage or does not match the requested page
3531  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3532  * page_geti_contig_pages may further limit the search range based on
3533  * previous failure counts (pgcpfailcnt[]).
3534  *
3535  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3536  * pagesize page that satisfies mtype.
3537  */
3538 page_t *
3539 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3540     uint_t flags)
3541 {
3542 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3543 	page_t		*pp;
3544 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3545 
3546 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3547 
3548 	/* no allocations from cage */
3549 	flags |= PGI_NOCAGE;
3550 
3551 	/* LINTED */
3552 	MTYPE_START(mnode, mtype, flags);
3553 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3554 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3555 		return (NULL);
3556 	}
3557 
3558 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3559 
3560 	/* do not limit search and ignore color if hi pri */
3561 
3562 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3563 		pfnflag = pgcpfailcnt[szc];
3564 
3565 	/* remove color match to improve chances */
3566 
3567 	if (flags & PGI_PGCPHIPRI || pfnflag)
3568 		flags &= ~PG_MATCH_COLOR;
3569 
3570 	do {
3571 		/* get pfn range based on mnode and mtype */
3572 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3573 
3574 		ASSERT(pfnhi >= pfnlo);
3575 
3576 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3577 		    pfnlo, pfnhi, pfnflag);
3578 
3579 		if (pp != NULL) {
3580 			pfnflag = pgcpfailcnt[szc];
3581 			if (pfnflag) {
3582 				/* double the search size */
3583 				pgcpfailcnt[szc] = pfnflag >> 1;
3584 			}
3585 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3586 			return (pp);
3587 		}
3588 		MTYPE_NEXT(mnode, mtype, flags);
3589 	} while (mtype >= 0);
3590 
3591 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3592 	return (NULL);
3593 }
3594 
3595 #if defined(__i386) || defined(__amd64)
3596 /*
3597  * Determine the likelihood of finding/coalescing a szc page.
3598  * Return 0 if the likelihood is small otherwise return 1.
3599  *
3600  * For now, be conservative and check only 1g pages and return 0
3601  * if there had been previous coalescing failures and the szc pages
3602  * needed to satisfy request would exhaust most of freemem.
3603  */
3604 int
3605 page_chk_freelist(uint_t szc)
3606 {
3607 	pgcnt_t		pgcnt;
3608 
3609 	if (szc <= 1)
3610 		return (1);
3611 
3612 	pgcnt = page_get_pagecnt(szc);
3613 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3614 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3615 		return (0);
3616 	}
3617 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3618 	return (1);
3619 }
3620 #endif
3621 
3622 /*
3623  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3624  *
3625  * Does its own locking and accounting.
3626  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3627  * pages of the proper color even if there are pages of a different color.
3628  *
3629  * Finds a page, removes it, THEN locks it.
3630  */
3631 
3632 /*ARGSUSED*/
3633 page_t *
3634 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3635 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3636 {
3637 	struct as	*as = seg->s_as;
3638 	page_t		*pp = NULL;
3639 	ulong_t		bin;
3640 	uchar_t		szc;
3641 	int		mnode;
3642 	int		mtype;
3643 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3644 	lgrp_mnode_cookie_t	lgrp_cookie;
3645 
3646 	page_get_func = page_get_mnode_freelist;
3647 
3648 	/*
3649 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3650 	 * assume we wish to allocate near the current thread's home.
3651 	 */
3652 	if (!LGRP_EXISTS(lgrp))
3653 		lgrp = lgrp_home_lgrp();
3654 
3655 	if (kcage_on) {
3656 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3657 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3658 		    curthread != kcage_cageout_thread) {
3659 			/*
3660 			 * Set a "reserve" of kcage_throttlefree pages for
3661 			 * PG_PANIC and cageout thread allocations.
3662 			 *
3663 			 * Everybody else has to serialize in
3664 			 * page_create_get_something() to get a cage page, so
3665 			 * that we don't deadlock cageout!
3666 			 */
3667 			return (NULL);
3668 		}
3669 	} else {
3670 		flags &= ~PG_NORELOC;
3671 		flags |= PGI_NOCAGE;
3672 	}
3673 
3674 	/* LINTED */
3675 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3676 
3677 	/*
3678 	 * Convert size to page size code.
3679 	 */
3680 	if ((szc = page_szc(size)) == (uchar_t)-1)
3681 		panic("page_get_freelist: illegal page size request");
3682 	ASSERT(szc < mmu_page_sizes);
3683 
3684 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3685 
3686 	/* LINTED */
3687 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3688 
3689 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3690 
3691 	/*
3692 	 * Try to get a local page first, but try remote if we can't
3693 	 * get a page of the right color.
3694 	 */
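	/*
	 * On a retry via the pgretry label below, page_get_func has been
	 * switched to page_get_contig_pages(), so the same local-then-remote
	 * mnode walk is reused for the large page coalescing pass.
	 */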
3695 pgretry:
3696 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3697 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3698 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3699 		if (pp != NULL) {
3700 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3701 			DTRACE_PROBE4(page__get,
3702 			    lgrp_t *, lgrp,
3703 			    int, mnode,
3704 			    ulong_t, bin,
3705 			    uint_t, flags);
3706 			return (pp);
3707 		}
3708 	}
3709 	ASSERT(pp == NULL);
3710 
3711 	/*
3712 	 * For PAGESIZE requests without PGI_PGCPSZC0 set, defer to the
3713 	 * cachelist: the caller is expected to call page_get_cachelist(),
3714 	 * which checks local cache lists and remote free lists.
3715 	 */
3716 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3717 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3718 		return (NULL);
3719 	}
3720 
3721 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3722 
3723 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3724 
3725 	if (!(flags & PG_LOCAL)) {
3726 		/*
3727 		 * Try to get a non-local freelist page.
3728 		 */
3729 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3730 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3731 			pp = page_get_func(mnode, bin, mtype, szc, flags);
3732 			if (pp != NULL) {
3733 				DTRACE_PROBE4(page__get,
3734 				    lgrp_t *, lgrp,
3735 				    int, mnode,
3736 				    ulong_t, bin,
3737 				    uint_t, flags);
3738 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3739 				return (pp);
3740 			}
3741 		}
3742 		ASSERT(pp == NULL);
3743 	}
3744 
3745 	/*
3746 	 * When the cage is off, page_get_contig_pages() is likely to fail to
3747 	 * lock a large page chunk, so in that case it is not called by
3748 	 * default.  This can be changed via /etc/system.
3749 	 *
3750 	 * page_get_contig_pages() is also called to acquire a base pagesize
3751 	 * page for page_create_get_something().
3752 	 */
3753 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3754 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3755 	    (page_get_func != page_get_contig_pages)) {
3756 
3757 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3758 		page_get_func = page_get_contig_pages;
3759 		goto pgretry;
3760 	}
3761 
3762 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3763 	    page_get_func == page_get_contig_pages)
3764 		SETPGCPFAILCNT(szc);
3765 
3766 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3767 	return (NULL);
3768 }
3769 
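/*
 * For illustration only: a minimal sketch of a freelist request for one
 * base page near the caller's home lgroup; vp, off, seg, vaddr and flags
 * are placeholders supplied by a real caller such as page_create_va().
 *
 *	pp = page_get_freelist(vp, off, seg, vaddr, MMU_PAGESIZE,
 *	    flags, NULL);
 *	if (pp == NULL)
 *		pp = page_get_cachelist(vp, off, seg, vaddr, flags, NULL);
 */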
3770 /*
3771  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3772  *
3773  * Does its own locking.
3774  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3775  * pages of the proper color even if there are pages of a different color.
3776  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3777  * try to lock one of them.  If no page can be locked, try the
3778  * next bin.  Return NULL if a page cannot be found and locked.
3779  *
3780  * Finds a page, tries to lock it, then removes it.
3781  */
3782 
3783 /*ARGSUSED*/
3784 page_t *
3785 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3786     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3787 {
3788 	page_t		*pp;
3789 	struct as	*as = seg->s_as;
3790 	ulong_t		bin;
3791 	/*LINTED*/
3792 	int		mnode;
3793 	int		mtype;
3794 	lgrp_mnode_cookie_t	lgrp_cookie;
3795 
3796 	/*
3797 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3798 	 * assume we wish to allocate near the current thread's home.
3799 	 */
3800 	if (!LGRP_EXISTS(lgrp))
3801 		lgrp = lgrp_home_lgrp();
3802 
3803 	if (!kcage_on) {
3804 		flags &= ~PG_NORELOC;
3805 		flags |= PGI_NOCAGE;
3806 	}
3807 
3808 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3809 	    kcage_freemem <= kcage_throttlefree) {
3810 		/*
3811 		 * Reserve kcage_throttlefree pages for critical kernel
3812 		 * threads.
3813 		 *
3814 		 * Everybody else has to go to page_create_get_something()
3815 		 * to get a cage page, so we don't deadlock cageout.
3816 		 */
3817 		return (NULL);
3818 	}
3819 
3820 	/* LINTED */
3821 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3822 
3823 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3824 
3825 	/* LINTED */
3826 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3827 
3828 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3829 
3830 	/*
3831 	 * Try local cachelists first
3832 	 */
3833 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3834 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3835 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3836 		if (pp != NULL) {
3837 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3838 			DTRACE_PROBE4(page__get,
3839 			    lgrp_t *, lgrp,
3840 			    int, mnode,
3841 			    ulong_t, bin,
3842 			    uint_t, flags);
3843 			return (pp);
3844 		}
3845 	}
3846 
3847 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3848 
3849 	/*
3850 	 * Try freelists/cachelists that are farther away
3851 	 * This is our only chance to allocate remote pages for PAGESIZE
3852 	 * requests.
3853 	 */
3854 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3855 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3856 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3857 		    0, flags);
3858 		if (pp != NULL) {
3859 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3860 			DTRACE_PROBE4(page__get,
3861 			    lgrp_t *, lgrp,
3862 			    int, mnode,
3863 			    ulong_t, bin,
3864 			    uint_t, flags);
3865 			return (pp);
3866 		}
3867 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3868 		if (pp != NULL) {
3869 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3870 			DTRACE_PROBE4(page__get,
3871 			    lgrp_t *, lgrp,
3872 			    int, mnode,
3873 			    ulong_t, bin,
3874 			    uint_t, flags);
3875 			return (pp);
3876 		}
3877 	}
3878 
3879 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3880 	return (NULL);
3881 }
3882 
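/*
 * Grab a base pagesize page of the requested color (or an equivalent
 * color) from the cachelists of the given mnode/mtype.  The page comes
 * back held SE_EXCL and already removed from its cachelist and the page
 * counters, but it keeps its vnode identity; callers that want an
 * anonymous page hash it out themselves, e.g.
 *
 *	pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
 *	if (pp != NULL) {
 *		page_hashout(pp, NULL);
 *		PP_SETAGED(pp);
 *	}
 *
 * as page_get_replacement_page() does below.
 */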
3883 page_t *
3884 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3885 {
3886 	kmutex_t		*pcm;
3887 	page_t			*pp, *first_pp;
3888 	uint_t			sbin;
3889 	int			plw_initialized;
3890 	page_list_walker_t	plw;
3891 
3892 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3893 
3894 	/* LINTED */
3895 	MTYPE_START(mnode, mtype, flags);
3896 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3897 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3898 		return (NULL);
3899 	}
3900 
3901 try_again:
3902 
3903 	plw_initialized = 0;
3904 	plw.plw_ceq_dif = 1;
3905 
3906 	/*
3907 	 * Only hold one cachelist lock at a time, that way we
3908 	 * can start anywhere and not have to worry about lock
3909 	 * ordering.
3910 	 */
3911 
3912 	for (plw.plw_count = 0;
3913 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3914 		sbin = bin;
3915 		do {
3916 
3917 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
3918 				goto bin_empty_1;
3919 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3920 			mutex_enter(pcm);
3921 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3922 			if (pp == NULL)
3923 				goto bin_empty_0;
3924 
3925 			first_pp = pp;
3926 			ASSERT(pp->p_vnode);
3927 			ASSERT(PP_ISAGED(pp) == 0);
3928 			ASSERT(pp->p_szc == 0);
3929 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3930 			while (!page_trylock(pp, SE_EXCL)) {
3931 				pp = pp->p_next;
3932 				ASSERT(pp->p_szc == 0);
3933 				if (pp == first_pp) {
3934 					/*
3935 					 * We have searched the complete list!
3936 					 * All of them (there might be only one)
3937 					 * are locked. This can happen since
3938 					 * these pages can also be found via
3939 					 * the hash list. When found via the
3940 					 * hash list, they are locked first,
3941 					 * then removed. We give up to let the
3942 					 * other thread run.
3943 					 */
3944 					pp = NULL;
3945 					break;
3946 				}
3947 				ASSERT(pp->p_vnode);
3948 				ASSERT(PP_ISFREE(pp));
3949 				ASSERT(PP_ISAGED(pp) == 0);
3950 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3951 				    mnode);
3952 			}
3953 
3954 			if (pp) {
3955 				page_t	**ppp;
3956 				/*
3957 				 * Found and locked a page.
3958 				 * Pull it off the list.
3959 				 */
3960 				ASSERT(mtype == PP_2_MTYPE(pp));
3961 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3962 				page_sub(ppp, pp);
3963 				/*
3964 				 * Subtract counters before releasing pcm mutex
3965 				 * to avoid a race with page_freelist_coalesce
3966 				 * and page_freelist_split.
3967 				 */
3968 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3969 				mutex_exit(pcm);
3970 				ASSERT(pp->p_vnode);
3971 				ASSERT(PP_ISAGED(pp) == 0);
3972 #if defined(__sparc)
3973 				ASSERT(!kcage_on ||
3974 				    (flags & PG_NORELOC) == 0 ||
3975 				    PP_ISNORELOC(pp));
3976 				if (PP_ISNORELOC(pp)) {
3977 					kcage_freemem_sub(1);
3978 				}
3979 #endif
3980 				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
3981 				return (pp);
3982 			}
3983 bin_empty_0:
3984 			mutex_exit(pcm);
3985 bin_empty_1:
3986 			if (plw_initialized == 0) {
3987 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
3988 				plw_initialized = 1;
3989 			}
3990 			/* calculate the next bin with equivalent color */
3991 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3992 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
3993 		} while (sbin != bin);
3994 
3995 		if (plw.plw_ceq_dif > 1)
3996 			bin = page_list_walk_next_bin(0, bin, &plw);
3997 	}
3998 
3999 	MTYPE_NEXT(mnode, mtype, flags);
4000 	if (mtype >= 0)
4001 		goto try_again;
4002 
4003 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4004 	return (NULL);
4005 }
4006 
4007 #ifdef DEBUG
4008 #define	REPL_PAGE_STATS
4009 #endif /* DEBUG */
4010 
4011 #ifdef REPL_PAGE_STATS
4012 struct repl_page_stats {
4013 	uint_t	ngets;
4014 	uint_t	ngets_noreloc;
4015 	uint_t	npgr_noreloc;
4016 	uint_t	nnopage_first;
4017 	uint_t	nnopage;
4018 	uint_t	nhashout;
4019 	uint_t	nnofree;
4020 	uint_t	nnext_pp;
4021 } repl_page_stats;
4022 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
4023 #else /* REPL_PAGE_STATS */
4024 #define	REPL_STAT_INCR(v)
4025 #endif /* REPL_PAGE_STATS */
4026 
4027 int	pgrppgcp;
4028 
4029 /*
4030  * The freemem accounting must be done by the caller.
4031  * First we try to get a replacement page of the same size as like_pp,
4032  * if that is not possible, then we just get a set of discontiguous
4033  * PAGESIZE pages.
4034  */
4035 page_t *
4036 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4037     uint_t pgrflags)
4038 {
4039 	page_t		*like_pp;
4040 	page_t		*pp, *pplist;
4041 	page_t		*pl = NULL;
4042 	ulong_t		bin;
4043 	int		mnode, page_mnode;
4044 	int		szc;
4045 	spgcnt_t	npgs, pg_cnt;
4046 	pfn_t		pfnum;
4047 	int		mtype;
4048 	int		flags = 0;
4049 	lgrp_mnode_cookie_t	lgrp_cookie;
4050 	lgrp_t		*lgrp;
4051 
4052 	REPL_STAT_INCR(ngets);
4053 	like_pp = orig_like_pp;
4054 	ASSERT(PAGE_EXCL(like_pp));
4055 
4056 	szc = like_pp->p_szc;
4057 	npgs = page_get_pagecnt(szc);
4058 	/*
4059 	 * Now we reset like_pp to the base page_t.
4060 	 * That way, we won't walk past the end of this 'szc' page.
4061 	 */
4062 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4063 	like_pp = page_numtopp_nolock(pfnum);
4064 	ASSERT(like_pp->p_szc == szc);
4065 
4066 	if (PP_ISNORELOC(like_pp)) {
4067 		ASSERT(kcage_on);
4068 		REPL_STAT_INCR(ngets_noreloc);
4069 		flags = PGI_RELOCONLY;
4070 	} else if (pgrflags & PGR_NORELOC) {
4071 		ASSERT(kcage_on);
4072 		REPL_STAT_INCR(npgr_noreloc);
4073 		flags = PG_NORELOC;
4074 	}
4075 
4076 	/*
4077 	 * Kernel pages must always be replaced with the same size
4078 	 * pages, since we cannot properly handle demotion of kernel
4079 	 * pages.
4080 	 */
4081 	if (PP_ISKAS(like_pp))
4082 		pgrflags |= PGR_SAMESZC;
4083 
4084 	/* LINTED */
4085 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4086 
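	/*
	 * Each pass of the loop below gathers one chunk of pg_cnt
	 * constituent pages (ideally a whole szc page) and concatenates it
	 * onto pl, until all npgs replacement pages have been collected.
	 */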
4087 	while (npgs) {
4088 		pplist = NULL;
4089 		for (;;) {
4090 			pg_cnt = page_get_pagecnt(szc);
4091 			bin = PP_2_BIN(like_pp);
4092 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4093 			ASSERT(pg_cnt <= npgs);
4094 
4095 			/*
4096 			 * If an lgroup was specified, try to get the
4097 			 * page from that lgroup.
4098 			 * NOTE: Must be careful with code below because
4099 			 *	 lgroup may disappear and reappear since there
4100 			 *	 is no locking for lgroup here.
4101 			 */
4102 			if (LGRP_EXISTS(lgrp_target)) {
4103 				/*
4104 				 * Keep local variable for lgroup separate
4105 				 * from lgroup argument since this code should
4106 				 * only be exercised when the lgroup argument
4107 				 * exists.
4108 				 */
4109 				lgrp = lgrp_target;
4110 
4111 				/* Try the lgroup's freelists first */
4112 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4113 				    LGRP_SRCH_LOCAL);
4114 				while ((pplist == NULL) &&
4115 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4116 				    != -1) {
4117 					pplist =
4118 					    page_get_mnode_freelist(mnode, bin,
4119 					    mtype, szc, flags);
4120 				}
4121 
4122 				/*
4123 				 * Now try its cachelists if this is a
4124 				 * small page. Don't need to do it for
4125 				 * larger ones since page_freelist_coalesce()
4126 				 * already failed.
4127 				 */
4128 				if (pplist != NULL || szc != 0)
4129 					break;
4130 
4131 				/* Now try its cachelists */
4132 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4133 				    LGRP_SRCH_LOCAL);
4134 
4135 				while ((pplist == NULL) &&
4136 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4137 				    != -1) {
4138 					pplist =
4139 					    page_get_mnode_cachelist(bin, flags,
4140 					    mnode, mtype);
4141 				}
4142 				if (pplist != NULL) {
4143 					page_hashout(pplist, NULL);
4144 					PP_SETAGED(pplist);
4145 					REPL_STAT_INCR(nhashout);
4146 					break;
4147 				}
4148 				/* Done looking in this lgroup. Bail out. */
4149 				break;
4150 			}
4151 
4152 			/*
4153 			 * No lgroup was specified (or the lgroup was removed
4154 			 * by DR), so just try to get the page as close to
4155 			 * like_pp's mnode as possible.
4156 			 * First try the local freelist...
4157 			 */
4158 			mnode = PP_2_MEM_NODE(like_pp);
4159 			pplist = page_get_mnode_freelist(mnode, bin,
4160 			    mtype, szc, flags);
4161 			if (pplist != NULL)
4162 				break;
4163 
4164 			REPL_STAT_INCR(nnofree);
4165 
4166 			/*
4167 			 * ...then the local cachelist. Don't need to do it for
4168 			 * larger pages because page_freelist_coalesce() already
4169 			 * failed there anyway.
4170 			 */
4171 			if (szc == 0) {
4172 				pplist = page_get_mnode_cachelist(bin, flags,
4173 				    mnode, mtype);
4174 				if (pplist != NULL) {
4175 					page_hashout(pplist, NULL);
4176 					PP_SETAGED(pplist);
4177 					REPL_STAT_INCR(nhashout);
4178 					break;
4179 				}
4180 			}
4181 
4182 			/* Now try remote freelists */
4183 			page_mnode = mnode;
4184 			lgrp =
4185 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4186 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4187 			    LGRP_SRCH_HIER);
4188 			while (pplist == NULL &&
4189 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4190 			    != -1) {
4191 				/*
4192 				 * Skip local mnode.
4193 				 */
4194 				if ((mnode == page_mnode) ||
4195 				    (mem_node_config[mnode].exists == 0))
4196 					continue;
4197 
4198 				pplist = page_get_mnode_freelist(mnode,
4199 				    bin, mtype, szc, flags);
4200 			}
4201 
4202 			if (pplist != NULL)
4203 				break;
4204 
4205 
4206 			/* Now try remote cachelists */
4207 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4208 			    LGRP_SRCH_HIER);
4209 			while (pplist == NULL && szc == 0) {
4210 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4211 				if (mnode == -1)
4212 					break;
4213 				/*
4214 				 * Skip local mnode.
4215 				 */
4216 				if ((mnode == page_mnode) ||
4217 				    (mem_node_config[mnode].exists == 0))
4218 					continue;
4219 
4220 				pplist = page_get_mnode_cachelist(bin,
4221 				    flags, mnode, mtype);
4222 
4223 				if (pplist != NULL) {
4224 					page_hashout(pplist, NULL);
4225 					PP_SETAGED(pplist);
4226 					REPL_STAT_INCR(nhashout);
4227 					break;
4228 				}
4229 			}
4230 
4231 			/*
4232 			 * Break out of the for (;;) loop in the following cases:
4233 			 * - If we successfully got a page.
4234 			 * - If pgrflags specified only returning a specific
4235 			 *   page size and we could not find that page size.
4236 			 * - If we could not satisfy the request with PAGESIZE
4237 			 *   or larger pages.
4238 			 */
4239 			if (pplist != NULL || szc == 0)
4240 				break;
4241 
4242 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4243 				/* try to find contig page */
4244 
4245 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4246 				    LGRP_SRCH_HIER);
4247 
4248 				while ((pplist == NULL) &&
4249 				    (mnode =
4250 				    lgrp_memnode_choose(&lgrp_cookie))
4251 				    != -1) {
4252 					pplist = page_get_contig_pages(
4253 					    mnode, bin, mtype, szc,
4254 					    flags | PGI_PGCPHIPRI);
4255 				}
4256 				break;
4257 			}
4258 
4259 			/*
4260 			 * The correct thing to do here is try the next
4261 			 * page size down using szc--. Due to a bug
4262 			 * with the processing of HAT_RELOAD_SHARE
4263 			 * where the sfmmu_ttecnt arrays of all
4264 			 * hats sharing an ISM segment don't get updated,
4265 			 * using intermediate size pages for relocation
4266 			 * can lead to continuous page faults.
4267 			 */
4268 			szc = 0;
4269 		}
4270 
4271 		if (pplist != NULL) {
4272 			DTRACE_PROBE4(page__get,
4273 			    lgrp_t *, lgrp,
4274 			    int, mnode,
4275 			    ulong_t, bin,
4276 			    uint_t, flags);
4277 
4278 			while (pplist != NULL && pg_cnt--) {
4279 				ASSERT(pplist != NULL);
4280 				pp = pplist;
4281 				page_sub(&pplist, pp);
4282 				PP_CLRFREE(pp);
4283 				PP_CLRAGED(pp);
4284 				page_list_concat(&pl, &pp);
4285 				npgs--;
4286 				like_pp = like_pp + 1;
4287 				REPL_STAT_INCR(nnext_pp);
4288 			}
4289 			ASSERT(pg_cnt == 0);
4290 		} else {
4291 			break;
4292 		}
4293 	}
4294 
4295 	if (npgs) {
4296 		/*
4297 		 * We were unable to allocate the necessary number
4298 		 * of pages.
4299 		 * We need to free up any pl.
4300 		 */
4301 		REPL_STAT_INCR(nnopage);
4302 		page_free_replacement_page(pl);
4303 		return (NULL);
4304 	} else {
4305 		return (pl);
4306 	}
4307 }
4308 
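/*
 * For illustration only: a relocation-style caller holds the original
 * page locked SE_EXCL, asks for replacements, and releases them again
 * with page_free_replacement_page() if it cannot proceed.  do_relocate()
 * and the ENOMEM return are placeholders for the caller's own relocation
 * step (e.g. page_relocate()) and failure handling.
 *
 *	ASSERT(PAGE_EXCL(pp));
 *	repl = page_get_replacement_page(pp, NULL, 0);
 *	if (repl == NULL)
 *		return (ENOMEM);
 *	if (do_relocate(pp, repl) != 0)
 *		page_free_replacement_page(repl);
 */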
4309 /*
4310  * Demote a free large page to its constituent pages.
4311  */
4312 void
4313 page_demote_free_pages(page_t *pp)
4314 {
4315 
4316 	int mnode;
4317 
4318 	ASSERT(pp != NULL);
4319 	ASSERT(PAGE_LOCKED(pp));
4320 	ASSERT(PP_ISFREE(pp));
4321 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4322 
4323 	mnode = PP_2_MEM_NODE(pp);
4324 	page_freelist_lock(mnode);
4325 	if (pp->p_szc != 0) {
4326 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4327 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4328 	}
4329 	page_freelist_unlock(mnode);
4330 	ASSERT(pp->p_szc == 0);
4331 }
4332 
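/*
 * For illustration only: a caller holding a locked free large page can
 * break it up in place and then deal in base pages.
 *
 *	ASSERT(PAGE_LOCKED(pp) && PP_ISFREE(pp));
 *	if (pp->p_szc != 0)
 *		page_demote_free_pages(pp);
 *	ASSERT(pp->p_szc == 0);
 */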
4333 /*
4334  * Factor in colorequiv to check additional 'equivalent' bins.
4335  * colorequiv may be set in /etc/system
4336  */
4337 void
4338 page_set_colorequiv_arr(void)
4339 {
4340 	if (colorequiv > 1) {
4341 		int i;
4342 		uint_t sv_a = lowbit(colorequiv) - 1;
4343 
4344 		if (sv_a > 15)
4345 			sv_a = 15;
4346 
4347 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4348 			uint_t colors;
4349 			uint_t a = sv_a;
4350 
4351 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4352 				continue;
4353 			}
4354 			while ((colors >> a) == 0)
4355 				a--;
4356 			if ((a << 4) > colorequivszc[i]) {
4357 				colorequivszc[i] = (a << 4);
4358 			}
4359 		}
4360 	}
4361 }
4362
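/*
 * For illustration only: colorequiv is normally tuned from /etc/system
 * and folded into colorequivszc[] by page_set_colorequiv_arr(), typically
 * during startup once the cpu cache geometry is known.  For example,
 *
 *	set colorequiv = 2
 *
 * asks for one color bit to be ignored when matching bins, halving the
 * number of distinct color classes for each page size that has more than
 * one color.
 */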