xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision d670ce0b8f4bf35907a3b851264a57e04d74d22d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 
35 /*
36  * This file contains common functions to access and manage the page lists.
37  * Many of these routines originated from platform dependent modules
38  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
39  * a platform independent manner.
40  *
41  * vm/vm_dep.h provides for platform specific support.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/debug.h>
46 #include <sys/cmn_err.h>
47 #include <sys/systm.h>
48 #include <sys/atomic.h>
49 #include <sys/sysmacros.h>
50 #include <vm/as.h>
51 #include <vm/page.h>
52 #include <vm/seg_kmem.h>
53 #include <vm/seg_vn.h>
54 #include <sys/vmsystm.h>
55 #include <sys/memnode.h>
56 #include <vm/vm_dep.h>
57 #include <sys/lgrp.h>
58 #include <sys/mem_config.h>
59 #include <sys/callb.h>
60 #include <sys/mem_cage.h>
61 #include <sys/sdt.h>
62 
63 extern uint_t	vac_colors;
64 
65 #define	MAX_PRAGMA_ALIGN	128
66 
67 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
68 
69 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
70 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
71 #else
72 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
73 #endif
74 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
75 
76 /*
77  * number of page colors equivalent to requested color in page_get routines.
78  * If set, keeps large pages intact longer and keeps MPO allocation
79  * from the local mnode in favor of acquiring the 'correct' page color from
80  * a demoted large page or from a remote mnode.
81  */
82 uint_t	colorequiv;
83 
84 /*
85  * color equivalency mask for each page size.
86  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
87  * High 4 bits determine the number of high order bits of the color to ignore.
88  * Low 4 bits determines number of low order bits of color to ignore (it's only
89  * relevant for hashed index based page coloring).
90  */
91 uchar_t colorequivszc[MMU_PAGE_SIZES];
92 
93 /*
94  * if set, specifies the percentage of large pages that are free from within
95  * a large page region before attempting to lock those pages for
96  * page_get_contig_pages processing.
97  *
98  * Should be turned on when kpr is available when page_trylock_contig_pages
99  * can be more selective.
100  */
101 
102 int	ptcpthreshold;
103 
104 /*
105  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
106  * Enabled by default via pgcplimitsearch.
107  *
108  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
109  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
110  * bound. This upper bound range guarantees:
111  *    - all large page 'slots' will be searched over time
112  *    - the minimum (1) large page candidates considered on each pgcp call
113  *    - count doesn't wrap around to 0
114  */
115 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
116 int	pgcplimitsearch = 1;
117 
/*
 * Upper bound for pgcpfailcnt[]: the largest power of two that does not
 * exceed physinstalled.  The shift is carried out in pgcnt_t width rather
 * than int so that the result stays well defined when
 * highbit(physinstalled) exceeds 31, i.e. once 2^31 or more pages are
 * installed.
 */
#define	PGCPFAILMAX							\
	((pgcnt_t)1 << (highbit(physinstalled) - 1))

/*
 * Bump the failure count for szc.  On reaching PGCPFAILMAX the count is
 * reset to half of that bound.  The expansion deliberately keeps its
 * trailing ';' exactly as before so that existing call sites, with or
 * without their own ';', continue to compile unchanged.
 */
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[(szc)] >= PGCPFAILMAX)			\
		pgcpfailcnt[(szc)] = PGCPFAILMAX / 2;
122 
123 #ifdef VM_STATS
124 struct vmm_vmstats_str  vmm_vmstats;
125 
126 #endif /* VM_STATS */
127 
128 #if defined(__sparc)
129 #define	LPGCREATE	0
130 #else
131 /* enable page_get_contig_pages */
132 #define	LPGCREATE	1
133 #endif
134 
135 int pg_contig_disable;
136 int pg_lpgcreate_nocage = LPGCREATE;
137 
138 /*
139  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
140  */
141 #define	PFNNULL		0
142 
143 /* Flags involved in promotion and demotion routines */
144 #define	PC_FREE		0x1	/* put page on freelist */
145 #define	PC_ALLOC	0x2	/* return page for allocation */
146 
147 /*
148  * Flag for page_demote to be used with PC_FREE to denote that we don't care
149  * what the color is as the color parameter to the function is ignored.
150  */
151 #define	PC_NO_COLOR	(-1)
152 
153 /* mtype value for page_promote to use when mtype does not matter */
154 #define	PC_MTYPE_ANY	(-1)
155 
156 /*
157  * page counters candidates info
158  * See page_ctrs_cands comment below for more details.
159  * fields are as follows:
160  *	pcc_pages_free:		# pages which freelist coalesce can create
161  *	pcc_color_free:		pointer to page free counts per color
162  */
163 typedef struct pcc_info {
	/* number of pages which freelist coalesce can create */
164 	pgcnt_t	pcc_pages_free;
	/* points at an array of per-color free page counts */
165 	pgcnt_t	*pcc_color_free;
	/*
	 * Not referenced by the code visible here; presumably padding to
	 * keep neighbouring entries apart -- TODO confirm intent.
	 */
166 	uint_t	pad[12];
167 } pcc_info_t;
168 
169 /*
170  * On big machines it can take a long time to check page_counters
171  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
172  * updated sum of all elements of the corresponding page_counters arrays.
173  * page_freelist_coalesce() searches page_counters only if an appropriate
174  * element of page_ctrs_cands array is greater than 0.
175  *
176  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
177  */
178 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
179 
/*
 * Store in val the number of free pages that could be created for
 * mnode (m), mrange (g) and region size (r), obtained by adding up the
 * pcc_pages_free entry of every one of the NPC_MUTEX summary arrays.
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i = 0;							\
	val = 0;							\
	while (i < NPC_MUTEX) {						\
		val +=							\
		    page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
		i++;							\
	}								\
}
191 
/*
 * Store in val the number of free pages of color (c) that could be
 * created for mnode (m), mrange (g) and region size (r), obtained by
 * adding up the matching pcc_color_free[] slot of every one of the
 * NPC_MUTEX summary arrays.  The color must be valid for region size r.
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i = 0;							\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	while (i < NPC_MUTEX) {						\
		val += page_ctrs_cands[i][(r)][(m)][(g)].		\
		    pcc_color_free[(c)];				\
		i++;							\
	}								\
}
205 
206 /*
207  * We can only allow a single thread to update a counter within the physical
208  * range of the largest supported page size. That is the finest granularity
209  * possible since the counter values are dependent on each other
210  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
211  * ctr_mutex lock index for a particular physical range.
212  */
213 static kmutex_t	*ctr_mutex[NPC_MUTEX];
214 
215 #define	PP_CTR_LOCK_INDX(pp)						\
216 	(((pp)->p_pagenum >>						\
217 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
218 
219 #define	INVALID_COLOR 0xffffffff
220 #define	INVALID_MASK  0xffffffff
221 
222 /*
223  * Local functions prototypes.
224  */
225 
226 void page_ctr_add(int, int, page_t *, int);
227 void page_ctr_add_internal(int, int, page_t *, int);
228 void page_ctr_sub(int, int, page_t *, int);
229 void page_ctr_sub_internal(int, int, page_t *, int);
230 void page_freelist_lock(int);
231 void page_freelist_unlock(int);
232 page_t *page_promote(int, pfn_t, uchar_t, int, int);
233 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
234 page_t *page_freelist_split(uchar_t,
235     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
236 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
237 static int page_trylock_cons(page_t *pp, se_t se);
238 
239 /*
240  * The page_counters array below is used to keep track of free contiguous
241  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
242  * This contains an array of counters, the size of the array, a shift value
243  * used to convert a pagenum into a counter array index or vice versa, as
244  * well as a cache of the last successful index to be promoted to a larger
245  * page size.  As an optimization, we keep track of the last successful index
246  * to be promoted per page color for the given size region, and this is
247  * allocated dynamically based upon the number of colors for a given
248  * region size.
249  *
250  * Conceptually, the page counters are represented as:
251  *
252  *	page_counters[region_size][mnode]
253  *
254  *	region_size:	size code of a candidate larger page made up
255  *			of contiguous free smaller pages.
256  *
257  *	page_counters[region_size][mnode].hpm_counters[index]:
258  *		represents how many (region_size - 1) pages either
259  *		exist or can be created within the given index range.
260  *
261  * Let's look at a sparc example:
262  *	If we want to create a free 512k page, we look at region_size 2
263  *	for the mnode we want.  We calculate the index and look at a specific
264  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
265  *	this location, it means that 8 64k pages either exist or can be created
266  *	from 8K pages in order to make a single free 512k page at the given
267  *	index.  Note that when a region is full, it will contribute to the
268  *	counts in the region above it.  Thus we will not know what page
269  *	size the free pages will be which can be promoted to this new free
270  *	page unless we look at all regions below the current region.
271  */
272 
273 /*
274  * Note: hpmctr_t is defined in platform vm_dep.h
275  * hw_page_map_t contains all the information needed for the page_counters
276  * logic. The fields are as follows:
277  *
278  *	hpm_counters:	dynamically allocated array to hold counter data
279  *	hpm_entries:	entries in hpm_counters
280  *	hpm_shift:	shift for pnum/array index conv
281  *	hpm_base:	PFN mapped to counter index 0
282  *	hpm_color_current:	last index in counter array for this color at
283  *				which we successfully created a large page
284  */
285 typedef struct hw_page_map {
	/* dynamically allocated array holding the counter data */
286 	hpmctr_t	*hpm_counters;
	/* number of entries in hpm_counters */
287 	size_t		hpm_entries;
	/* shift used to convert between a pagenum and a counter index */
288 	int		hpm_shift;
	/* PFN that maps to counter index 0 */
289 	pfn_t		hpm_base;
	/*
	 * One array per mrange, indexed by color: last counter index at
	 * which a large page of that color was successfully created.
	 */
290 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
291 #if defined(__sparc)
	/* sparc only; presumably padding -- TODO confirm purpose */
292 	uint_t		pad[4];
293 #endif
294 } hw_page_map_t;
295 
296 /*
297  * Element zero is not used, but is allocated for convenience.
298  */
299 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
300 
301 /*
302  * Cached value of MNODE_RANGE_CNT(mnode).
303  * This is a function call in x86.
304  */
305 static int mnode_nranges[MAX_MEM_NODES];
306 static int mnode_maxmrange[MAX_MEM_NODES];
307 
/*
 * Accessors for the individual fields of page_counters[rg_szc][mnode].
 * Every field accessor expands to an lvalue and may therefore be used on
 * either side of an assignment.
 */

/* whole-field accessors */
#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)				\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)				\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)				\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)				\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

/* element accessors, layered on the field accessors above */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)				\
	(PAGE_COUNTERS_COUNTERS((mnode), (rg_szc))[(idx)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(PAGE_COUNTERS_CURRENT_COLOR_ARRAY((mnode), (rg_szc),		\
	    (mrange))[(color)])

/* conversions between a page number and a counter array index */
#define	PNUM_TO_IDX(mnode, rg_szc, pnum)				\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>		\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)				\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +			\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
342 
343 /*
344  * Protects the hpm_counters and hpm_color_current memory from changing while
345  * looking at page counters information.
346  * Grab the write lock to modify what these fields point at.
347  * Grab the read lock to prevent any pointers from changing.
348  * The write lock can not be held during memory allocation due to a possible
349  * recursion deadlock with trying to grab the read lock while the
350  * write lock is already held.
351  */
352 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
353 
354 
355 /*
356  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
357  */
358 void
359 cpu_vm_data_init(struct cpu *cp)
360 {
361 	if (cp == CPU0) {
362 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
363 	} else {
364 		void	*kmptr;
365 		int	align;
366 		size_t	sz;
367 
368 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
369 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
370 		kmptr = kmem_zalloc(sz, KM_SLEEP);
371 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
372 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
373 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
374 	}
375 }
376 
377 /*
378  * free cpu_vm_data
379  */
380 void
381 cpu_vm_data_destroy(struct cpu *cp)
382 {
383 	if (cp->cpu_seqid && cp->cpu_vm_data) {
384 		ASSERT(cp != CPU0);
385 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
386 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
387 	}
388 	cp->cpu_vm_data = NULL;
389 }
390 
391 
392 /*
393  * page size to page size code
394  */
395 int
396 page_szc(size_t pagesize)
397 {
398 	int	i = 0;
399 
400 	while (hw_page_array[i].hp_size) {
401 		if (pagesize == hw_page_array[i].hp_size)
402 			return (i);
403 		i++;
404 	}
405 	return (-1);
406 }
407 
/*
 * Translate a page size in bytes into its page size code, accepting only
 * supported user page sizes.  Returns -1 when the size is unknown to
 * page_szc() or when SZC_2_USERSZC() reports -1 for the resulting code.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);

	if (szc == -1 || SZC_2_USERSZC(szc) == -1)
		return (-1);
	return (szc);
}
421 
422 /*
423  * Return how many page sizes are available for the user to use.  This is
424  * what the hardware supports and not based upon how the OS implements the
425  * support of different page sizes.
426  *
427  * If legacy is non-zero, return the number of pagesizes available to legacy
428  * applications. The number of legacy page sizes might be less than the
429  * exported user page sizes. This is to prevent legacy applications that
430  * use the largest page size returned from getpagesizes(3c) from inadvertantly
431  * using the 'new' large pagesizes.
432  */
433 uint_t
434 page_num_user_pagesizes(int legacy)
435 {
436 	if (legacy)
437 		return (mmu_legacy_page_sizes);
438 	return (mmu_exported_page_sizes);
439 }
440 
441 uint_t
442 page_num_pagesizes(void)
443 {
444 	return (mmu_page_sizes);
445 }
446 
447 /*
448  * returns the count of the number of base pagesize pages associated with szc
449  */
450 pgcnt_t
451 page_get_pagecnt(uint_t szc)
452 {
453 	if (szc >= mmu_page_sizes)
454 		panic("page_get_pagecnt: out of range %d", szc);
455 	return (hw_page_array[szc].hp_pgcnt);
456 }
457 
458 size_t
459 page_get_pagesize(uint_t szc)
460 {
461 	if (szc >= mmu_page_sizes)
462 		panic("page_get_pagesize: out of range %d", szc);
463 	return (hw_page_array[szc].hp_size);
464 }
465 
466 /*
467  * Return the size of a page based upon the index passed in.  An index of
468  * zero refers to the smallest page size in the system, and as index increases
469  * it refers to the next larger supported page size in the system.
470  * Note that szc and userszc may not be the same due to unsupported szc's on
471  * some systems.
472  */
473 size_t
474 page_get_user_pagesize(uint_t userszc)
475 {
476 	uint_t szc = USERSZC_2_SZC(userszc);
477 
478 	if (szc >= mmu_page_sizes)
479 		panic("page_get_user_pagesize: out of range %d", szc);
480 	return (hw_page_array[szc].hp_size);
481 }
482 
483 uint_t
484 page_get_shift(uint_t szc)
485 {
486 	if (szc >= mmu_page_sizes)
487 		panic("page_get_shift: out of range %d", szc);
488 	return (PAGE_GET_SHIFT(szc));
489 }
490 
491 uint_t
492 page_get_pagecolors(uint_t szc)
493 {
494 	if (szc >= mmu_page_sizes)
495 		panic("page_get_pagecolors: out of range %d", szc);
496 	return (PAGE_GET_PAGECOLORS(szc));
497 }
498 
499 /*
500  * this assigns the desired equivalent color after a split
501  */
502 uint_t
503 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
504     uint_t ncolor, uint_t ceq_mask)
505 {
506 	ASSERT(nszc > szc);
507 	ASSERT(szc < mmu_page_sizes);
508 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
509 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
510 
511 	color &= ceq_mask;
512 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
513 	return (color | (ncolor & ~ceq_mask));
514 }
515 
516 /*
517  * The interleaved_mnodes flag is set when mnodes overlap in
518  * the physbase..physmax range, but have disjoint slices.
519  * In this case hpm_counters is shared by all mnodes.
520  * This flag is set dynamically by the platform.
521  */
522 int interleaved_mnodes = 0;
523 
524 /*
525  * Called by startup().
526  * Size up the per page size free list counters based on physmax
527  * of each node and max_mem_nodes.
528  *
529  * If interleaved_mnodes is set we need to find the first mnode that
530  * exists. hpm_counters for the first mnode will then be shared by
531  * all other mnodes. If interleaved_mnodes is not set, just set
532  * first=mnode each time. That means there will be no sharing.
533  */
534 size_t
535 page_ctrs_sz(void)
536 {
537 	int	r;		/* region size */
538 	int	mnode;
539 	int	firstmn;	/* first mnode that exists */
540 	int	nranges;
541 	pfn_t	physbase;
542 	pfn_t	physmax;
543 	uint_t	ctrs_sz = 0;
544 	int 	i;
545 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
546 
547 	/*
548 	 * We need to determine how many page colors there are for each
549 	 * page size in order to allocate memory for any color specific
550 	 * arrays.
551 	 */
552 	for (i = 0; i < mmu_page_sizes; i++) {
553 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
554 	}
555 
556 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
557 
558 		pgcnt_t r_pgcnt;
559 		pfn_t   r_base;
560 		pgcnt_t r_align;
561 
562 		if (mem_node_config[mnode].exists == 0)
563 			continue;
564 
565 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
566 		nranges = MNODE_RANGE_CNT(mnode);
567 		mnode_nranges[mnode] = nranges;
568 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
569 
570 		/*
571 		 * determine size needed for page counter arrays with
572 		 * base aligned to large page size.
573 		 */
574 		for (r = 1; r < mmu_page_sizes; r++) {
575 			/* add in space for hpm_color_current */
576 			ctrs_sz += sizeof (size_t) *
577 			    colors_per_szc[r] * nranges;
578 
579 			if (firstmn != mnode)
580 				continue;
581 
582 			/* add in space for hpm_counters */
583 			r_align = page_get_pagecnt(r);
584 			r_base = physbase;
585 			r_base &= ~(r_align - 1);
586 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
587 
588 			/*
589 			 * Round up to always allocate on pointer sized
590 			 * boundaries.
591 			 */
592 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
593 			    sizeof (hpmctr_t *));
594 		}
595 	}
596 
597 	for (r = 1; r < mmu_page_sizes; r++) {
598 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
599 	}
600 
601 	/* add in space for page_ctrs_cands and pcc_color_free */
602 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
603 	    mmu_page_sizes * NPC_MUTEX;
604 
605 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
606 
607 		if (mem_node_config[mnode].exists == 0)
608 			continue;
609 
610 		nranges = mnode_nranges[mnode];
611 		ctrs_sz += sizeof (pcc_info_t) * nranges *
612 		    mmu_page_sizes * NPC_MUTEX;
613 		for (r = 1; r < mmu_page_sizes; r++) {
614 			ctrs_sz += sizeof (pgcnt_t) * nranges *
615 			    colors_per_szc[r] * NPC_MUTEX;
616 		}
617 	}
618 
619 	/* ctr_mutex */
620 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
621 
622 	/* size for page list counts */
623 	PLCNT_SZ(ctrs_sz);
624 
625 	/*
626 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
627 	 * address of the counters to ecache_alignsize boundary for every
628 	 * memory node.
629 	 */
630 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
631 }
632 
/*
 * Carve the page counter data structures out of the memory starting at
 * alloc_base and return the address just past what was consumed.
 *
 * The memory is handed out in this order: the page_counters[r] arrays,
 * the page_ctrs_cands pointer arrays together with their pcc_info_t and
 * pcc_color_free arrays, the ctr_mutex arrays, whatever PLCNT_INIT()
 * takes, and finally, per existing mnode, the hpm_color_current arrays
 * and (for the first mnode only when counters are shared) hpm_counters.
 * This has to stay in step with the sizing done in page_ctrs_sz(), which
 * also fills in the mnode_nranges[] values read here.
 *
 * NOTE(review): nothing here zeroes the carved memory; presumably the
 * caller supplies zeroed memory -- TODO confirm.
 */
633 caddr_t
634 page_ctrs_alloc(caddr_t alloc_base)
635 {
636 	int	mnode;
637 	int	mrange, nranges;
638 	int	r;		/* region size */
639 	int	i;
640 	int	firstmn;	/* first mnode that exists */
641 	pfn_t	physbase;
642 	pfn_t	physmax;
643 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
644 
645 	/*
646 	 * We need to determine how many page colors there are for each
647 	 * page size in order to allocate memory for any color specific
648 	 * arrays.
649 	 */
650 	for (i = 0; i < mmu_page_sizes; i++) {
651 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
652 	}
653 
	/* one array of max_mem_nodes hw_page_map_t per region size */
654 	for (r = 1; r < mmu_page_sizes; r++) {
655 		page_counters[r] = (hw_page_map_t *)alloc_base;
656 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
657 	}
658 
659 	/* page_ctrs_cands and pcc_color_free array */
660 	for (i = 0; i < NPC_MUTEX; i++) {
661 		for (r = 1; r < mmu_page_sizes; r++) {
662 
			/* per-mnode pointer array for this (mutex, region) */
663 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
664 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
665 
666 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
667 				pcc_info_t *pi;
668 
				/* slots of absent mnodes are left unset */
669 				if (mem_node_config[mnode].exists == 0)
670 					continue;
671 
672 				nranges = mnode_nranges[mnode];
673 
				/* one pcc_info_t per mrange of this mnode */
674 				pi = (pcc_info_t *)alloc_base;
675 				alloc_base += sizeof (pcc_info_t) * nranges;
676 				page_ctrs_cands[i][r][mnode] = pi;
677 
				/* followed by one color array per mrange */
678 				for (mrange = 0; mrange < nranges; mrange++) {
679 					pi->pcc_color_free =
680 					    (pgcnt_t *)alloc_base;
681 					alloc_base += sizeof (pgcnt_t) *
682 					    colors_per_szc[r];
683 					pi++;
684 				}
685 			}
686 		}
687 	}
688 
689 	/* ctr_mutex */
690 	for (i = 0; i < NPC_MUTEX; i++) {
691 		ctr_mutex[i] = (kmutex_t *)alloc_base;
692 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
693 	}
694 
695 	/* initialize page list counts */
696 	PLCNT_INIT(alloc_base);
697 
698 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
699 
700 		pgcnt_t r_pgcnt;
701 		pfn_t	r_base;
702 		pgcnt_t r_align;
703 		int	r_shift;
		/* note: shadows the function-scope nranges */
704 		int	nranges = mnode_nranges[mnode];
705 
706 		if (mem_node_config[mnode].exists == 0)
707 			continue;
708 
		/*
		 * Platform macro; presumably sets physbase/physmax for this
		 * mnode and maintains firstmn -- TODO confirm.
		 */
709 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
710 
711 		for (r = 1; r < mmu_page_sizes; r++) {
712 			/*
713 			 * the page_counters base has to be aligned to the
714 			 * page count of page size code r otherwise the counts
715 			 * will cross large page boundaries.
716 			 */
717 			r_align = page_get_pagecnt(r);
718 			r_base = physbase;
719 			/* base needs to be aligned - lower to aligned value */
720 			r_base &= ~(r_align - 1);
721 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
722 			r_shift = PAGE_BSZS_SHIFT(r);
723 
724 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
725 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
726 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			/* one hpm_color_current array per mrange */
727 			for (mrange = 0; mrange < nranges; mrange++) {
728 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
729 				    r, mrange) = (size_t *)alloc_base;
730 				alloc_base += sizeof (size_t) *
731 				    colors_per_szc[r];
732 			}
			/*
			 * Seed hpm_color_current for every color i; the same
			 * starting index is stored for all mranges.
			 */
733 			for (i = 0; i < colors_per_szc[r]; i++) {
734 				uint_t color_mask = colors_per_szc[r] - 1;
735 				pfn_t  pfnum = r_base;
736 				size_t idx;
				/* note: shadows the function-scope mrange */
737 				int mrange;
738 				MEM_NODE_ITERATOR_DECL(it);
739 
740 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
741 				if (pfnum == (pfn_t)-1) {
742 					idx = 0;
743 				} else {
					/*
					 * Platform macro; presumably advances
					 * pfnum to a pfn of color i -- TODO
					 * confirm.  An index outside the
					 * counter array falls back to 0.
					 */
744 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
745 					    color_mask, color_mask, &it);
746 					idx = PNUM_TO_IDX(mnode, r, pfnum);
747 					idx = (idx >= r_pgcnt) ? 0 : idx;
748 				}
749 				for (mrange = 0; mrange < nranges; mrange++) {
750 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
751 					    r, i, mrange) = idx;
752 				}
753 			}
754 
755 			/* hpm_counters may be shared by all mnodes */
756 			if (firstmn == mnode) {
757 				PAGE_COUNTERS_COUNTERS(mnode, r) =
758 				    (hpmctr_t *)alloc_base;
759 				alloc_base +=
760 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
761 				    sizeof (hpmctr_t *));
762 			} else {
763 				PAGE_COUNTERS_COUNTERS(mnode, r) =
764 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
765 			}
766 
767 			/*
768 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
769 			 * satisfy the identity requirement.
770 			 * We should be able to go from one to the other
771 			 * and get consistent values.
772 			 */
773 			ASSERT(PNUM_TO_IDX(mnode, r,
774 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
775 			ASSERT(IDX_TO_PNUM(mnode, r,
776 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
777 		}
778 		/*
779 		 * Roundup the start address of the page_counters to
780 		 * cache aligned boundary for every memory node.
781 		 * page_ctrs_sz() has added some slop for these roundups.
782 		 */
783 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
784 		    L2CACHE_ALIGN);
785 	}
786 
787 	/* Initialize other page counter specific data structures. */
	/* covers all MAX_MEM_NODES locks, not just max_mem_nodes */
788 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
789 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
790 	}
791 
792 	return (alloc_base);
793 }
794 
795 /*
796  * Functions to adjust region counters for each size free list.
797  * Caller is responsible to acquire the ctr_mutex lock if necessary and
798  * thus can be called during startup without locks.
799  */
800 /* ARGSUSED */
801 void
802 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
803 {
804 	ssize_t		r;	/* region size */
805 	ssize_t		idx;
806 	pfn_t		pfnum;
807 	int		lckidx;
808 
809 	ASSERT(mnode == PP_2_MEM_NODE(pp));
810 	ASSERT(mtype == PP_2_MTYPE(pp));
811 
812 	ASSERT(pp->p_szc < mmu_page_sizes);
813 
814 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
815 
816 	/* no counter update needed for largest page size */
817 	if (pp->p_szc >= mmu_page_sizes - 1) {
818 		return;
819 	}
820 
821 	r = pp->p_szc + 1;
822 	pfnum = pp->p_pagenum;
823 	lckidx = PP_CTR_LOCK_INDX(pp);
824 
825 	/*
826 	 * Increment the count of free pages for the current
827 	 * region. Continue looping up in region size incrementing
828 	 * count if the preceeding region is full.
829 	 */
830 	while (r < mmu_page_sizes) {
831 		idx = PNUM_TO_IDX(mnode, r, pfnum);
832 
833 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
834 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
835 
836 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
837 			break;
838 		} else {
839 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
840 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
841 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
842 
843 			cand->pcc_pages_free++;
844 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
845 		}
846 		r++;
847 	}
848 }
849 
850 void
851 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
852 {
853 	int		lckidx = PP_CTR_LOCK_INDX(pp);
854 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
855 
856 	mutex_enter(lock);
857 	page_ctr_add_internal(mnode, mtype, pp, flags);
858 	mutex_exit(lock);
859 }
860 
861 void
862 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
863 {
864 	int		lckidx;
865 	ssize_t		r;	/* region size */
866 	ssize_t		idx;
867 	pfn_t		pfnum;
868 
869 	ASSERT(mnode == PP_2_MEM_NODE(pp));
870 	ASSERT(mtype == PP_2_MTYPE(pp));
871 
872 	ASSERT(pp->p_szc < mmu_page_sizes);
873 
874 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
875 
876 	/* no counter update needed for largest page size */
877 	if (pp->p_szc >= mmu_page_sizes - 1) {
878 		return;
879 	}
880 
881 	r = pp->p_szc + 1;
882 	pfnum = pp->p_pagenum;
883 	lckidx = PP_CTR_LOCK_INDX(pp);
884 
885 	/*
886 	 * Decrement the count of free pages for the current
887 	 * region. Continue looping up in region size decrementing
888 	 * count if the preceeding region was full.
889 	 */
890 	while (r < mmu_page_sizes) {
891 		idx = PNUM_TO_IDX(mnode, r, pfnum);
892 
893 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
894 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
895 
896 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
897 			break;
898 		} else {
899 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
900 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
901 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
902 
903 			ASSERT(cand->pcc_pages_free != 0);
904 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
905 
906 			cand->pcc_pages_free--;
907 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
908 		}
909 		r++;
910 	}
911 }
912 
913 void
914 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
915 {
916 	int		lckidx = PP_CTR_LOCK_INDX(pp);
917 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
918 
919 	mutex_enter(lock);
920 	page_ctr_sub_internal(mnode, mtype, pp, flags);
921 	mutex_exit(lock);
922 }
923 
924 /*
925  * Adjust page counters following a memory attach, since typically the
926  * size of the array needs to change, and the PFN to counter index
927  * mapping needs to change.
928  *
929  * It is possible this mnode did not exist at startup. In that case
930  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
931  * to change (a theoretical possibility on x86), which means pcc_color_free
932  * arrays must be extended.
933  */
/*
 * Returns 0 on success, or ENOMEM if any of the KM_NOSLEEP preallocations
 * fails; on failure nothing has been installed and whatever was allocated
 * up to that point is released.
 *
 * The body runs in three phases:
 *   1. preallocate every replacement array into the local *_cache arrays;
 *   2. under PAGE_CTRS_WRITE_LOCK, install the new arrays and park the
 *	displaced pointers back in the same *_cache slots;
 *   3. at the cleanup label, free whatever the caches hold (unused
 *	preallocations on failure, displaced arrays on success).
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;
	int oldmnode;

	/*
	 * cands_cache is a flat NPC_MUTEX x MMU_PAGE_SIZES table of
	 * pcc_info_t array pointers, indexed as [i * MMU_PAGE_SIZES + r].
	 */
	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	/*
	 * NOTE(review): HPM_COUNTERS_LIMITS is defined elsewhere; it
	 * presumably fills in physbase/physmax for this mnode, with i as
	 * scratch/iteration state -- confirm against its definition.
	 */
	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	/* new counter window: aligned base, and page count up to aligned end */
	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	/*
	 * Until the swap below, any pcc_info_t array held in cands_cache was
	 * allocated here with nranges entries.
	 */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		/* remember the element count so cleanup can size the free */
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			/*
			 * Record the array before filling it in so that a
			 * later pcc_color_free failure still lets cleanup
			 * find and free the partially populated array.
			 */
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	/*
	 * For interleaved mnodes, find the first mnode
	 * with valid page counters since the current
	 * mnode may have just been added and not have
	 * valid page counters.
	 */
	if (interleaved_mnodes) {
		for (i = 0; i < max_mem_nodes; i++)
			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
				break;
		ASSERT(i < max_mem_nodes);
		oldmnode = i;
	} else
		oldmnode = mnode;

	old_nranges = mnode_nranges[mnode];
	/*
	 * From here on cands_cache will hold the displaced pcc_info_t
	 * arrays, which were sized with the old range count.
	 */
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
		/*
		 * NOTE(review): the old color arrays are read from mnode,
		 * not oldmnode, unlike the counter fields above; presumably
		 * the color arrays are per-mnode rather than shared between
		 * interleaved mnodes -- confirm.
		 */
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		/* take ownership of the preallocated array; slot is reused */
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		/* copy only if the old and new pfn windows actually overlap */
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				/* new window starts inside the old one */
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				/* old window starts at/inside the new one */
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if (i == mnode)
					continue;
				ASSERT(
				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
				if (mem_node_config[i].exists == 0)
					continue;
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		/*
		 * Install the new color arrays. Only the first nranges
		 * slots were preallocated; the remaining slots of
		 * color_cache were zeroed above, so they install NULL.
		 */
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				pfnum = newbase;
				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
				if (pfnum == (pfn_t)-1) {
					/* iterator found no usable pfn */
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(m, r, pfnum);
					/* fall back to 0 if out of range */
					idx = (idx < pcsz) ? idx : 0;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
					    r, mrange) != NULL)
						PAGE_COUNTERS_CURRENT_COLOR(m,
						    r, i, mrange) = idx;
				}
			}
		}

		/* cache info for freeing out of the critical path */
		/*
		 * Only pointers inside [kernelheap, ekernelheap) are queued
		 * for kmem_free. NOTE(review): presumably the arrays set up
		 * at startup live outside the kernel heap and must not be
		 * passed to kmem_free -- confirm.
		 */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			/* install the new array, park the old one for cleanup */
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			/* skip leading entries on whichever side grew */
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			/*
			 * Swap rather than copy: the preserved entries move
			 * into the installed array, and the freshly allocated
			 * entries end up in the displaced array where the
			 * cleanup code below frees them.
			 */
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come thru here to free memory when pre-alloc fails, and also to
	 * free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		/*
		 * NOTE(review): displaced color arrays are freed using the
		 * current colors_per_szc[r]; this assumes the color count
		 * for a page size is the same as when they were allocated
		 * -- confirm.
		 */
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			/*
			 * nr is nranges on the prealloc-failure path and
			 * old_nranges once the swap has taken place.
			 */
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			/* rewind to the array base before freeing the array */
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}
1278 
1279 
1280 #ifdef DEBUG
1281 
1282 /*
1283  * confirm pp is a large page corresponding to szc
1284  */
1285 void
1286 chk_lpg(page_t *pp, uchar_t szc)
1287 {
1288 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1289 	uint_t noreloc;
1290 
1291 	if (npgs == 1) {
1292 		ASSERT(pp->p_szc == 0);
1293 		ASSERT(pp->p_next == pp);
1294 		ASSERT(pp->p_prev == pp);
1295 		return;
1296 	}
1297 
1298 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1299 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1300 
1301 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1302 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1303 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1304 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1305 
1306 	/*
1307 	 * Check list of pages.
1308 	 */
1309 	noreloc = PP_ISNORELOC(pp);
1310 	while (npgs--) {
1311 		if (npgs != 0) {
1312 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1313 			ASSERT(pp->p_next == (pp + 1));
1314 		}
1315 		ASSERT(pp->p_szc == szc);
1316 		ASSERT(PP_ISFREE(pp));
1317 		ASSERT(PP_ISAGED(pp));
1318 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1319 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1320 		ASSERT(pp->p_vnode  == NULL);
1321 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1322 
1323 		pp = pp->p_next;
1324 	}
1325 }
1326 #endif /* DEBUG */
1327 
1328 void
1329 page_freelist_lock(int mnode)
1330 {
1331 	int i;
1332 	for (i = 0; i < NPC_MUTEX; i++) {
1333 		mutex_enter(FPC_MUTEX(mnode, i));
1334 		mutex_enter(CPC_MUTEX(mnode, i));
1335 	}
1336 }
1337 
1338 void
1339 page_freelist_unlock(int mnode)
1340 {
1341 	int i;
1342 	for (i = 0; i < NPC_MUTEX; i++) {
1343 		mutex_exit(FPC_MUTEX(mnode, i));
1344 		mutex_exit(CPC_MUTEX(mnode, i));
1345 	}
1346 }
1347 
1348 /*
1349  * add pp to the specified page list. Defaults to head of the page list
1350  * unless PG_LIST_TAIL is specified.
1351  */
1352 void
1353 page_list_add(page_t *pp, int flags)
1354 {
1355 	page_t		**ppp;
1356 	kmutex_t	*pcm;
1357 	uint_t		bin, mtype;
1358 	int		mnode;
1359 
1360 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1361 	ASSERT(PP_ISFREE(pp));
1362 	ASSERT(!hat_page_is_mapped(pp));
1363 	ASSERT(hat_page_getshare(pp) == 0);
1364 
1365 	/*
1366 	 * Large pages should be freed via page_list_add_pages().
1367 	 */
1368 	ASSERT(pp->p_szc == 0);
1369 
1370 	/*
1371 	 * Don't need to lock the freelist first here
1372 	 * because the page isn't on the freelist yet.
1373 	 * This means p_szc can't change on us.
1374 	 */
1375 
1376 	bin = PP_2_BIN(pp);
1377 	mnode = PP_2_MEM_NODE(pp);
1378 	mtype = PP_2_MTYPE(pp);
1379 
1380 	if (flags & PG_LIST_ISINIT) {
1381 		/*
1382 		 * PG_LIST_ISINIT is set during system startup (ie. single
1383 		 * threaded), add a page to the free list and add to the
1384 		 * the free region counters w/o any locking
1385 		 */
1386 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1387 
1388 		/* inline version of page_add() */
1389 		if (*ppp != NULL) {
1390 			pp->p_next = *ppp;
1391 			pp->p_prev = (*ppp)->p_prev;
1392 			(*ppp)->p_prev = pp;
1393 			pp->p_prev->p_next = pp;
1394 		} else
1395 			*ppp = pp;
1396 
1397 		page_ctr_add_internal(mnode, mtype, pp, flags);
1398 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1399 	} else {
1400 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1401 
1402 		if (flags & PG_FREE_LIST) {
1403 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1404 			ASSERT(PP_ISAGED(pp));
1405 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1406 
1407 		} else {
1408 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1409 			ASSERT(pp->p_vnode);
1410 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1411 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1412 		}
1413 		mutex_enter(pcm);
1414 		page_add(ppp, pp);
1415 
1416 		if (flags & PG_LIST_TAIL)
1417 			*ppp = (*ppp)->p_next;
1418 		/*
1419 		 * Add counters before releasing pcm mutex to avoid a race with
1420 		 * page_freelist_coalesce and page_freelist_split.
1421 		 */
1422 		page_ctr_add(mnode, mtype, pp, flags);
1423 		mutex_exit(pcm);
1424 	}
1425 
1426 
1427 #if defined(__sparc)
1428 	if (PP_ISNORELOC(pp)) {
1429 		kcage_freemem_add(1);
1430 	}
1431 #endif
1432 	/*
1433 	 * It is up to the caller to unlock the page!
1434 	 */
1435 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1436 }
1437 
1438 
1439 #ifdef __sparc
1440 /*
1441  * This routine is only used by kcage_init during system startup.
1442  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1443  * without the overhead of taking locks and updating counters.
1444  */
1445 void
1446 page_list_noreloc_startup(page_t *pp)
1447 {
1448 	page_t		**ppp;
1449 	uint_t		bin;
1450 	int		mnode;
1451 	int		mtype;
1452 	int		flags = 0;
1453 
1454 	/*
1455 	 * If this is a large page on the freelist then
1456 	 * break it up into smaller pages.
1457 	 */
1458 	if (pp->p_szc != 0)
1459 		page_boot_demote(pp);
1460 
1461 	/*
1462 	 * Get list page is currently on.
1463 	 */
1464 	bin = PP_2_BIN(pp);
1465 	mnode = PP_2_MEM_NODE(pp);
1466 	mtype = PP_2_MTYPE(pp);
1467 	ASSERT(mtype == MTYPE_RELOC);
1468 	ASSERT(pp->p_szc == 0);
1469 
1470 	if (PP_ISAGED(pp)) {
1471 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1472 		flags |= PG_FREE_LIST;
1473 	} else {
1474 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1475 		flags |= PG_CACHE_LIST;
1476 	}
1477 
1478 	ASSERT(*ppp != NULL);
1479 
1480 	/*
1481 	 * Delete page from current list.
1482 	 */
1483 	if (*ppp == pp)
1484 		*ppp = pp->p_next;		/* go to next page */
1485 	if (*ppp == pp) {
1486 		*ppp = NULL;			/* page list is gone */
1487 	} else {
1488 		pp->p_prev->p_next = pp->p_next;
1489 		pp->p_next->p_prev = pp->p_prev;
1490 	}
1491 
1492 	/*
1493 	 * Decrement page counters
1494 	 */
1495 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1496 
1497 	/*
1498 	 * Set no reloc for cage initted pages.
1499 	 */
1500 	PP_SETNORELOC(pp);
1501 
1502 	mtype = PP_2_MTYPE(pp);
1503 	ASSERT(mtype == MTYPE_NORELOC);
1504 
1505 	/*
1506 	 * Get new list for page.
1507 	 */
1508 	if (PP_ISAGED(pp)) {
1509 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1510 	} else {
1511 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1512 	}
1513 
1514 	/*
1515 	 * Insert page on new list.
1516 	 */
1517 	if (*ppp == NULL) {
1518 		*ppp = pp;
1519 		pp->p_next = pp->p_prev = pp;
1520 	} else {
1521 		pp->p_next = *ppp;
1522 		pp->p_prev = (*ppp)->p_prev;
1523 		(*ppp)->p_prev = pp;
1524 		pp->p_prev->p_next = pp;
1525 	}
1526 
1527 	/*
1528 	 * Increment page counters
1529 	 */
1530 	page_ctr_add_internal(mnode, mtype, pp, flags);
1531 
1532 	/*
1533 	 * Update cage freemem counter
1534 	 */
1535 	atomic_add_long(&kcage_freemem, 1);
1536 }
1537 #else	/* __sparc */
1538 
/*
 * Non-sparc stub of page_list_noreloc_startup(): the argument is ignored
 * and the routine panics unconditionally, since only the sparc build
 * provides a real implementation.
 */
/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
1545 #endif
1546 
1547 void
1548 page_list_add_pages(page_t *pp, int flags)
1549 {
1550 	kmutex_t *pcm;
1551 	pgcnt_t	pgcnt;
1552 	uint_t	bin, mtype, i;
1553 	int	mnode;
1554 
1555 	/* default to freelist/head */
1556 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1557 
1558 	CHK_LPG(pp, pp->p_szc);
1559 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1560 
1561 	bin = PP_2_BIN(pp);
1562 	mnode = PP_2_MEM_NODE(pp);
1563 	mtype = PP_2_MTYPE(pp);
1564 
1565 	if (flags & PG_LIST_ISINIT) {
1566 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1567 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1568 		ASSERT(!PP_ISNORELOC(pp));
1569 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1570 	} else {
1571 
1572 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1573 
1574 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1575 
1576 		mutex_enter(pcm);
1577 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1578 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1579 		mutex_exit(pcm);
1580 
1581 		pgcnt = page_get_pagecnt(pp->p_szc);
1582 #if defined(__sparc)
1583 		if (PP_ISNORELOC(pp))
1584 			kcage_freemem_add(pgcnt);
1585 #endif
1586 		for (i = 0; i < pgcnt; i++, pp++)
1587 			page_unlock_nocapture(pp);
1588 	}
1589 }
1590 
1591 /*
1592  * During boot, need to demote a large page to base
1593  * pagesize pages for seg_kmem for use in boot_alloc()
1594  */
1595 void
1596 page_boot_demote(page_t *pp)
1597 {
1598 	ASSERT(pp->p_szc != 0);
1599 	ASSERT(PP_ISFREE(pp));
1600 	ASSERT(PP_ISAGED(pp));
1601 
1602 	(void) page_demote(PP_2_MEM_NODE(pp),
1603 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1604 	    PC_FREE);
1605 
1606 	ASSERT(PP_ISFREE(pp));
1607 	ASSERT(PP_ISAGED(pp));
1608 	ASSERT(pp->p_szc == 0);
1609 }
1610 
1611 /*
1612  * Take a particular page off of whatever freelist the page
1613  * is claimed to be on.
1614  *
1615  * NOTE: Only used for PAGESIZE pages.
1616  */
1617 void
1618 page_list_sub(page_t *pp, int flags)
1619 {
1620 	int		bin;
1621 	uint_t		mtype;
1622 	int		mnode;
1623 	kmutex_t	*pcm;
1624 	page_t		**ppp;
1625 
1626 	ASSERT(PAGE_EXCL(pp));
1627 	ASSERT(PP_ISFREE(pp));
1628 
1629 	/*
1630 	 * The p_szc field can only be changed by page_promote()
1631 	 * and page_demote(). Only free pages can be promoted and
1632 	 * demoted and the free list MUST be locked during these
1633 	 * operations. So to prevent a race in page_list_sub()
1634 	 * between computing which bin of the freelist lock to
1635 	 * grab and actually grabing the lock we check again that
1636 	 * the bin we locked is still the correct one. Notice that
1637 	 * the p_szc field could have actually changed on us but
1638 	 * if the bin happens to still be the same we are safe.
1639 	 */
1640 try_again:
1641 	bin = PP_2_BIN(pp);
1642 	mnode = PP_2_MEM_NODE(pp);
1643 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1644 	mutex_enter(pcm);
1645 	if (PP_2_BIN(pp) != bin) {
1646 		mutex_exit(pcm);
1647 		goto try_again;
1648 	}
1649 	mtype = PP_2_MTYPE(pp);
1650 
1651 	if (flags & PG_FREE_LIST) {
1652 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1653 		ASSERT(PP_ISAGED(pp));
1654 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1655 	} else {
1656 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1657 		ASSERT(!PP_ISAGED(pp));
1658 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1659 	}
1660 
1661 	/*
1662 	 * Common PAGESIZE case.
1663 	 *
1664 	 * Note that we locked the freelist. This prevents
1665 	 * any page promotion/demotion operations. Therefore
1666 	 * the p_szc will not change until we drop pcm mutex.
1667 	 */
1668 	if (pp->p_szc == 0) {
1669 		page_sub(ppp, pp);
1670 		/*
1671 		 * Subtract counters before releasing pcm mutex
1672 		 * to avoid race with page_freelist_coalesce.
1673 		 */
1674 		page_ctr_sub(mnode, mtype, pp, flags);
1675 		mutex_exit(pcm);
1676 
1677 #if defined(__sparc)
1678 		if (PP_ISNORELOC(pp)) {
1679 			kcage_freemem_sub(1);
1680 		}
1681 #endif
1682 		return;
1683 	}
1684 
1685 	/*
1686 	 * Large pages on the cache list are not supported.
1687 	 */
1688 	if (flags & PG_CACHE_LIST)
1689 		panic("page_list_sub: large page on cachelist");
1690 
1691 	/*
1692 	 * Slow but rare.
1693 	 *
1694 	 * Somebody wants this particular page which is part
1695 	 * of a large page. In this case we just demote the page
1696 	 * if it's on the freelist.
1697 	 *
1698 	 * We have to drop pcm before locking the entire freelist.
1699 	 * Once we have re-locked the freelist check to make sure
1700 	 * the page hasn't already been demoted or completely
1701 	 * freed.
1702 	 */
1703 	mutex_exit(pcm);
1704 	page_freelist_lock(mnode);
1705 	if (pp->p_szc != 0) {
1706 		/*
1707 		 * Large page is on freelist.
1708 		 */
1709 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1710 		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1711 	}
1712 	ASSERT(PP_ISFREE(pp));
1713 	ASSERT(PP_ISAGED(pp));
1714 	ASSERT(pp->p_szc == 0);
1715 
1716 	/*
1717 	 * Subtract counters before releasing pcm mutex
1718 	 * to avoid race with page_freelist_coalesce.
1719 	 */
1720 	bin = PP_2_BIN(pp);
1721 	mtype = PP_2_MTYPE(pp);
1722 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1723 
1724 	page_sub(ppp, pp);
1725 	page_ctr_sub(mnode, mtype, pp, flags);
1726 	page_freelist_unlock(mnode);
1727 
1728 #if defined(__sparc)
1729 	if (PP_ISNORELOC(pp)) {
1730 		kcage_freemem_sub(1);
1731 	}
1732 #endif
1733 }
1734 
1735 void
1736 page_list_sub_pages(page_t *pp, uint_t szc)
1737 {
1738 	kmutex_t *pcm;
1739 	uint_t	bin, mtype;
1740 	int	mnode;
1741 
1742 	ASSERT(PAGE_EXCL(pp));
1743 	ASSERT(PP_ISFREE(pp));
1744 	ASSERT(PP_ISAGED(pp));
1745 
1746 	/*
1747 	 * See comment in page_list_sub().
1748 	 */
1749 try_again:
1750 	bin = PP_2_BIN(pp);
1751 	mnode = PP_2_MEM_NODE(pp);
1752 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1753 	mutex_enter(pcm);
1754 	if (PP_2_BIN(pp) != bin) {
1755 		mutex_exit(pcm);
1756 		goto	try_again;
1757 	}
1758 
1759 	/*
1760 	 * If we're called with a page larger than szc or it got
1761 	 * promoted above szc before we locked the freelist then
1762 	 * drop pcm and re-lock entire freelist. If page still larger
1763 	 * than szc then demote it.
1764 	 */
1765 	if (pp->p_szc > szc) {
1766 		mutex_exit(pcm);
1767 		pcm = NULL;
1768 		page_freelist_lock(mnode);
1769 		if (pp->p_szc > szc) {
1770 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1771 			(void) page_demote(mnode,
1772 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1773 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1774 		}
1775 		bin = PP_2_BIN(pp);
1776 	}
1777 	ASSERT(PP_ISFREE(pp));
1778 	ASSERT(PP_ISAGED(pp));
1779 	ASSERT(pp->p_szc <= szc);
1780 	ASSERT(pp == PP_PAGEROOT(pp));
1781 
1782 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1783 
1784 	mtype = PP_2_MTYPE(pp);
1785 	if (pp->p_szc != 0) {
1786 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1787 		CHK_LPG(pp, pp->p_szc);
1788 	} else {
1789 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1790 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1791 	}
1792 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1793 
1794 	if (pcm != NULL) {
1795 		mutex_exit(pcm);
1796 	} else {
1797 		page_freelist_unlock(mnode);
1798 	}
1799 
1800 #if defined(__sparc)
1801 	if (PP_ISNORELOC(pp)) {
1802 		pgcnt_t	pgcnt;
1803 
1804 		pgcnt = page_get_pagecnt(pp->p_szc);
1805 		kcage_freemem_sub(pgcnt);
1806 	}
1807 #endif
1808 }
1809 
1810 /*
1811  * Add the page to the front of a linked list of pages
1812  * using the p_next & p_prev pointers for the list.
1813  * The caller is responsible for protecting the list pointers.
1814  */
1815 void
1816 mach_page_add(page_t **ppp, page_t *pp)
1817 {
1818 	if (*ppp == NULL) {
1819 		pp->p_next = pp->p_prev = pp;
1820 	} else {
1821 		pp->p_next = *ppp;
1822 		pp->p_prev = (*ppp)->p_prev;
1823 		(*ppp)->p_prev = pp;
1824 		pp->p_prev->p_next = pp;
1825 	}
1826 	*ppp = pp;
1827 }
1828 
1829 /*
1830  * Remove this page from a linked list of pages
1831  * using the p_next & p_prev pointers for the list.
1832  *
1833  * The caller is responsible for protecting the list pointers.
1834  */
1835 void
1836 mach_page_sub(page_t **ppp, page_t *pp)
1837 {
1838 	ASSERT(PP_ISFREE(pp));
1839 
1840 	if (*ppp == NULL || pp == NULL)
1841 		panic("mach_page_sub");
1842 
1843 	if (*ppp == pp)
1844 		*ppp = pp->p_next;		/* go to next page */
1845 
1846 	if (*ppp == pp)
1847 		*ppp = NULL;			/* page list is gone */
1848 	else {
1849 		pp->p_prev->p_next = pp->p_next;
1850 		pp->p_next->p_prev = pp->p_prev;
1851 	}
1852 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1853 }
1854 
1855 /*
1856  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1857  */
1858 void
1859 page_promote_size(page_t *pp, uint_t cur_szc)
1860 {
1861 	pfn_t pfn;
1862 	int mnode;
1863 	int idx;
1864 	int new_szc = cur_szc + 1;
1865 	int full = FULL_REGION_CNT(new_szc);
1866 
1867 	pfn = page_pptonum(pp);
1868 	mnode = PFN_2_MEM_NODE(pfn);
1869 
1870 	page_freelist_lock(mnode);
1871 
1872 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1873 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1874 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1875 
1876 	page_freelist_unlock(mnode);
1877 }
1878 
/*
 * Failure counters bumped (with plain increments) by page_promote():
 * page_promote_err is incremented when a promotion is abandoned, either
 * because the constituent pages disagree on PP_ISNORELOC() or because a
 * trylock on a cachelist page failed; page_promote_noreloc_err counts
 * only the PP_ISNORELOC() mismatch case.
 */
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
1881 
1882 /*
1883  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1884  * for the given mnode starting at pfnum. Pages involved are on the freelist
1885  * before the call and may be returned to the caller if requested, otherwise
1886  * they will be placed back on the freelist.
1887  * If flags is PC_ALLOC, then the large page will be returned to the user in
1888  * a state which is consistent with a page being taken off the freelist.  If
1889  * we failed to lock the new large page, then we will return NULL to the
1890  * caller and put the large page on the freelist instead.
1891  * If flags is PC_FREE, then the large page will be placed on the freelist,
1892  * and NULL will be returned.
1893  * The caller is responsible for locking the freelist as well as any other
1894  * accounting which needs to be done for a returned page.
1895  *
1896  * RFE: For performance pass in pp instead of pfnum so
1897  * 	we can avoid excessive calls to page_numtopp_nolock().
1898  *	This would depend on an assumption that all contiguous
1899  *	pages are in the same memseg so we can just add/dec
1900  *	our pp.
1901  *
1902  * Lock ordering:
1903  *
1904  *	There is a potential but rare deadlock situation
1905  *	for page promotion and demotion operations. The problem
1906  *	is there are two paths into the freelist manager and
1907  *	they have different lock orders:
1908  *
1909  *	page_create()
1910  *		lock freelist
1911  *		page_lock(EXCL)
1912  *		unlock freelist
1913  *		return
1914  *		caller drops page_lock
1915  *
1916  *	page_free() and page_reclaim()
1917  *		caller grabs page_lock(EXCL)
1918  *
1919  *		lock freelist
1920  *		unlock freelist
1921  *		drop page_lock
1922  *
1923  *	What prevents a thread in page_create() from deadlocking
1924  *	with a thread freeing or reclaiming the same page is the
1925  *	page_trylock() in page_get_freelist(). If the trylock fails
1926  *	it skips the page.
1927  *
1928  *	The lock ordering for promotion and demotion is the same as
1929  *	for page_create(). Since the same deadlock could occur during
1930  *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
1933  */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int 		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and what
	 * ever other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	/* (this early exit does not bump the page_promote_err counters) */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
			return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	/* walks backwards from the last constituent down to start_pp + 1 */
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		/*
		 * Note: this overwrites the mtype argument; from here on
		 * mtype describes the constituent currently being handled.
		 */
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				/* undo the page lock before bailing out */
				page_unlock_nocapture(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			/* the page has lost its identity: treat it as aged */
			PP_SETAGED(pp);
			page_unlock_nocapture(pp);
			which_list = PG_CACHE_LIST;
		}
		/* account for the removal against the list it came from */
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		/*
		 * npgs is taken from the old p_szc before the loop below
		 * overwrites it; tmpnpgs keeps the count because npgs is
		 * consumed by the loop.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		/*
		 * Step to the next constituent by page_t pointer arithmetic.
		 * NOTE(review): this relies on the constituents being
		 * adjacent page_t structures -- confirm that holds for
		 * every caller (cf. the RFE note on memsegs).
		 */
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	/* if the trylock fails we fall through and free the page instead */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done sofar and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	/*
	 * Every page collected so far goes back on the szc 0 freelist one
	 * at a time. This includes pages that were taken from the
	 * cachelist: they have already been hashed out and marked aged
	 * above, so they are not returned to the cachelist.
	 */
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}
2105 
2106 /*
2107  * Break up a large page into smaller size pages.
2108  * Pages involved are on the freelist before the call and may
2109  * be returned to the caller if requested, otherwise they will
2110  * be placed back on the freelist.
2111  * The caller is responsible for locking the freelist as well as any other
2112  * accounting which needs to be done for a returned page.
2113  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2114  * technically, any value may be passed in but PC_NO_COLOR is the standard
2115  * which should be followed for clarity's sake.
2116  * Returns a page whose pfn is < pfnmax
 *
 * Arguments:
 *	mnode	- memory node; asserted to match the node of the pages.
 *	pfnum	- pfn of the free cur_szc page to demote; the page is taken
 *		  off the cur_szc free list here.
 *	pfnmax	- upper pfn bound for a returned page; 0 disables the bound.
 *	cur_szc	- current size code of the page (must be non-zero).
 *	new_szc	- size code to demote to (must be smaller than cur_szc).
 *	color	- bin a returned page must fall in (PC_ALLOC only).
 *	flags	- PC_ALLOC to ask for one of the resulting pages back.
 *
 * At most one new_szc page is handed back: the first piece whose bin equals
 * color, which satisfies pfnmax and which page_trylock_cons() can lock
 * SE_EXCL.  Every other piece is added to the new_szc free list and the
 * page counters.  NULL is returned when no piece was handed back.
2117  */
2118 page_t *
2119 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2120     uchar_t new_szc, int color, int flags)
2121 {
2122 	page_t	*pp, *pplist, *npplist;
2123 	pgcnt_t	npgs, n;
2124 	uint_t	bin;
2125 	uint_t	mtype;
2126 	page_t	*ret_pp = NULL;
2127 
2128 	ASSERT(cur_szc != 0);
2129 	ASSERT(new_szc < cur_szc);
2130 
2131 	pplist = page_numtopp_nolock(pfnum);
2132 	ASSERT(pplist != NULL);
2133 
2134 	ASSERT(pplist->p_szc == cur_szc);
2135 
	/* Pull the whole large page off its free list and out of the counters. */
2136 	bin = PP_2_BIN(pplist);
2137 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2138 	mtype = PP_2_MTYPE(pplist);
2139 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2140 
2141 	CHK_LPG(pplist, cur_szc);
2142 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2143 
2144 	/*
2145 	 * Number of PAGESIZE pages for smaller new_szc
2146 	 * page.
2147 	 */
2148 	npgs = page_get_pagecnt(new_szc);
2149 
	/* Peel new_szc sized pieces off pplist until nothing is left. */
2150 	while (pplist) {
2151 		pp = pplist;
2152 
2153 		ASSERT(pp->p_szc == cur_szc);
2154 
2155 		/*
2156 		 * We either break it up into PAGESIZE pages or larger.
2157 		 */
2158 		if (npgs == 1) {	/* PAGESIZE case */
2159 			mach_page_sub(&pplist, pp);
2160 			ASSERT(pp->p_szc == cur_szc);
2161 			ASSERT(new_szc == 0);
2162 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2163 			pp->p_szc = new_szc;
2164 			bin = PP_2_BIN(pp);
			/*
			 * Keep the first suitable page for the caller; the
			 * trylock is attempted only when every other test
			 * has already passed.
			 */
2165 			if ((bin == color) && (flags == PC_ALLOC) &&
2166 			    (ret_pp == NULL) && (pfnmax == 0 ||
2167 			    pp->p_pagenum < pfnmax) &&
2168 			    page_trylock_cons(pp, SE_EXCL)) {
2169 				ret_pp = pp;
2170 			} else {
2171 				mtype = PP_2_MTYPE(pp);
2172 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2173 				    mtype), pp);
2174 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2175 			}
2176 		} else {
2177 			page_t *try_to_return_this_page = NULL;
2178 			int count = 0;
2179 
2180 			/*
2181 			 * Break down into smaller lists of pages.
2182 			 */
2183 			page_list_break(&pplist, &npplist, npgs);
2184 
2185 			pp = pplist;
2186 			n = npgs;
2187 			while (n--) {
2188 				ASSERT(pp->p_szc == cur_szc);
2189 				/*
2190 				 * Check whether all the pages in this list
2191 				 * fit the request criteria.
2192 				 */
2193 				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2194 					count++;
2195 				}
2196 				pp->p_szc = new_szc;
2197 				pp = pp->p_next;
2198 			}
2199 
			/*
			 * NOTE(review): pp has been advanced npgs times along
			 * p_next at this point; presumably the list left in
			 * pplist by page_list_break() is circular, so pp is
			 * back at pplist -- confirm against page_list_break().
			 */
2200 			if (count == npgs &&
2201 			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2202 				try_to_return_this_page = pp;
2203 			}
2204 
2205 			CHK_LPG(pplist, new_szc);
2206 
2207 			bin = PP_2_BIN(pplist);
2208 			if (try_to_return_this_page)
2209 				ASSERT(mnode ==
2210 				    PP_2_MEM_NODE(try_to_return_this_page));
2211 			if ((bin == color) && (flags == PC_ALLOC) &&
2212 			    (ret_pp == NULL) && try_to_return_this_page &&
2213 			    page_trylock_cons(try_to_return_this_page,
2214 			    SE_EXCL)) {
2215 				ret_pp = try_to_return_this_page;
2216 			} else {
2217 				mtype = PP_2_MTYPE(pp);
2218 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2219 				    bin, mtype), pplist);
2220 
2221 				page_ctr_add(mnode, mtype, pplist,
2222 				    PG_FREE_LIST);
2223 			}
			/* Continue with whatever page_list_break() left over. */
2224 			pplist = npplist;
2225 		}
2226 	}
2227 	return (ret_pp);
2228 }
2229 
/*
 * When non-zero, page_freelist_coalesce() and page_freelist_coalesce_all()
 * return early without coalescing anything.
 */
2230 int mpss_coalesce_disable = 0;
2231 
2232 /*
2233  * Coalesce free pages into a page of the given szc and color if possible.
2234  * Return the pointer to the page created, otherwise, return NULL.
2235  *
2236  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 * (The code tests pfnhi against PFNNULL to decide whether a limit was given.)
 *
 * Arguments:
 *	mnode	 - memory node to search.
 *	szc	 - size code of the page to build; also the region size used
 *		   to index the page counters.
 *	color	 - requested color; only the bits in ceq_mask are significant.
 *	ceq_mask - color bits a candidate must match.
 *	mtype	 - memory type; selects the pfn range and counter range.
 *	pfnhi	 - exclusive upper pfn limit, or PFNNULL for none.
 *
 * Locking: page_ctrs_rwlock[mnode] is held as reader for the whole search,
 * and page_freelist_lock(mnode) is taken around each promotion attempt.
 * Both are released before returning.
2237  */
2238 page_t *
2239 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2240     int mtype, pfn_t pfnhi)
2241 {
2242 	int 	r = szc;		/* region size */
2243 	int	mrange;
2244 	uint_t 	full, bin, color_mask, wrap = 0;
2245 	pfn_t	pfnum, lo, hi;
2246 	size_t	len, idx, idx0;
2247 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
2248 	page_t	*ret_pp;
2249 	MEM_NODE_ITERATOR_DECL(it);
2250 #if defined(__sparc)
2251 	pfn_t pfnum0, nlo, nhi;
2252 #endif
2253 
2254 	if (mpss_coalesce_disable) {
2255 		ASSERT(szc < MMU_PAGE_SIZES);
2256 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2257 		return (NULL);
2258 	}
2259 
2260 	ASSERT(szc < mmu_page_sizes);
2261 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2262 	ASSERT(ceq_mask <= color_mask);
2263 	ASSERT(color <= color_mask);
2264 	color &= ceq_mask;
2265 
2266 	/* Prevent page_counters dynamic memory from being freed */
2267 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2268 
2269 	mrange = MTYPE_2_MRANGE(mnode, mtype);
2270 	ASSERT(mrange < mnode_nranges[mnode]);
2271 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2272 
2273 	/* get pfn range for mtype */
2274 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
2275 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	/* from here on hi is an exclusive bound (see the pfnum >= hi tests) */
2276 	hi++;
2277 
2278 	/* use lower limit if given */
2279 	if (pfnhi != PFNNULL && pfnhi < hi)
2280 		hi = pfnhi;
2281 
2282 	/* round to szcpgcnt boundaries */
2283 	lo = P2ROUNDUP(lo, szcpgcnt);
2284 	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2285 	if (lo == (pfn_t)-1) {
2286 		rw_exit(&page_ctrs_rwlock[mnode]);
2287 		return (NULL);
2288 	}
2289 	hi = hi & ~(szcpgcnt - 1);
2290 
2291 	/* set lo to the closest pfn of the right color */
2292 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2293 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2294 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2295 		    &it);
2296 	}
2297 
2298 	if (hi <= lo) {
2299 		rw_exit(&page_ctrs_rwlock[mnode]);
2300 		return (NULL);
2301 	}
2302 
2303 	full = FULL_REGION_CNT(r);
2304 
2305 	/* calculate the number of page candidates and initial search index */
2306 	bin = color;
2307 	idx0 = (size_t)(-1);
2308 	do {
2309 		pgcnt_t acand;
2310 
2311 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2312 		if (acand) {
2313 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2314 			    r, bin, mrange);
2315 			idx0 = MIN(idx0, idx);
2316 			cands += acand;
2317 		}
2318 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2319 	} while (bin != color);
2320 
2321 	if (cands == 0) {
2322 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2323 		rw_exit(&page_ctrs_rwlock[mnode]);
2324 		return (NULL);
2325 	}
2326 
	/*
	 * Start from the lowest saved index among the equivalent colors,
	 * falling back to lo when that pfn lies outside [lo, hi) or cannot
	 * be used as an iterator start.
	 */
2327 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
2328 	if (pfnum < lo || pfnum >= hi) {
2329 		pfnum = lo;
2330 	} else {
2331 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2332 		if (pfnum == (pfn_t)-1) {
2333 			pfnum = lo;
2334 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2335 			ASSERT(pfnum != (pfn_t)-1);
2336 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2337 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2338 			/* invalid color, get the closest correct pfn */
2339 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2340 			    color_mask, &it);
2341 			if (pfnum >= hi) {
2342 				pfnum = lo;
2343 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2344 			}
2345 		}
2346 	}
2347 
2348 	/* set starting index */
2349 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2350 	ASSERT(idx0 < len);
2351 
2352 #if defined(__sparc)
2353 	pfnum0 = pfnum;		/* page corresponding to idx0 */
2354 	nhi = 0;		/* search kcage ranges */
2355 #endif
2356 
	/*
	 * Scan forward from idx0.  wrap counts how often the scan has been
	 * reset to lo; the loop continues until the first wrap, then only
	 * while idx is still below idx0, and never past a second wrap.
	 */
2357 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2358 
2359 #if defined(__sparc)
2360 		/*
2361 		 * Find lowest intersection of kcage ranges and mnode.
2362 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
2363 		 */
2364 		if (nhi <= pfnum) {
2365 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2366 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2367 				goto wrapit;
2368 
2369 			/* jump to the next page in the range */
2370 			if (pfnum < nlo) {
2371 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2372 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2373 				idx = PNUM_TO_IDX(mnode, r, pfnum);
2374 				if (idx >= len || pfnum >= hi)
2375 					goto wrapit;
2376 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2377 				    ceq_mask)
2378 					goto next;
2379 				if (interleaved_mnodes &&
2380 				    PFN_2_MEM_NODE(pfnum) != mnode)
2381 					goto next;
2382 			}
2383 		}
2384 #endif
2385 
		/* cheap unlocked test; repeated below under the lock */
2386 		if (PAGE_COUNTERS(mnode, r, idx) != full)
2387 			goto next;
2388 
2389 		/*
2390 		 * RFE: For performance maybe we can do something less
2391 		 *	brutal than locking the entire freelist. So far
2392 		 * 	this doesn't seem to be a performance problem?
2393 		 */
2394 		page_freelist_lock(mnode);
2395 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
2396 			ret_pp =
2397 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2398 			if (ret_pp != NULL) {
2399 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				/* remember where the next search should start */
2400 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2401 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2402 				page_freelist_unlock(mnode);
2403 				rw_exit(&page_ctrs_rwlock[mnode]);
2404 #if defined(__sparc)
2405 				if (PP_ISNORELOC(ret_pp)) {
2406 					pgcnt_t npgs;
2407 
2408 					npgs = page_get_pagecnt(ret_pp->p_szc);
2409 					kcage_freemem_sub(npgs);
2410 				}
2411 #endif
2412 				return (ret_pp);
2413 			}
2414 		} else {
2415 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2416 		}
2417 
2418 		page_freelist_unlock(mnode);
2419 		/*
2420 		 * No point looking for another page if we've
2421 		 * already tried all of the ones that
2422 		 * page_ctr_cands indicated.  Stash off where we left
2423 		 * off.
2424 		 * Note: this is not exact since we don't hold the
2425 		 * page_freelist_locks before we initially get the
2426 		 * value of cands for performance reasons, but should
2427 		 * be a decent approximation.
2428 		 */
2429 		if (--cands == 0) {
2430 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2431 			    idx;
2432 			break;
2433 		}
2434 next:
2435 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2436 		    color_mask, &it);
2437 		idx = PNUM_TO_IDX(mnode, r, pfnum);
2438 		if (idx >= len || pfnum >= hi) {
2439 wrapit:
			/* ran off the end of the range: restart the scan at lo */
2440 			pfnum = lo;
2441 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2442 			idx = PNUM_TO_IDX(mnode, r, pfnum);
2443 			wrap++;
2444 #if defined(__sparc)
2445 			nhi = 0;	/* search kcage ranges */
2446 #endif
2447 		}
2448 	}
2449 
2450 	rw_exit(&page_ctrs_rwlock[mnode]);
2451 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2452 	return (NULL);
2453 }
2454 
2455 /*
2456  * For the given mnode, promote as many small pages to large pages as possible.
2457  * mnode can be -1, which means do them all
2458  */
2459 void
2460 page_freelist_coalesce_all(int mnode)
2461 {
2462 	int 	r;		/* region size */
2463 	int 	idx, full;
2464 	size_t	len;
2465 	int doall = interleaved_mnodes || mnode < 0;
2466 	int mlo = doall ? 0 : mnode;
2467 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2468 
2469 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2470 
2471 	if (mpss_coalesce_disable) {
2472 		return;
2473 	}
2474 
2475 	/*
2476 	 * Lock the entire freelist and coalesce what we can.
2477 	 *
2478 	 * Always promote to the largest page possible
2479 	 * first to reduce the number of page promotions.
2480 	 */
2481 	for (mnode = mlo; mnode < mhi; mnode++) {
2482 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2483 		page_freelist_lock(mnode);
2484 	}
2485 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2486 		for (mnode = mlo; mnode < mhi; mnode++) {
2487 			pgcnt_t cands = 0;
2488 			int mrange, nranges = mnode_nranges[mnode];
2489 
2490 			for (mrange = 0; mrange < nranges; mrange++) {
2491 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2492 				if (cands != 0)
2493 					break;
2494 			}
2495 			if (cands == 0) {
2496 				VM_STAT_ADD(vmm_vmstats.
2497 				    page_ctrs_cands_skip_all);
2498 				continue;
2499 			}
2500 
2501 			full = FULL_REGION_CNT(r);
2502 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2503 
2504 			for (idx = 0; idx < len; idx++) {
2505 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2506 					pfn_t pfnum =
2507 					    IDX_TO_PNUM(mnode, r, idx);
2508 					int tmnode = interleaved_mnodes ?
2509 					    PFN_2_MEM_NODE(pfnum) : mnode;
2510 
2511 					ASSERT(pfnum >=
2512 					    mem_node_config[tmnode].physbase &&
2513 					    pfnum <
2514 					    mem_node_config[tmnode].physmax);
2515 
2516 					(void) page_promote(tmnode,
2517 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2518 				}
2519 			}
2520 			/* shared hpm_counters covers all mnodes, so we quit */
2521 			if (interleaved_mnodes)
2522 				break;
2523 		}
2524 	}
2525 	for (mnode = mlo; mnode < mhi; mnode++) {
2526 		page_freelist_unlock(mnode);
2527 		rw_exit(&page_ctrs_rwlock[mnode]);
2528 	}
2529 }
2530 
2531 /*
2532  * This is where all polices for moving pages around
2533  * to different page size free lists is implemented.
2534  * Returns 1 on success, 0 on failure.
2535  *
2536  * So far these are the priorities for this algorithm in descending
2537  * order:
2538  *
2539  *	1) When servicing a request try to do so with a free page
2540  *	   from next size up. Helps defer fragmentation as long
2541  *	   as possible.
2542  *
2543  *	2) Page coalesce on demand. Only when a freelist
2544  *	   larger than PAGESIZE is empty and step 1
2545  *	   will not work since all larger size lists are
2546  *	   also empty.
2547  *
2548  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2549  */
2550 
2551 page_t *
2552 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2553     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2554 {
2555 	uchar_t nszc = szc + 1;
2556 	uint_t 	bin, sbin, bin_prev;
2557 	page_t	*pp, *firstpp;
2558 	page_t	*ret_pp = NULL;
2559 	uint_t  color_mask;
2560 
2561 	if (nszc == mmu_page_sizes)
2562 		return (NULL);
2563 
2564 	ASSERT(nszc < mmu_page_sizes);
2565 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2566 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2567 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2568 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2569 
2570 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2571 	/*
2572 	 * First try to break up a larger page to fill current size freelist.
2573 	 */
2574 	while (plw->plw_bins[nszc] != 0) {
2575 
2576 		ASSERT(nszc < mmu_page_sizes);
2577 
2578 		/*
2579 		 * If page found then demote it.
2580 		 */
2581 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2582 			page_freelist_lock(mnode);
2583 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2584 
2585 			/*
2586 			 * If pfnhi is not PFNNULL, look for large page below
2587 			 * pfnhi. PFNNULL signifies no pfn requirement.
2588 			 */
2589 			if (pp &&
2590 			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2591 			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2592 				do {
2593 					pp = pp->p_vpnext;
2594 					if (pp == firstpp) {
2595 						pp = NULL;
2596 						break;
2597 					}
2598 				} while ((pfnhi != PFNNULL &&
2599 				    pp->p_pagenum >= pfnhi) ||
2600 				    (pfnlo != PFNNULL &&
2601 				    pp->p_pagenum < pfnlo));
2602 
2603 				if (pfnhi != PFNNULL && pp != NULL)
2604 					ASSERT(pp->p_pagenum < pfnhi);
2605 
2606 				if (pfnlo != PFNNULL && pp != NULL)
2607 					ASSERT(pp->p_pagenum >= pfnlo);
2608 			}
2609 			if (pp) {
2610 				uint_t ccolor = page_correct_color(szc, nszc,
2611 				    color, bin, plw->plw_ceq_mask[szc]);
2612 
2613 				ASSERT(pp->p_szc == nszc);
2614 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2615 				ret_pp = page_demote(mnode, pp->p_pagenum,
2616 				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2617 				if (ret_pp) {
2618 					page_freelist_unlock(mnode);
2619 #if defined(__sparc)
2620 					if (PP_ISNORELOC(ret_pp)) {
2621 						pgcnt_t npgs;
2622 
2623 						npgs = page_get_pagecnt(
2624 						    ret_pp->p_szc);
2625 						kcage_freemem_sub(npgs);
2626 					}
2627 #endif
2628 					return (ret_pp);
2629 				}
2630 			}
2631 			page_freelist_unlock(mnode);
2632 		}
2633 
2634 		/* loop through next size bins */
2635 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2636 		plw->plw_bins[nszc]--;
2637 
2638 		if (bin == sbin) {
2639 			uchar_t nnszc = nszc + 1;
2640 
2641 			/* we are done with this page size - check next */
2642 			if (plw->plw_bins[nnszc] == 0)
2643 				/* we have already checked next size bins */
2644 				break;
2645 
2646 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2647 			if (bin_prev != INVALID_COLOR) {
2648 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2649 				if (!((bin ^ bin_prev) &
2650 				    plw->plw_ceq_mask[nnszc]))
2651 					break;
2652 			}
2653 			ASSERT(nnszc < mmu_page_sizes);
2654 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2655 			nszc = nnszc;
2656 			ASSERT(nszc < mmu_page_sizes);
2657 		}
2658 	}
2659 
2660 	return (ret_pp);
2661 }
2662 
2663 /*
2664  * Helper routine used only by the freelist code to lock
2665  * a page. If the page is a large page then it succeeds in
2666  * locking all the constituent pages or none at all.
2667  * Returns 1 on sucess, 0 on failure.
2668  */
2669 static int
2670 page_trylock_cons(page_t *pp, se_t se)
2671 {
2672 	page_t	*tpp, *first_pp = pp;
2673 
2674 	/*
2675 	 * Fail if can't lock first or only page.
2676 	 */
2677 	if (!page_trylock(pp, se)) {
2678 		return (0);
2679 	}
2680 
2681 	/*
2682 	 * PAGESIZE: common case.
2683 	 */
2684 	if (pp->p_szc == 0) {
2685 		return (1);
2686 	}
2687 
2688 	/*
2689 	 * Large page case.
2690 	 */
2691 	tpp = pp->p_next;
2692 	while (tpp != pp) {
2693 		if (!page_trylock(tpp, se)) {
2694 			/*
2695 			 * On failure unlock what we have locked so far.
2696 			 * We want to avoid attempting to capture these
2697 			 * pages as the pcm mutex may be held which could
2698 			 * lead to a recursive mutex panic.
2699 			 */
2700 			while (first_pp != tpp) {
2701 				page_unlock_nocapture(first_pp);
2702 				first_pp = first_pp->p_next;
2703 			}
2704 			return (0);
2705 		}
2706 		tpp = tpp->p_next;
2707 	}
2708 	return (1);
2709 }
2710 
2711 /*
2712  * init context for walking page lists
2713  * Called when a page of the given szc in unavailable. Sets markers
2714  * for the beginning of the search to detect when search has
2715  * completed a full cycle. Sets flags for splitting larger pages
2716  * and coalescing smaller pages. Page walking procedes until a page
2717  * of the desired equivalent color is found.
2718  */
2719 void
2720 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2721     int use_ceq, page_list_walker_t *plw)
2722 {
2723 	uint_t  nszc, ceq_mask, colors;
2724 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2725 
2726 	ASSERT(szc < mmu_page_sizes);
2727 	colors = PAGE_GET_PAGECOLORS(szc);
2728 
2729 	plw->plw_colors = colors;
2730 	plw->plw_color_mask = colors - 1;
2731 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2732 	plw->plw_bin_split_prev = bin;
2733 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2734 
2735 	/*
2736 	 * if vac aliasing is possible make sure lower order color
2737 	 * bits are never ignored
2738 	 */
2739 	if (vac_colors > 1)
2740 		ceq &= 0xf0;
2741 
2742 	/*
2743 	 * calculate the number of non-equivalent colors and
2744 	 * color equivalency mask
2745 	 */
2746 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2747 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2748 	ASSERT(plw->plw_ceq_dif > 0);
2749 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2750 
2751 	if (flags & PG_MATCH_COLOR) {
2752 		if (cpu_page_colors <  0) {
2753 			/*
2754 			 * this is a heterogeneous machine with different CPUs
2755 			 * having different size e$ (not supported for ni2/rock
2756 			 */
2757 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2758 			cpucolors = MAX(cpucolors, 1);
2759 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2760 			plw->plw_ceq_mask[szc] =
2761 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2762 		}
2763 		plw->plw_ceq_dif = 1;
2764 	}
2765 
2766 	/* we can split pages in the freelist, but not the cachelist */
2767 	if (can_split) {
2768 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2769 
2770 		/* set next szc color masks and number of free list bins */
2771 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2772 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2773 			    plw->plw_ceq_mask[szc]);
2774 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2775 		}
2776 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2777 		plw->plw_bins[nszc] = 0;
2778 
2779 	} else {
2780 		ASSERT(szc == 0);
2781 		plw->plw_do_split = 0;
2782 		plw->plw_bins[1] = 0;
2783 		plw->plw_ceq_mask[1] = INVALID_MASK;
2784 	}
2785 }
2786 
/*
 * set mark to flag where next split should occur
 *
 * Stores in plw->plw_split_next the next-size color that follows the
 * next-size color of 'bin'.  If that value equals the next-size color of
 * plw->plw_bin0 under plw->plw_ceq_mask[nszc], it is advanced once more.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (it can be used unbraced in if/else).  'plw' is evaluated
 * exactly once and may be any expression of type page_list_walker_t *.
 * The temporaries carry a psm_ prefix so they cannot capture variables
 * named in the arguments.  'szc' and 'nszc' are evaluated more than once
 * and must therefore be free of side effects.
 */
#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) do {		     \
	page_list_walker_t *psm_plw_ = (plw);				     \
	uint_t psm_bin_nsz_ = PAGE_GET_NSZ_COLOR(szc, bin);		     \
	uint_t psm_bin0_nsz_ =						     \
	    PAGE_GET_NSZ_COLOR(szc, psm_plw_->plw_bin0);		     \
	uint_t psm_neq_mask_ =						     \
	    ~psm_plw_->plw_ceq_mask[nszc] & psm_plw_->plw_color_mask;	     \
	psm_plw_->plw_split_next =					     \
	    INC_MASKED(psm_bin_nsz_, psm_neq_mask_,			     \
	    psm_plw_->plw_color_mask);					     \
	if (!((psm_plw_->plw_split_next ^ psm_bin0_nsz_) &		     \
	    psm_plw_->plw_ceq_mask[nszc])) {				     \
		psm_plw_->plw_split_next =				     \
		    INC_MASKED(psm_plw_->plw_split_next,		     \
		    psm_neq_mask_, psm_plw_->plw_color_mask);		     \
	}								     \
} while (0)
2802 
/*
 * Advance a page list walk to the next color bin.
 *
 * 'bin' is the bin that was just searched for pages of size szc.  The
 * return value is the bin to search next.  The walker state in *plw is
 * updated as a side effect:
 *
 *	- if a split was pending (plw_do_split set), plw_bin_split_prev is
 *	  set to 'bin', plw_split_next is recomputed and plw_do_split is
 *	  cleared;
 *	- for szc 0, plw_bin_marker and plw_bin_split_prev may be moved;
 *	- plw_do_split is set again when plw_bins[nszc] is non-zero and the
 *	  next-size color of the chosen bin meets the test made against
 *	  plw_split_next (or, on the szc 0 jump path, against the next-size
 *	  color of plw_bin0).
 */
2803 uint_t
2804 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2805 {
2806 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2807 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2808 	uchar_t nszc = szc + 1;
2809 
	/* default candidate: step by plw_bin_step under neq_mask */
2810 	nbin = ADD_MASKED(bin,
2811 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2812 
2813 	if (plw->plw_do_split) {
2814 		plw->plw_bin_split_prev = bin;
2815 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2816 		plw->plw_do_split = 0;
2817 	}
2818 
2819 	if (szc == 0) {
2820 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
			/* stepped back onto the starting bin: step once more */
2821 			if (nbin == plw->plw_bin0 &&
2822 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2823 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2824 				    neq_mask, plw->plw_color_mask);
2825 				plw->plw_bin_split_prev = plw->plw_bin0;
2826 			}
2827 
2828 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2829 				plw->plw_bin_marker =
2830 				    nbin = INC_MASKED(nbin, neq_mask,
2831 				    plw->plw_color_mask);
2832 				plw->plw_bin_split_prev = plw->plw_bin0;
2833 				/*
2834 				 * large pages all have the same vac color
2835 				 * so by now we should be done with next
2836 				 * size page splitting process
2837 				 */
2838 				ASSERT(plw->plw_bins[1] == 0);
2839 				plw->plw_do_split = 0;
2840 				return (nbin);
2841 			}
2842 
2843 		} else {
			/*
			 * First step of the walk (plw_count is 0): try to
			 * jump BIN_STEP bins away from the starting bin.
			 */
2844 			uint_t bin_jump = (vac_colors == 1) ?
2845 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2846 
2847 			bin_jump &= ~(vac_colors - 1);
2848 
2849 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2850 			    plw->plw_color_mask);
2851 
			/* take the jump only if it lands on a different color */
2852 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2853 
2854 				plw->plw_bin_marker = nbin = nbin0;
2855 
2856 				if (plw->plw_bins[nszc] != 0) {
2857 					/*
2858 					 * check if next page size bin is the
2859 					 * same as the next page size bin for
2860 					 * bin0
2861 					 */
2862 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2863 					    nbin);
2864 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2865 					    plw->plw_bin0);
2866 
2867 					if ((bin0_nsz ^ nbin_nsz) &
2868 					    plw->plw_ceq_mask[nszc])
2869 						plw->plw_do_split = 1;
2870 				}
2871 				return (nbin);
2872 			}
2873 		}
2874 	}
2875 
	/*
	 * Request a split when the chosen bin's next-size color matches
	 * plw_split_next under the next-size equivalency mask.
	 */
2876 	if (plw->plw_bins[nszc] != 0) {
2877 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2878 		if (!((plw->plw_split_next ^ nbin_nsz) &
2879 		    plw->plw_ceq_mask[nszc]))
2880 			plw->plw_do_split = 1;
2881 	}
2882 
2883 	return (nbin);
2884 }
2885 
/*
 * Get a free page of size szc from the freelists of memory node mnode.
 *
 * The search starts at color 'bin' with the mtype chosen by MTYPE_START().
 * For each color every equivalent bin is tried in turn.  When none of them
 * yields a lockable page the routine falls back to page_freelist_split()
 * (if the walker asks for a split) and, for szc > 0, to
 * page_freelist_coalesce(), and then moves to the bin picked by
 * page_list_walk_next_bin().  Once all colors are exhausted MTYPE_NEXT()
 * may select another mtype and the whole search is repeated.
 *
 * A page taken directly from a freelist has been locked SE_EXCL by
 * page_trylock_cons() and removed from the list and the page counters.
 * Returns NULL when no page could be obtained.
 */
2886 page_t *
2887 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2888     uint_t flags)
2889 {
2890 	kmutex_t		*pcm;
2891 	page_t			*pp, *first_pp;
2892 	uint_t			sbin;
2893 	int			plw_initialized;
2894 	page_list_walker_t	plw;
2895 
2896 	ASSERT(szc < mmu_page_sizes);
2897 
2898 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2899 
2900 	MTYPE_START(mnode, mtype, flags);
2901 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2902 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2903 		return (NULL);
2904 	}
2905 try_again:
2906 
	/*
	 * The walker is initialized lazily, the first time a bin turns out
	 * to be empty; until then plw_ceq_dif of 1 keeps the loop bounded.
	 */
2907 	plw_initialized = 0;
2908 	plw.plw_ceq_dif = 1;
2909 
2910 	/*
2911 	 * Only hold one freelist lock at a time, that way we
2912 	 * can start anywhere and not have to worry about lock
2913 	 * ordering.
2914 	 */
2915 	for (plw.plw_count = 0;
2916 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2917 		sbin = bin;
2918 		do {
			/* unlocked peek; re-read below under the bin mutex */
2919 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2920 				goto bin_empty_1;
2921 
2922 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2923 			mutex_enter(pcm);
2924 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2925 			if (pp == NULL)
2926 				goto bin_empty_0;
2927 
2928 			/*
2929 			 * These were set before the page
2930 			 * was put on the free list,
2931 			 * they must still be set.
2932 			 */
2933 			ASSERT(PP_ISFREE(pp));
2934 			ASSERT(PP_ISAGED(pp));
2935 			ASSERT(pp->p_vnode == NULL);
2936 			ASSERT(pp->p_hash == NULL);
2937 			ASSERT(pp->p_offset == (u_offset_t)-1);
2938 			ASSERT(pp->p_szc == szc);
2939 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2940 
2941 			/*
2942 			 * Walk down the hash chain.
2943 			 * 8k pages are linked on p_next
2944 			 * and p_prev fields. Large pages
2945 			 * are a contiguous group of
2946 			 * constituent pages linked together
2947 			 * on their p_next and p_prev fields.
2948 			 * The large pages are linked together
2949 			 * on the hash chain using p_vpnext
2950 			 * p_vpprev of the base constituent
2951 			 * page of each large page.
2952 			 */
2953 			first_pp = pp;
2954 			while (!page_trylock_cons(pp, SE_EXCL)) {
2955 				if (szc == 0) {
2956 					pp = pp->p_next;
2957 				} else {
2958 					pp = pp->p_vpnext;
2959 				}
2960 
2961 				ASSERT(PP_ISFREE(pp));
2962 				ASSERT(PP_ISAGED(pp));
2963 				ASSERT(pp->p_vnode == NULL);
2964 				ASSERT(pp->p_hash == NULL);
2965 				ASSERT(pp->p_offset == (u_offset_t)-1);
2966 				ASSERT(pp->p_szc == szc);
2967 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2968 
				/* back at the start: nothing here is lockable */
2969 				if (pp == first_pp)
2970 					goto bin_empty_0;
2971 			}
2972 
2973 			ASSERT(pp != NULL);
2974 			ASSERT(mtype == PP_2_MTYPE(pp));
2975 			ASSERT(pp->p_szc == szc);
2976 			if (szc == 0) {
2977 				page_sub(&PAGE_FREELISTS(mnode,
2978 				    szc, bin, mtype), pp);
2979 			} else {
2980 				page_vpsub(&PAGE_FREELISTS(mnode,
2981 				    szc, bin, mtype), pp);
2982 				CHK_LPG(pp, szc);
2983 			}
2984 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2985 
2986 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2987 				panic("free page is not. pp %p", (void *)pp);
2988 			mutex_exit(pcm);
2989 
2990 #if defined(__sparc)
2991 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2992 			    (flags & PG_NORELOC) == 0);
2993 
2994 			if (PP_ISNORELOC(pp))
2995 				kcage_freemem_sub(page_get_pagecnt(szc));
2996 #endif
2997 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2998 			return (pp);
2999 
			/* bin_empty_0 is entered with pcm held, bin_empty_1 without */
3000 bin_empty_0:
3001 			mutex_exit(pcm);
3002 bin_empty_1:
3003 			if (plw_initialized == 0) {
3004 				page_list_walk_init(szc, flags, bin, 1, 1,
3005 				    &plw);
3006 				plw_initialized = 1;
3007 				ASSERT(plw.plw_colors <=
3008 				    PAGE_GET_PAGECOLORS(szc));
3009 				ASSERT(plw.plw_colors > 0);
3010 				ASSERT((plw.plw_colors &
3011 				    (plw.plw_colors - 1)) == 0);
3012 				ASSERT(bin < plw.plw_colors);
3013 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3014 			}
3015 			/* calculate the next bin with equivalent color */
3016 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3017 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
3018 		} while (sbin != bin);
3019 
3020 		/*
3021 		 * color bins are all empty if color match. Try and
3022 		 * satisfy the request by breaking up or coalescing
3023 		 * pages from a different size freelist of the correct
3024 		 * color that satisfies the ORIGINAL color requested.
3025 		 * If that fails then try pages of the same size but
3026 		 * different colors assuming we are not called with
3027 		 * PG_MATCH_COLOR.
3028 		 */
3029 		if (plw.plw_do_split &&
3030 		    (pp = page_freelist_split(szc, bin, mnode,
3031 		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3032 			return (pp);
3033 
3034 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3035 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
3036 			return (pp);
3037 
3038 		if (plw.plw_ceq_dif > 1)
3039 			bin = page_list_walk_next_bin(szc, bin, &plw);
3040 	}
3041 
3042 	/* if allowed, cycle through additional mtypes */
3043 	MTYPE_NEXT(mnode, mtype, flags);
3044 	if (mtype >= 0)
3045 		goto try_again;
3046 
3047 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3048 
3049 	return (NULL);
3050 }
3051 
3052 /*
3053  * Returns the count of free pages for 'pp' with size code 'szc'.
3054  * Note: This function does not return an exact value as the page freelist
3055  * locks are not held and thus the values in the page_counters may be
3056  * changing as we walk through the data.
3057  */
3058 static int
3059 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3060 {
3061 	pgcnt_t	pgfree;
3062 	pgcnt_t cnt;
3063 	ssize_t	r = szc;	/* region size */
3064 	ssize_t	idx;
3065 	int	i;
3066 	int	full, range;
3067 
3068 	/* Make sure pagenum passed in is aligned properly */
3069 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3070 	ASSERT(szc > 0);
3071 
3072 	/* Prevent page_counters dynamic memory from being freed */
3073 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3074 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3075 	cnt = PAGE_COUNTERS(mnode, r, idx);
3076 	pgfree = cnt << PNUM_SHIFT(r - 1);
3077 	range = FULL_REGION_CNT(szc);
3078 
3079 	/* Check for completely full region */
3080 	if (cnt == range) {
3081 		rw_exit(&page_ctrs_rwlock[mnode]);
3082 		return (pgfree);
3083 	}
3084 
3085 	while (--r > 0) {
3086 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3087 		full = FULL_REGION_CNT(r);
3088 		for (i = 0; i < range; i++, idx++) {
3089 			cnt = PAGE_COUNTERS(mnode, r, idx);
3090 			/*
3091 			 * If cnt here is full, that means we have already
3092 			 * accounted for these pages earlier.
3093 			 */
3094 			if (cnt != full) {
3095 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3096 			}
3097 		}
3098 		range *= full;
3099 	}
3100 	rw_exit(&page_ctrs_rwlock[mnode]);
3101 	return (pgfree);
3102 }
3103 
3104 /*
3105  * Called from page_geti_contig_pages to exclusively lock constituent pages
3106  * starting from 'spp' for page size code 'szc'.
3107  *
3108  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3109  * region needs to be greater than or equal to the threshold.
3110  */
3111 static int
3112 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3113 {
3114 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3115 	pgcnt_t pgfree, i;
3116 	page_t *pp;
3117 
3118 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3119 
3120 
3121 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3122 		goto skipptcpcheck;
3123 	/*
3124 	 * check if there are sufficient free pages available before attempting
3125 	 * to trylock. Count is approximate as page counters can change.
3126 	 */
3127 	pgfree = page_freecnt(mnode, spp, szc);
3128 
3129 	/* attempt to trylock if there are sufficient already free pages */
3130 	if (pgfree < pgcnt/ptcpthreshold) {
3131 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3132 		return (0);
3133 	}
3134 
3135 skipptcpcheck:
3136 
3137 	for (i = 0; i < pgcnt; i++) {
3138 		pp = &spp[i];
3139 		if (!page_trylock(pp, SE_EXCL)) {
3140 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3141 			while (--i != (pgcnt_t)-1) {
3142 				pp = &spp[i];
3143 				ASSERT(PAGE_EXCL(pp));
3144 				page_unlock_nocapture(pp);
3145 			}
3146 			return (0);
3147 		}
3148 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3149 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3150 		    !PP_ISFREE(pp)) {
3151 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3152 			ASSERT(i == 0);
3153 			page_unlock_nocapture(pp);
3154 			return (0);
3155 		}
3156 		if (PP_ISNORELOC(pp)) {
3157 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3158 			while (i != (pgcnt_t)-1) {
3159 				pp = &spp[i];
3160 				ASSERT(PAGE_EXCL(pp));
3161 				page_unlock_nocapture(pp);
3162 				i--;
3163 			}
3164 			return (0);
3165 		}
3166 	}
3167 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3168 	return (1);
3169 }
3170 
3171 /*
3172  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3173  * of 'szc' constituent pages that had been locked exclusively previously.
3174  * Will attempt to relocate constituent pages in use.
3175  */
3176 static page_t *
3177 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3178 {
3179 	spgcnt_t pgcnt, npgs, i;
3180 	page_t *targpp, *rpp, *hpp;
3181 	page_t *replpp = NULL;
3182 	page_t *pplist = NULL;
3183 
3184 	ASSERT(pp != NULL);
3185 
3186 	pgcnt = page_get_pagecnt(szc);
3187 	while (pgcnt) {
3188 		ASSERT(PAGE_EXCL(pp));
3189 		ASSERT(!PP_ISNORELOC(pp));
3190 		if (PP_ISFREE(pp)) {
3191 			/*
3192 			 * If this is a PG_FREE_LIST page then its
3193 			 * size code can change underneath us due to
3194 			 * page promotion or demotion. As an optimzation
3195 			 * use page_list_sub_pages() instead of
3196 			 * page_list_sub().
3197 			 */
3198 			if (PP_ISAGED(pp)) {
3199 				page_list_sub_pages(pp, szc);
3200 				if (pp->p_szc == szc) {
3201 					return (pp);
3202 				}
3203 				ASSERT(pp->p_szc < szc);
3204 				npgs = page_get_pagecnt(pp->p_szc);
3205 				hpp = pp;
3206 				for (i = 0; i < npgs; i++, pp++) {
3207 					pp->p_szc = szc;
3208 				}
3209 				page_list_concat(&pplist, &hpp);
3210 				pgcnt -= npgs;
3211 				continue;
3212 			}
3213 			ASSERT(!PP_ISAGED(pp));
3214 			ASSERT(pp->p_szc == 0);
3215 			page_list_sub(pp, PG_CACHE_LIST);
3216 			page_hashout(pp, NULL);
3217 			PP_SETAGED(pp);
3218 			pp->p_szc = szc;
3219 			page_list_concat(&pplist, &pp);
3220 			pp++;
3221 			pgcnt--;
3222 			continue;
3223 		}
3224 		npgs = page_get_pagecnt(pp->p_szc);
3225 
3226 		/*
3227 		 * page_create_wait freemem accounting done by caller of
3228 		 * page_get_freelist and not necessary to call it prior to
3229 		 * calling page_get_replacement_page.
3230 		 *
3231 		 * page_get_replacement_page can call page_get_contig_pages
3232 		 * to acquire a large page (szc > 0); the replacement must be
3233 		 * smaller than the contig page size to avoid looping or
3234 		 * szc == 0 and PGI_PGCPSZC0 is set.
3235 		 */
3236 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3237 			replpp = page_get_replacement_page(pp, NULL, 0);
3238 			if (replpp) {
3239 				npgs = page_get_pagecnt(pp->p_szc);
3240 				ASSERT(npgs <= pgcnt);
3241 				targpp = pp;
3242 			}
3243 		}
3244 
3245 		/*
3246 		 * If replacement is NULL or do_page_relocate fails, fail
3247 		 * coalescing of pages.
3248 		 */
3249 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3250 		    &npgs, NULL) != 0)) {
3251 			/*
3252 			 * Unlock un-processed target list
3253 			 */
3254 			while (pgcnt--) {
3255 				ASSERT(PAGE_EXCL(pp));
3256 				page_unlock_nocapture(pp);
3257 				pp++;
3258 			}
3259 			/*
3260 			 * Free the processed target list.
3261 			 */
3262 			while (pplist) {
3263 				pp = pplist;
3264 				page_sub(&pplist, pp);
3265 				ASSERT(PAGE_EXCL(pp));
3266 				ASSERT(pp->p_szc == szc);
3267 				ASSERT(PP_ISFREE(pp));
3268 				ASSERT(PP_ISAGED(pp));
3269 				pp->p_szc = 0;
3270 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3271 				page_unlock_nocapture(pp);
3272 			}
3273 
3274 			if (replpp != NULL)
3275 				page_free_replacement_page(replpp);
3276 
3277 			return (NULL);
3278 		}
3279 		ASSERT(pp == targpp);
3280 
3281 		/* LINTED */
3282 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3283 
3284 		pp += npgs;
3285 		pgcnt -= npgs;
3286 
3287 		while (npgs--) {
3288 			ASSERT(PAGE_EXCL(targpp));
3289 			ASSERT(!PP_ISFREE(targpp));
3290 			ASSERT(!PP_ISNORELOC(targpp));
3291 			PP_SETFREE(targpp);
3292 			ASSERT(PP_ISAGED(targpp));
3293 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3294 			    (flags & PGI_PGCPSZC0)));
3295 			targpp->p_szc = szc;
3296 			targpp = targpp->p_next;
3297 
3298 			rpp = replpp;
3299 			ASSERT(rpp != NULL);
3300 			page_sub(&replpp, rpp);
3301 			ASSERT(PAGE_EXCL(rpp));
3302 			ASSERT(!PP_ISFREE(rpp));
3303 			page_unlock_nocapture(rpp);
3304 		}
3305 		ASSERT(targpp == hpp);
3306 		ASSERT(replpp == NULL);
3307 		page_list_concat(&pplist, &targpp);
3308 	}
3309 	CHK_LPG(pplist, szc);
3310 	return (pplist);
3311 }
3312 
3313 /*
3314  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3315  * of 0 means nothing left after trim.
3316  */
3317 int
3318 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3319 {
3320 	pfn_t	kcagepfn;
3321 	int	decr;
3322 	int	rc = 0;
3323 
3324 	if (PP_ISNORELOC(mseg->pages)) {
3325 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3326 
3327 			/* lower part of this mseg inside kernel cage */
3328 			decr = kcage_current_pfn(&kcagepfn);
3329 
3330 			/* kernel cage may have transitioned past mseg */
3331 			if (kcagepfn >= mseg->pages_base &&
3332 			    kcagepfn < mseg->pages_end) {
3333 				ASSERT(decr == 0);
3334 				*lo = MAX(kcagepfn, pfnlo);
3335 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
3336 				rc = 1;
3337 			}
3338 		}
3339 		/* else entire mseg in the cage */
3340 	} else {
3341 		if (PP_ISNORELOC(mseg->epages - 1)) {
3342 
3343 			/* upper part of this mseg inside kernel cage */
3344 			decr = kcage_current_pfn(&kcagepfn);
3345 
3346 			/* kernel cage may have transitioned past mseg */
3347 			if (kcagepfn >= mseg->pages_base &&
3348 			    kcagepfn < mseg->pages_end) {
3349 				ASSERT(decr);
3350 				*hi = MIN(kcagepfn, pfnhi);
3351 				*lo = MAX(pfnlo, mseg->pages_base);
3352 				rc = 1;
3353 			}
3354 		} else {
3355 			/* entire mseg outside of kernel cage */
3356 			*lo = MAX(pfnlo, mseg->pages_base);
3357 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3358 			rc = 1;
3359 		}
3360 	}
3361 	return (rc);
3362 }
3363 
3364 /*
3365  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3366  * page with size code 'szc'. Claiming such a page requires acquiring
3367  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3368  * relocating pages in use and concatenating these constituent pages into a
3369  * large page.
3370  *
3371  * The page lists do not have such a large page and page_freelist_split has
3372  * already failed to demote larger pages and/or coalesce smaller free pages.
3373  *
3374  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3375  * pages with the same color as 'bin'.
3376  *
3377  * 'pfnflag' specifies the subset of the pfn range to search.
3378  */
3379 
3380 static page_t *
3381 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3382     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3383 {
3384 	struct memseg *mseg;
3385 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3386 	pgcnt_t szcpgmask = szcpgcnt - 1;
3387 	pfn_t	randpfn;
3388 	page_t *pp, *randpp, *endpp;
3389 	uint_t colors, ceq_mask;
3390 	/* LINTED : set but not used in function */
3391 	uint_t color_mask;
3392 	pfn_t hi, lo;
3393 	uint_t skip;
3394 	MEM_NODE_ITERATOR_DECL(it);
3395 
3396 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3397 
3398 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3399 
3400 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3401 		return (NULL);
3402 
3403 	ASSERT(szc < mmu_page_sizes);
3404 
3405 	colors = PAGE_GET_PAGECOLORS(szc);
3406 	color_mask = colors - 1;
3407 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3408 		uchar_t ceq = colorequivszc[szc];
3409 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3410 
3411 		ASSERT(ceq_dif > 0);
3412 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3413 	} else {
3414 		ceq_mask = 0;
3415 	}
3416 
3417 	ASSERT(bin < colors);
3418 
3419 	/* clear "non-significant" color bits */
3420 	bin &= ceq_mask;
3421 
3422 	/*
3423 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3424 	 * when there have been previous page_get_contig_page failures to
3425 	 * limit the search.
3426 	 *
3427 	 * The high bit in pfnflag specifies the number of 'slots' in the
3428 	 * pfn range and the remainder of pfnflag specifies which slot.
3429 	 * For example, a value of 1010b would mean the second slot of
3430 	 * the pfn range that has been divided into 8 slots.
3431 	 */
3432 	if (pfnflag > 1) {
3433 		int	slots = 1 << (highbit(pfnflag) - 1);
3434 		int	slotid = pfnflag & (slots - 1);
3435 		pgcnt_t	szcpages;
3436 		int	slotlen;
3437 
3438 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3439 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3440 		slotlen = howmany(szcpages, slots);
3441 		/* skip if 'slotid' slot is empty */
3442 		if (slotid * slotlen >= szcpages)
3443 			return (NULL);
3444 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3445 		ASSERT(pfnlo < pfnhi);
3446 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3447 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3448 	}
3449 
3450 	memsegs_lock(0);
3451 
3452 	/*
3453 	 * loop through memsegs to look for contig page candidates
3454 	 */
3455 
3456 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3457 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3458 			/* no overlap */
3459 			continue;
3460 		}
3461 
3462 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3463 			/* mseg too small */
3464 			continue;
3465 
3466 		/*
3467 		 * trim off kernel cage pages from pfn range and check for
3468 		 * a trimmed pfn range returned that does not span the
3469 		 * desired large page size.
3470 		 */
3471 		if (kcage_on) {
3472 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3473 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3474 				continue;
3475 		} else {
3476 			lo = MAX(pfnlo, mseg->pages_base);
3477 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3478 		}
3479 
3480 		/* round to szcpgcnt boundaries */
3481 		lo = P2ROUNDUP(lo, szcpgcnt);
3482 
3483 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3484 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3485 
3486 		if (hi <= lo)
3487 			continue;
3488 
3489 		/*
3490 		 * set lo to point to the pfn for the desired bin. Large
3491 		 * page sizes may only have a single page color
3492 		 */
3493 		skip = szcpgcnt;
3494 		if (ceq_mask > 0 || interleaved_mnodes) {
3495 			/* set lo to point at appropriate color */
3496 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3497 			    (interleaved_mnodes &&
3498 			    PFN_2_MEM_NODE(lo) != mnode)) {
3499 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3500 				    color_mask, &it);
3501 			}
3502 			if (hi <= lo)
3503 				/* mseg cannot satisfy color request */
3504 				continue;
3505 		}
3506 
3507 		/* randomly choose a point between lo and hi to begin search */
3508 
3509 		randpfn = (pfn_t)GETTICK();
3510 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3511 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3512 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3513 			if (randpfn != (pfn_t)-1) {
3514 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3515 				    ceq_mask, color_mask, &it);
3516 			}
3517 			if (randpfn >= hi) {
3518 				randpfn = lo;
3519 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3520 				    &it);
3521 			}
3522 		}
3523 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3524 
3525 		ASSERT(randpp->p_pagenum == randpfn);
3526 
3527 		pp = randpp;
3528 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3529 
3530 		ASSERT(randpp + szcpgcnt <= endpp);
3531 
3532 		do {
3533 			ASSERT(!(pp->p_pagenum & szcpgmask));
3534 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3535 
3536 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3537 				/* pages unlocked by page_claim on failure */
3538 				if (page_claim_contig_pages(pp, szc, flags)) {
3539 					memsegs_unlock(0);
3540 					return (pp);
3541 				}
3542 			}
3543 
3544 			if (ceq_mask == 0 && !interleaved_mnodes) {
3545 				pp += skip;
3546 			} else {
3547 				pfn_t pfn = pp->p_pagenum;
3548 
3549 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3550 				    ceq_mask, color_mask, &it);
3551 				if (pfn == (pfn_t)-1) {
3552 					pp = endpp;
3553 				} else {
3554 					pp = mseg->pages +
3555 					    (pfn - mseg->pages_base);
3556 				}
3557 			}
3558 			if (pp >= endpp) {
3559 				/* start from the beginning */
3560 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3561 				pp = mseg->pages + (lo - mseg->pages_base);
3562 				ASSERT(pp->p_pagenum == lo);
3563 				ASSERT(pp + szcpgcnt <= endpp);
3564 			}
3565 		} while (pp != randpp);
3566 	}
3567 	memsegs_unlock(0);
3568 	return (NULL);
3569 }
3570 
3571 
3572 /*
3573  * controlling routine that searches through physical memory in an attempt to
3574  * claim a large page based on the input parameters.
3575  * on the page free lists.
3576  *
3577  * calls page_geti_contig_pages with an initial pfn range from the mnode
3578  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3579  * that overlaps with the kernel cage or does not match the requested page
3580  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3581  * page_geti_contig_pages may further limit the search range based on
3582  * previous failure counts (pgcpfailcnt[]).
3583  *
3584  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3585  * pagesize page that satisfies mtype.
3586  */
3587 page_t *
3588 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3589     uint_t flags)
3590 {
3591 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3592 	page_t		*pp;
3593 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3594 
3595 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3596 
3597 	/* no allocations from cage */
3598 	flags |= PGI_NOCAGE;
3599 
3600 	/* LINTED */
3601 	MTYPE_START(mnode, mtype, flags);
3602 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3603 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3604 		return (NULL);
3605 	}
3606 
3607 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3608 
3609 	/* do not limit search and ignore color if hi pri */
3610 
3611 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3612 		pfnflag = pgcpfailcnt[szc];
3613 
3614 	/* remove color match to improve chances */
3615 
3616 	if (flags & PGI_PGCPHIPRI || pfnflag)
3617 		flags &= ~PG_MATCH_COLOR;
3618 
3619 	do {
3620 		/* get pfn range based on mnode and mtype */
3621 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3622 
3623 		ASSERT(pfnhi >= pfnlo);
3624 
3625 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3626 		    pfnlo, pfnhi, pfnflag);
3627 
3628 		if (pp != NULL) {
3629 			pfnflag = pgcpfailcnt[szc];
3630 			if (pfnflag) {
3631 				/* double the search size */
3632 				pgcpfailcnt[szc] = pfnflag >> 1;
3633 			}
3634 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3635 			return (pp);
3636 		}
3637 		MTYPE_NEXT(mnode, mtype, flags);
3638 	} while (mtype >= 0);
3639 
3640 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3641 	return (NULL);
3642 }
3643 
3644 #if defined(__i386) || defined(__amd64)
3645 /*
3646  * Determine the likelihood of finding/coalescing a szc page.
3647  * Return 0 if the likelihood is small otherwise return 1.
3648  *
3649  * For now, be conservative and check only 1g pages and return 0
3650  * if there had been previous coalescing failures and the szc pages
3651  * needed to satisfy request would exhaust most of freemem.
3652  */
3653 int
3654 page_chk_freelist(uint_t szc)
3655 {
3656 	pgcnt_t		pgcnt;
3657 
3658 	if (szc <= 1)
3659 		return (1);
3660 
3661 	pgcnt = page_get_pagecnt(szc);
3662 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3663 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3664 		return (0);
3665 	}
3666 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3667 	return (1);
3668 }
3669 #endif
3670 
3671 /*
3672  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3673  *
3674  * Does its own locking and accounting.
3675  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3676  * pages of the proper color even if there are pages of a different color.
3677  *
3678  * Finds a page, removes it, THEN locks it.
3679  */
3680 
3681 /*ARGSUSED*/
3682 page_t *
3683 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3684 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3685 {
3686 	struct as	*as = seg->s_as;
3687 	page_t		*pp = NULL;
3688 	ulong_t		bin;
3689 	uchar_t		szc;
3690 	int		mnode;
3691 	int		mtype;
3692 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3693 	lgrp_mnode_cookie_t	lgrp_cookie;
3694 
3695 	page_get_func = page_get_mnode_freelist;
3696 
3697 	/*
3698 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3699 	 * assume we wish to allocate near to the current thread's home.
3700 	 */
3701 	if (!LGRP_EXISTS(lgrp))
3702 		lgrp = lgrp_home_lgrp();
3703 
3704 	if (kcage_on) {
3705 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3706 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3707 		    curthread != kcage_cageout_thread) {
3708 			/*
3709 			 * Set a "reserve" of kcage_throttlefree pages for
3710 			 * PG_PANIC and cageout thread allocations.
3711 			 *
3712 			 * Everybody else has to serialize in
3713 			 * page_create_get_something() to get a cage page, so
3714 			 * that we don't deadlock cageout!
3715 			 */
3716 			return (NULL);
3717 		}
3718 	} else {
3719 		flags &= ~PG_NORELOC;
3720 		flags |= PGI_NOCAGE;
3721 	}
3722 
3723 	/* LINTED */
3724 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3725 
3726 	/*
3727 	 * Convert size to page size code.
3728 	 */
3729 	if ((szc = page_szc(size)) == (uchar_t)-1)
3730 		panic("page_get_freelist: illegal page size request");
3731 	ASSERT(szc < mmu_page_sizes);
3732 
3733 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3734 
3735 	/* LINTED */
3736 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3737 
3738 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3739 
3740 	/*
3741 	 * Try to get a local page first, but try remote if we can't
3742 	 * get a page of the right color.
3743 	 */
3744 pgretry:
3745 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3746 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3747 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3748 		if (pp != NULL) {
3749 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3750 			DTRACE_PROBE4(page__get,
3751 			    lgrp_t *, lgrp,
3752 			    int, mnode,
3753 			    ulong_t, bin,
3754 			    uint_t, flags);
3755 			return (pp);
3756 		}
3757 	}
3758 	ASSERT(pp == NULL);
3759 
3760 	/*
3761 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3762 	 * remote free lists.  Caller expected to call page_get_cachelist which
3763 	 * will check local cache lists and remote free lists.
3764 	 */
3765 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3766 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3767 		return (NULL);
3768 	}
3769 
3770 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3771 
3772 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3773 
3774 	if (!(flags & PG_LOCAL)) {
3775 		/*
3776 		 * Try to get a non-local freelist page.
3777 		 */
3778 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3779 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3780 			pp = page_get_func(mnode, bin, mtype, szc, flags);
3781 			if (pp != NULL) {
3782 				DTRACE_PROBE4(page__get,
3783 				    lgrp_t *, lgrp,
3784 				    int, mnode,
3785 				    ulong_t, bin,
3786 				    uint_t, flags);
3787 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3788 				return (pp);
3789 			}
3790 		}
3791 		ASSERT(pp == NULL);
3792 	}
3793 
3794 	/*
3795 	 * when the cage is off chances are page_get_contig_pages() will fail
3796 	 * to lock a large page chunk therefore when the cage is off it's not
3797 	 * called by default.  this can be changed via /etc/system.
3798 	 *
3799 	 * page_get_contig_pages() also called to acquire a base pagesize page
3800 	 * for page_create_get_something().
3801 	 */
3802 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3803 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3804 	    (page_get_func != page_get_contig_pages)) {
3805 
3806 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3807 		page_get_func = page_get_contig_pages;
3808 		goto pgretry;
3809 	}
3810 
3811 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3812 	    page_get_func == page_get_contig_pages)
3813 		SETPGCPFAILCNT(szc);
3814 
3815 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3816 	return (NULL);
3817 }
3818 
3819 /*
3820  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3821  *
3822  * Does its own locking.
3823  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3824  * pages of the proper color even if there are pages of a different color.
3825  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3826  * try to lock one of them.  If no page can be locked, try the
3827  * next bin.  Return NULL if a page can not be found and locked.
3828  *
3829  * Finds a pages, trys to lock it, then removes it.
3830  */
3831 
3832 /*ARGSUSED*/
3833 page_t *
3834 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3835     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3836 {
3837 	page_t		*pp;
3838 	struct as	*as = seg->s_as;
3839 	ulong_t		bin;
3840 	/*LINTED*/
3841 	int		mnode;
3842 	int		mtype;
3843 	lgrp_mnode_cookie_t	lgrp_cookie;
3844 
3845 	/*
3846 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
3847 	 * assume we wish to allocate near to the current thread's home.
3848 	 */
3849 	if (!LGRP_EXISTS(lgrp))
3850 		lgrp = lgrp_home_lgrp();
3851 
3852 	if (!kcage_on) {
3853 		flags &= ~PG_NORELOC;
3854 		flags |= PGI_NOCAGE;
3855 	}
3856 
3857 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3858 	    kcage_freemem <= kcage_throttlefree) {
3859 		/*
3860 		 * Reserve kcage_throttlefree pages for critical kernel
3861 		 * threads.
3862 		 *
3863 		 * Everybody else has to go to page_create_get_something()
3864 		 * to get a cage page, so we don't deadlock cageout.
3865 		 */
3866 		return (NULL);
3867 	}
3868 
3869 	/* LINTED */
3870 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3871 
3872 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3873 
3874 	/* LINTED */
3875 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3876 
3877 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3878 
3879 	/*
3880 	 * Try local cachelists first
3881 	 */
3882 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3883 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3884 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3885 		if (pp != NULL) {
3886 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3887 			DTRACE_PROBE4(page__get,
3888 			    lgrp_t *, lgrp,
3889 			    int, mnode,
3890 			    ulong_t, bin,
3891 			    uint_t, flags);
3892 			return (pp);
3893 		}
3894 	}
3895 
3896 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3897 
3898 	/*
3899 	 * Try freelists/cachelists that are farther away
3900 	 * This is our only chance to allocate remote pages for PAGESIZE
3901 	 * requests.
3902 	 */
3903 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3904 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3905 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3906 		    0, flags);
3907 		if (pp != NULL) {
3908 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3909 			DTRACE_PROBE4(page__get,
3910 			    lgrp_t *, lgrp,
3911 			    int, mnode,
3912 			    ulong_t, bin,
3913 			    uint_t, flags);
3914 			return (pp);
3915 		}
3916 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3917 		if (pp != NULL) {
3918 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3919 			DTRACE_PROBE4(page__get,
3920 			    lgrp_t *, lgrp,
3921 			    int, mnode,
3922 			    ulong_t, bin,
3923 			    uint_t, flags);
3924 			return (pp);
3925 		}
3926 	}
3927 
3928 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3929 	return (NULL);
3930 }
3931 
3932 page_t *
3933 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3934 {
3935 	kmutex_t		*pcm;
3936 	page_t			*pp, *first_pp;
3937 	uint_t			sbin;
3938 	int			plw_initialized;
3939 	page_list_walker_t	plw;
3940 
3941 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3942 
3943 	/* LINTED */
3944 	MTYPE_START(mnode, mtype, flags);
3945 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3946 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3947 		return (NULL);
3948 	}
3949 
3950 try_again:
3951 
3952 	plw_initialized = 0;
3953 	plw.plw_ceq_dif = 1;
3954 
3955 	/*
3956 	 * Only hold one cachelist lock at a time, that way we
3957 	 * can start anywhere and not have to worry about lock
3958 	 * ordering.
3959 	 */
3960 
3961 	for (plw.plw_count = 0;
3962 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3963 		sbin = bin;
3964 		do {
3965 
3966 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
3967 				goto bin_empty_1;
3968 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3969 			mutex_enter(pcm);
3970 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3971 			if (pp == NULL)
3972 				goto bin_empty_0;
3973 
3974 			first_pp = pp;
3975 			ASSERT(pp->p_vnode);
3976 			ASSERT(PP_ISAGED(pp) == 0);
3977 			ASSERT(pp->p_szc == 0);
3978 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3979 			while (!page_trylock(pp, SE_EXCL)) {
3980 				pp = pp->p_next;
3981 				ASSERT(pp->p_szc == 0);
3982 				if (pp == first_pp) {
3983 					/*
3984 					 * We have searched the complete list!
3985 					 * And all of them (might only be one)
3986 					 * are locked. This can happen since
3987 					 * these pages can also be found via
3988 					 * the hash list. When found via the
3989 					 * hash list, they are locked first,
3990 					 * then removed. We give up to let the
3991 					 * other thread run.
3992 					 */
3993 					pp = NULL;
3994 					break;
3995 				}
3996 				ASSERT(pp->p_vnode);
3997 				ASSERT(PP_ISFREE(pp));
3998 				ASSERT(PP_ISAGED(pp) == 0);
3999 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4000 				    mnode);
4001 			}
4002 
4003 			if (pp) {
4004 				page_t	**ppp;
4005 				/*
4006 				 * Found and locked a page.
4007 				 * Pull it off the list.
4008 				 */
4009 				ASSERT(mtype == PP_2_MTYPE(pp));
4010 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4011 				page_sub(ppp, pp);
4012 				/*
4013 				 * Subtract counters before releasing pcm mutex
4014 				 * to avoid a race with page_freelist_coalesce
4015 				 * and page_freelist_split.
4016 				 */
4017 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4018 				mutex_exit(pcm);
4019 				ASSERT(pp->p_vnode);
4020 				ASSERT(PP_ISAGED(pp) == 0);
4021 #if defined(__sparc)
4022 				ASSERT(!kcage_on ||
4023 				    (flags & PG_NORELOC) == 0 ||
4024 				    PP_ISNORELOC(pp));
4025 				if (PP_ISNORELOC(pp)) {
4026 					kcage_freemem_sub(1);
4027 				}
4028 #endif
4029 				VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
4030 				return (pp);
4031 			}
4032 bin_empty_0:
4033 			mutex_exit(pcm);
4034 bin_empty_1:
4035 			if (plw_initialized == 0) {
4036 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
4037 				plw_initialized = 1;
4038 			}
4039 			/* calculate the next bin with equivalent color */
4040 			bin = ADD_MASKED(bin, plw.plw_bin_step,
4041 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
4042 		} while (sbin != bin);
4043 
4044 		if (plw.plw_ceq_dif > 1)
4045 			bin = page_list_walk_next_bin(0, bin, &plw);
4046 	}
4047 
4048 	MTYPE_NEXT(mnode, mtype, flags);
4049 	if (mtype >= 0)
4050 		goto try_again;
4051 
4052 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4053 	return (NULL);
4054 }
4055 
/*
 * Replacement page statistics are compiled in for DEBUG kernels only.
 */
#if defined(DEBUG)
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#if defined(REPL_PAGE_STATS)
/*
 * Event counters bumped through REPL_STAT_INCR(); each field is
 * incremented by one with atomic_add_32().
 */
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
/* statistics disabled: the increment expands to nothing */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int	pgrppgcp;
4077 
4078 /*
4079  * The freemem accounting must be done by the caller.
4080  * First we try to get a replacement page of the same size as like_pp,
4081  * if that is not possible, then we just get a set of discontiguous
4082  * PAGESIZE pages.
4083  */
4084 page_t *
4085 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4086     uint_t pgrflags)
4087 {
4088 	page_t		*like_pp;
4089 	page_t		*pp, *pplist;
4090 	page_t		*pl = NULL;
4091 	ulong_t		bin;
4092 	int		mnode, page_mnode;
4093 	int		szc;
4094 	spgcnt_t	npgs, pg_cnt;
4095 	pfn_t		pfnum;
4096 	int		mtype;
4097 	int		flags = 0;
4098 	lgrp_mnode_cookie_t	lgrp_cookie;
4099 	lgrp_t		*lgrp;
4100 
4101 	REPL_STAT_INCR(ngets);
4102 	like_pp = orig_like_pp;
4103 	ASSERT(PAGE_EXCL(like_pp));
4104 
4105 	szc = like_pp->p_szc;
4106 	npgs = page_get_pagecnt(szc);
4107 	/*
4108 	 * Now we reset like_pp to the base page_t.
4109 	 * That way, we won't walk past the end of this 'szc' page.
4110 	 */
4111 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4112 	like_pp = page_numtopp_nolock(pfnum);
4113 	ASSERT(like_pp->p_szc == szc);
4114 
4115 	if (PP_ISNORELOC(like_pp)) {
4116 		ASSERT(kcage_on);
4117 		REPL_STAT_INCR(ngets_noreloc);
4118 		flags = PGI_RELOCONLY;
4119 	} else if (pgrflags & PGR_NORELOC) {
4120 		ASSERT(kcage_on);
4121 		REPL_STAT_INCR(npgr_noreloc);
4122 		flags = PG_NORELOC;
4123 	}
4124 
4125 	/*
4126 	 * Kernel pages must always be replaced with the same size
4127 	 * pages, since we cannot properly handle demotion of kernel
4128 	 * pages.
4129 	 */
4130 	if (PP_ISKAS(like_pp))
4131 		pgrflags |= PGR_SAMESZC;
4132 
4133 	/* LINTED */
4134 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4135 
4136 	while (npgs) {
4137 		pplist = NULL;
4138 		for (;;) {
4139 			pg_cnt = page_get_pagecnt(szc);
4140 			bin = PP_2_BIN(like_pp);
4141 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4142 			ASSERT(pg_cnt <= npgs);
4143 
4144 			/*
4145 			 * If an lgroup was specified, try to get the
4146 			 * page from that lgroup.
4147 			 * NOTE: Must be careful with code below because
4148 			 *	 lgroup may disappear and reappear since there
4149 			 *	 is no locking for lgroup here.
4150 			 */
4151 			if (LGRP_EXISTS(lgrp_target)) {
4152 				/*
4153 				 * Keep local variable for lgroup separate
4154 				 * from lgroup argument since this code should
4155 				 * only be exercised when lgroup argument
4156 				 * exists....
4157 				 */
4158 				lgrp = lgrp_target;
4159 
4160 				/* Try the lgroup's freelists first */
4161 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4162 				    LGRP_SRCH_LOCAL);
4163 				while ((pplist == NULL) &&
4164 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4165 				    != -1) {
4166 					pplist =
4167 					    page_get_mnode_freelist(mnode, bin,
4168 					    mtype, szc, flags);
4169 				}
4170 
4171 				/*
4172 				 * Now try it's cachelists if this is a
4173 				 * small page. Don't need to do it for
4174 				 * larger ones since page_freelist_coalesce()
4175 				 * already failed.
4176 				 */
4177 				if (pplist != NULL || szc != 0)
4178 					break;
4179 
4180 				/* Now try it's cachelists */
4181 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4182 				    LGRP_SRCH_LOCAL);
4183 
4184 				while ((pplist == NULL) &&
4185 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4186 				    != -1) {
4187 					pplist =
4188 					    page_get_mnode_cachelist(bin, flags,
4189 					    mnode, mtype);
4190 				}
4191 				if (pplist != NULL) {
4192 					page_hashout(pplist, NULL);
4193 					PP_SETAGED(pplist);
4194 					REPL_STAT_INCR(nhashout);
4195 					break;
4196 				}
4197 				/* Done looking in this lgroup. Bail out. */
4198 				break;
4199 			}
4200 
4201 			/*
4202 			 * No lgroup was specified (or lgroup was removed by
4203 			 * DR, so just try to get the page as close to
4204 			 * like_pp's mnode as possible.
4205 			 * First try the local freelist...
4206 			 */
4207 			mnode = PP_2_MEM_NODE(like_pp);
4208 			pplist = page_get_mnode_freelist(mnode, bin,
4209 			    mtype, szc, flags);
4210 			if (pplist != NULL)
4211 				break;
4212 
4213 			REPL_STAT_INCR(nnofree);
4214 
4215 			/*
4216 			 * ...then the local cachelist. Don't need to do it for
4217 			 * larger pages cause page_freelist_coalesce() already
4218 			 * failed there anyway.
4219 			 */
4220 			if (szc == 0) {
4221 				pplist = page_get_mnode_cachelist(bin, flags,
4222 				    mnode, mtype);
4223 				if (pplist != NULL) {
4224 					page_hashout(pplist, NULL);
4225 					PP_SETAGED(pplist);
4226 					REPL_STAT_INCR(nhashout);
4227 					break;
4228 				}
4229 			}
4230 
4231 			/* Now try remote freelists */
4232 			page_mnode = mnode;
4233 			lgrp =
4234 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4235 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4236 			    LGRP_SRCH_HIER);
4237 			while (pplist == NULL &&
4238 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4239 			    != -1) {
4240 				/*
4241 				 * Skip local mnode.
4242 				 */
4243 				if ((mnode == page_mnode) ||
4244 				    (mem_node_config[mnode].exists == 0))
4245 					continue;
4246 
4247 				pplist = page_get_mnode_freelist(mnode,
4248 				    bin, mtype, szc, flags);
4249 			}
4250 
4251 			if (pplist != NULL)
4252 				break;
4253 
4254 
4255 			/* Now try remote cachelists */
4256 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4257 			    LGRP_SRCH_HIER);
4258 			while (pplist == NULL && szc == 0) {
4259 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4260 				if (mnode == -1)
4261 					break;
4262 				/*
4263 				 * Skip local mnode.
4264 				 */
4265 				if ((mnode == page_mnode) ||
4266 				    (mem_node_config[mnode].exists == 0))
4267 					continue;
4268 
4269 				pplist = page_get_mnode_cachelist(bin,
4270 				    flags, mnode, mtype);
4271 
4272 				if (pplist != NULL) {
4273 					page_hashout(pplist, NULL);
4274 					PP_SETAGED(pplist);
4275 					REPL_STAT_INCR(nhashout);
4276 					break;
4277 				}
4278 			}
4279 
4280 			/*
4281 			 * Break out of while loop under the following cases:
4282 			 * - If we successfully got a page.
4283 			 * - If pgrflags specified only returning a specific
4284 			 *   page size and we could not find that page size.
4285 			 * - If we could not satisfy the request with PAGESIZE
4286 			 *   or larger pages.
4287 			 */
4288 			if (pplist != NULL || szc == 0)
4289 				break;
4290 
4291 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4292 				/* try to find contig page */
4293 
4294 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4295 				    LGRP_SRCH_HIER);
4296 
4297 				while ((pplist == NULL) &&
4298 				    (mnode =
4299 				    lgrp_memnode_choose(&lgrp_cookie))
4300 				    != -1) {
4301 					pplist = page_get_contig_pages(
4302 					    mnode, bin, mtype, szc,
4303 					    flags | PGI_PGCPHIPRI);
4304 				}
4305 				break;
4306 			}
4307 
4308 			/*
4309 			 * The correct thing to do here is try the next
4310 			 * page size down using szc--. Due to a bug
4311 			 * with the processing of HAT_RELOAD_SHARE
4312 			 * where the sfmmu_ttecnt arrays of all
4313 			 * hats sharing an ISM segment don't get updated,
4314 			 * using intermediate size pages for relocation
4315 			 * can lead to continuous page faults.
4316 			 */
4317 			szc = 0;
4318 		}
4319 
4320 		if (pplist != NULL) {
4321 			DTRACE_PROBE4(page__get,
4322 			    lgrp_t *, lgrp,
4323 			    int, mnode,
4324 			    ulong_t, bin,
4325 			    uint_t, flags);
4326 
4327 			while (pplist != NULL && pg_cnt--) {
4328 				ASSERT(pplist != NULL);
4329 				pp = pplist;
4330 				page_sub(&pplist, pp);
4331 				PP_CLRFREE(pp);
4332 				PP_CLRAGED(pp);
4333 				page_list_concat(&pl, &pp);
4334 				npgs--;
4335 				like_pp = like_pp + 1;
4336 				REPL_STAT_INCR(nnext_pp);
4337 			}
4338 			ASSERT(pg_cnt == 0);
4339 		} else {
4340 			break;
4341 		}
4342 	}
4343 
4344 	if (npgs) {
4345 		/*
4346 		 * We were unable to allocate the necessary number
4347 		 * of pages.
4348 		 * We need to free up any pl.
4349 		 */
4350 		REPL_STAT_INCR(nnopage);
4351 		page_free_replacement_page(pl);
4352 		return (NULL);
4353 	} else {
4354 		return (pl);
4355 	}
4356 }
4357 
4358 /*
4359  * demote a free large page to it's constituent pages
4360  */
4361 void
4362 page_demote_free_pages(page_t *pp)
4363 {
4364 
4365 	int mnode;
4366 
4367 	ASSERT(pp != NULL);
4368 	ASSERT(PAGE_LOCKED(pp));
4369 	ASSERT(PP_ISFREE(pp));
4370 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4371 
4372 	mnode = PP_2_MEM_NODE(pp);
4373 	page_freelist_lock(mnode);
4374 	if (pp->p_szc != 0) {
4375 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4376 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4377 	}
4378 	page_freelist_unlock(mnode);
4379 	ASSERT(pp->p_szc == 0);
4380 }
4381 
4382 /*
4383  * Factor in colorequiv to check additional 'equivalent' bins.
4384  * colorequiv may be set in /etc/system
4385  */
4386 void
4387 page_set_colorequiv_arr(void)
4388 {
4389 	if (colorequiv > 1) {
4390 		int i;
4391 		uint_t sv_a = lowbit(colorequiv) - 1;
4392 
4393 		if (sv_a > 15)
4394 			sv_a = 15;
4395 
4396 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4397 			uint_t colors;
4398 			uint_t a = sv_a;
4399 
4400 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4401 				continue;
4402 			}
4403 			while ((colors >> a) == 0)
4404 				a--;
4405 			if ((a << 4) > colorequivszc[i]) {
4406 				colorequivszc[i] = (a << 4);
4407 			}
4408 		}
4409 	}
4410 }
4411