xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 051d39bbeea3e1b0fd8395dc97be34acb3241891)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * This file contains common functions to access and manage the page lists.
38  * Many of these routines originated from platform dependent modules
39  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
40  * a platform independent manner.
41  *
42  * vm/vm_dep.h provides for platform specific support.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/atomic.h>
50 #include <sys/sysmacros.h>
51 #include <vm/as.h>
52 #include <vm/page.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/seg_vn.h>
55 #include <sys/memnode.h>
56 #include <vm/vm_dep.h>
57 #include <sys/lgrp.h>
58 #include <sys/mem_config.h>
59 #include <sys/callb.h>
60 #include <sys/mem_cage.h>
61 #include <sys/sdt.h>
62 
/* vac_colors is only declared here (extern); its definition is elsewhere. */
extern uint_t	vac_colors;

/* Upper bound applied below to the alignment requested via #pragma align. */
#define	MAX_PRAGMA_ALIGN	128

/*
 * vm_cpu_data0 for the boot cpu before kmem is initialized.
 * cpu_vm_data_init() points CPU0's cpu_vm_data at this buffer.  The buffer
 * is aligned to L2CACHE_ALIGN_MAX, or to MAX_PRAGMA_ALIGN when
 * L2CACHE_ALIGN_MAX is larger than that.
 */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
75 
/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of color to ignore (it's
 * only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is indexed by page size code and is bounded by PGCPFAILMAX
 * (>= 1/2 of installed memory). When reached, pgcpfailcnt[] is reset to 1/2
 * of this upper bound. This upper bound range guarantees:
 *    - all large page 'slots' will be searched over time
 *    - the minimum (1) large page candidates considered on each pgcp call
 *    - count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;
117 
/*
 * PGCPFAILMAX: upper bound for the pgcpfailcnt[] counters, derived from
 * highbit(physinstalled) (assumes highbit() returns the 1-based index of
 * the highest set bit, which makes the bound at least half of
 * physinstalled -- confirm against highbit()).  The constant is widened to
 * pgcnt_t before shifting so the shift is performed at the width of the
 * counters; shifting a plain int by 31 or more bits is undefined.
 *
 * SETPGCPFAILCNT(szc): increment the failure count for page size code szc
 * and drop it back to PGCPFAILMAX / 2 once the bound is reached.  The
 * expansion is an unbraced if statement that already ends in a semicolon;
 * that shape is kept unchanged for existing callers.
 */
#define	PGCPFAILMAX		((pgcnt_t)1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
122 
#ifdef VM_STATS
/* statistics structure, only present when VM_STATS is defined */
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

/*
 * LPGCREATE supplies the initial value of pg_lpgcreate_nocage below:
 * 0 on sparc, 1 on every other platform.
 */
#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

/*
 * NOTE(review): pg_contig_disable presumably turns off
 * page_get_contig_pages when non-zero; its consumers are not in the code
 * reviewed alongside this declaration -- confirm.
 */
int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)
155 
/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 *
 * pcc_color_free points at an array holding one pgcnt_t per page color of
 * the region size the entry belongs to (see page_ctrs_alloc()).
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g),
 * i.e. page_ctrs_cands[i][r][m][g].  The counts are adjusted by
 * page_ctr_add_internal() and page_ctr_sub_internal() whenever a region
 * counter reaches or leaves FULL_REGION_CNT.
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
178 
/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r).
 *
 * The total is the sum of pcc_pages_free over all NPC_MUTEX slots of
 * page_ctrs_cands.  val must be a modifiable lvalue.  The loop index has a
 * macro-private name so that an argument mentioning a caller variable
 * named i is not captured by the expansion.
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int pcc_gv_i;							\
	(val) = 0;							\
	for (pcc_gv_i = 0; pcc_gv_i < NPC_MUTEX; pcc_gv_i++) {		\
	    (val) +=							\
		page_ctrs_cands[pcc_gv_i][(r)][(m)][(g)].pcc_pages_free; \
	}								\
}
190 
/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c).
 *
 * The total is the sum of pcc_color_free[c] over all NPC_MUTEX slots of
 * page_ctrs_cands.  c is asserted to be a valid color for region size r.
 * val must be a modifiable lvalue.  The loop index has a macro-private
 * name so that an argument mentioning a caller variable named i is not
 * captured by the expansion.
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int pcc_gvc_i;							\
	(val) = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (pcc_gvc_i = 0; pcc_gvc_i < NPC_MUTEX; pcc_gvc_i++) {	\
	    (val) +=							\
		page_ctrs_cands[pcc_gvc_i][(r)][(m)][(g)].		\
		pcc_color_free[(c)];					\
	}								\
}
204 
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 *
 * Each ctr_mutex[i] points at an array of per-mnode mutexes set up by
 * page_ctrs_alloc(), so a lock is addressed as ctr_mutex[lckidx][mnode].
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

/*
 * Lock index for page pp: its page frame number shifted down by the shift
 * of the largest page size, masked with NPC_MUTEX - 1 (the mask assumes
 * NPC_MUTEX is a power of two -- TODO confirm).
 */
#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

/*
 * NOTE(review): sentinel color/mask values, presumably meaning "none";
 * confirm against their users.
 */
#define	INVALID_COLOR 0xffffffff
#define	INVALID_MASK  0xffffffff
220 
/*
 * Local functions prototypes.
 *
 * The page_ctr_add/page_ctr_sub pairs maintain the region counters; the
 * _internal variants leave taking the ctr_mutex lock to the caller.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);
237 
238 /*
239  * The page_counters array below is used to keep track of free contiguous
240  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
241  * This contains an array of counters, the size of the array, a shift value
242  * used to convert a pagenum into a counter array index or vice versa, as
243  * well as a cache of the last successful index to be promoted to a larger
244  * page size.  As an optimization, we keep track of the last successful index
245  * to be promoted per page color for the given size region, and this is
246  * allocated dynamically based upon the number of colors for a given
247  * region size.
248  *
249  * Conceptually, the page counters are represented as:
250  *
251  *	page_counters[region_size][mnode]
252  *
253  *	region_size:	size code of a candidate larger page made up
254  *			of contiguous free smaller pages.
255  *
256  *	page_counters[region_size][mnode].hpm_counters[index]:
257  *		represents how many (region_size - 1) pages either
258  *		exist or can be created within the given index range.
259  *
260  * Let's look at a sparc example:
261  *	If we want to create a free 512k page, we look at region_size 2
262  *	for the mnode we want.  We calculate the index and look at a specific
263  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
264  *	this location, it means that 8 64k pages either exist or can be created
265  *	from 8K pages in order to make a single free 512k page at the given
266  *	index.  Note that when a region is full, it will contribute to the
267  *	counts in the region above it.  Thus we will not know what page
268  *	size the free pages will be which can be promoted to this new free
269  *	page unless we look at all regions below the current region.
270  */
271 
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 *
 * hpm_color_current holds one pointer per mrange; each points at an array
 * indexed by page color (hpm_color_current[mrange][color]).
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;
291 
/*
 * One array of hw_page_map_t per region size code, each indexed by mnode
 * (page_counters[rg_szc][mnode]).
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call in x86.
 *
 * mnode_maxmrange caches MNODE_MAX_MRANGE(mnode) in the same way.  Both
 * arrays are indexed by mnode and are filled in by page_ctrs_sz() and
 * page_ctrs_adjust().
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];
303 
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */

/* counter at index idx for region size rg_szc on mnode */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

/* the counter array itself */
#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

/* shift used to convert between page number and counter index */
#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

/* number of entries in the counter array */
#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

/* page frame number that maps to counter index 0 */
#define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

/* per-color "current index" array for mrange g */
#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

/* one element of the per-color "current index" array */
#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

/*
 * Convert a page number to a counter index: subtract the base and shift
 * right by the region's shift.  pnum is expected to be >= the base.
 */
#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

/*
 * Convert a counter index back to the first page number it covers:
 * shift left by the region's shift and add the base.
 */
#define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
338 
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 *
 * There is one lock per mnode; all MAX_MEM_NODES locks are initialized at
 * the end of page_ctrs_alloc().
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
349 
350 
351 /*
352  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
353  */
354 void
355 cpu_vm_data_init(struct cpu *cp)
356 {
357 	if (cp == CPU0) {
358 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
359 	} else {
360 		void	*kmptr;
361 		int	align;
362 		size_t	sz;
363 
364 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
365 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
366 		kmptr = kmem_zalloc(sz, KM_SLEEP);
367 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
368 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
369 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
370 	}
371 }
372 
373 /*
374  * free cpu_vm_data
375  */
376 void
377 cpu_vm_data_destroy(struct cpu *cp)
378 {
379 	if (cp->cpu_seqid && cp->cpu_vm_data) {
380 		ASSERT(cp != CPU0);
381 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
382 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
383 	}
384 	cp->cpu_vm_data = NULL;
385 }
386 
387 
388 /*
389  * page size to page size code
390  */
391 int
392 page_szc(size_t pagesize)
393 {
394 	int	i = 0;
395 
396 	while (hw_page_array[i].hp_size) {
397 		if (pagesize == hw_page_array[i].hp_size)
398 			return (i);
399 		i++;
400 	}
401 	return (-1);
402 }
403 
/*
 * Like page_szc(), but only accepts page sizes that are supported user
 * page sizes.  Returns the size code, or -1 when the size is unknown or
 * SZC_2_USERSZC() reports -1 for it.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int	szc = page_szc(pagesize);

	if (szc == -1 || SZC_2_USERSZC(szc) == -1)
		return (-1);

	return (szc);
}
417 
418 /*
419  * Return how many page sizes are available for the user to use.  This is
420  * what the hardware supports and not based upon how the OS implements the
421  * support of different page sizes.
422  */
423 uint_t
424 page_num_user_pagesizes(void)
425 {
426 	return (mmu_exported_page_sizes);
427 }
428 
429 uint_t
430 page_num_pagesizes(void)
431 {
432 	return (mmu_page_sizes);
433 }
434 
435 /*
436  * returns the count of the number of base pagesize pages associated with szc
437  */
438 pgcnt_t
439 page_get_pagecnt(uint_t szc)
440 {
441 	if (szc >= mmu_page_sizes)
442 		panic("page_get_pagecnt: out of range %d", szc);
443 	return (hw_page_array[szc].hp_pgcnt);
444 }
445 
446 size_t
447 page_get_pagesize(uint_t szc)
448 {
449 	if (szc >= mmu_page_sizes)
450 		panic("page_get_pagesize: out of range %d", szc);
451 	return (hw_page_array[szc].hp_size);
452 }
453 
454 /*
455  * Return the size of a page based upon the index passed in.  An index of
456  * zero refers to the smallest page size in the system, and as index increases
457  * it refers to the next larger supported page size in the system.
458  * Note that szc and userszc may not be the same due to unsupported szc's on
459  * some systems.
460  */
461 size_t
462 page_get_user_pagesize(uint_t userszc)
463 {
464 	uint_t szc = USERSZC_2_SZC(userszc);
465 
466 	if (szc >= mmu_page_sizes)
467 		panic("page_get_user_pagesize: out of range %d", szc);
468 	return (hw_page_array[szc].hp_size);
469 }
470 
471 uint_t
472 page_get_shift(uint_t szc)
473 {
474 	if (szc >= mmu_page_sizes)
475 		panic("page_get_shift: out of range %d", szc);
476 	return (PAGE_GET_SHIFT(szc));
477 }
478 
479 uint_t
480 page_get_pagecolors(uint_t szc)
481 {
482 	if (szc >= mmu_page_sizes)
483 		panic("page_get_pagecolors: out of range %d", szc);
484 	return (PAGE_GET_PAGECOLORS(szc));
485 }
486 
487 /*
488  * this assigns the desired equivalent color after a split
489  */
490 uint_t
491 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
492     uint_t ncolor, uint_t ceq_mask)
493 {
494 	ASSERT(nszc > szc);
495 	ASSERT(szc < mmu_page_sizes);
496 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
497 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
498 
499 	color &= ceq_mask;
500 	ncolor <<= PAGE_GET_COLOR_SHIFT(szc, nszc);
501 	return (color | (ncolor & ~ceq_mask));
502 }
503 
504 /*
505  * Called by startup().
506  * Size up the per page size free list counters based on physmax
507  * of each node and max_mem_nodes.
508  */
509 size_t
510 page_ctrs_sz(void)
511 {
512 	int	r;		/* region size */
513 	int	mnode;
514 	int	nranges;
515 	uint_t	ctrs_sz = 0;
516 	int 	i;
517 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
518 
519 	/*
520 	 * We need to determine how many page colors there are for each
521 	 * page size in order to allocate memory for any color specific
522 	 * arrays.
523 	 */
524 	for (i = 0; i < mmu_page_sizes; i++) {
525 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
526 	}
527 
528 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
529 
530 		pgcnt_t r_pgcnt;
531 		pfn_t   r_base;
532 		pgcnt_t r_align;
533 
534 		if (mem_node_config[mnode].exists == 0)
535 			continue;
536 
537 		nranges = MNODE_RANGE_CNT(mnode);
538 		mnode_nranges[mnode] = nranges;
539 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
540 
541 		/*
542 		 * determine size needed for page counter arrays with
543 		 * base aligned to large page size.
544 		 */
545 		for (r = 1; r < mmu_page_sizes; r++) {
546 			/* add in space for hpm_counters */
547 			r_align = page_get_pagecnt(r);
548 			r_base = mem_node_config[mnode].physbase;
549 			r_base &= ~(r_align - 1);
550 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
551 			    r_base + 1, r_align);
552 			/*
553 			 * Round up to always allocate on pointer sized
554 			 * boundaries.
555 			 */
556 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
557 			    sizeof (hpmctr_t *));
558 
559 			/* add in space for hpm_color_current */
560 			ctrs_sz += sizeof (size_t) *
561 			    colors_per_szc[r] * nranges;
562 		}
563 	}
564 
565 	for (r = 1; r < mmu_page_sizes; r++) {
566 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
567 	}
568 
569 	/* add in space for page_ctrs_cands and pcc_color_free */
570 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
571 	    mmu_page_sizes * NPC_MUTEX;
572 
573 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
574 
575 		if (mem_node_config[mnode].exists == 0)
576 			continue;
577 
578 		nranges = mnode_nranges[mnode];
579 		ctrs_sz += sizeof (pcc_info_t) * nranges *
580 		    mmu_page_sizes * NPC_MUTEX;
581 		for (r = 1; r < mmu_page_sizes; r++) {
582 			ctrs_sz += sizeof (pgcnt_t) * nranges *
583 			    colors_per_szc[r] * NPC_MUTEX;
584 		}
585 	}
586 
587 	/* ctr_mutex */
588 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
589 
590 	/* size for page list counts */
591 	PLCNT_SZ(ctrs_sz);
592 
593 	/*
594 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
595 	 * address of the counters to ecache_alignsize boundary for every
596 	 * memory node.
597 	 */
598 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
599 }
600 
601 caddr_t
602 page_ctrs_alloc(caddr_t alloc_base)
603 {
604 	int	mnode;
605 	int	mrange, nranges;
606 	int	r;		/* region size */
607 	int	i;
608 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
609 
610 	/*
611 	 * We need to determine how many page colors there are for each
612 	 * page size in order to allocate memory for any color specific
613 	 * arrays.
614 	 */
615 	for (i = 0; i < mmu_page_sizes; i++) {
616 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
617 	}
618 
619 	for (r = 1; r < mmu_page_sizes; r++) {
620 		page_counters[r] = (hw_page_map_t *)alloc_base;
621 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
622 	}
623 
624 	/* page_ctrs_cands and pcc_color_free array */
625 	for (i = 0; i < NPC_MUTEX; i++) {
626 		for (r = 1; r < mmu_page_sizes; r++) {
627 
628 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
629 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
630 
631 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
632 				pcc_info_t *pi;
633 
634 				if (mem_node_config[mnode].exists == 0)
635 					continue;
636 
637 				nranges = mnode_nranges[mnode];
638 
639 				pi = (pcc_info_t *)alloc_base;
640 				alloc_base += sizeof (pcc_info_t) * nranges;
641 				page_ctrs_cands[i][r][mnode] = pi;
642 
643 				for (mrange = 0; mrange < nranges; mrange++) {
644 					pi->pcc_color_free =
645 					    (pgcnt_t *)alloc_base;
646 					alloc_base += sizeof (pgcnt_t) *
647 					    colors_per_szc[r];
648 					pi++;
649 				}
650 			}
651 		}
652 	}
653 
654 	/* ctr_mutex */
655 	for (i = 0; i < NPC_MUTEX; i++) {
656 		ctr_mutex[i] = (kmutex_t *)alloc_base;
657 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
658 	}
659 
660 	/* initialize page list counts */
661 	PLCNT_INIT(alloc_base);
662 
663 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
664 
665 		pgcnt_t r_pgcnt;
666 		pfn_t	r_base;
667 		pgcnt_t r_align;
668 		int	r_shift;
669 		int	nranges = mnode_nranges[mnode];
670 
671 		if (mem_node_config[mnode].exists == 0)
672 			continue;
673 
674 		for (r = 1; r < mmu_page_sizes; r++) {
675 			/*
676 			 * the page_counters base has to be aligned to the
677 			 * page count of page size code r otherwise the counts
678 			 * will cross large page boundaries.
679 			 */
680 			r_align = page_get_pagecnt(r);
681 			r_base = mem_node_config[mnode].physbase;
682 			/* base needs to be aligned - lower to aligned value */
683 			r_base &= ~(r_align - 1);
684 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
685 			    r_base + 1, r_align);
686 			r_shift = PAGE_BSZS_SHIFT(r);
687 
688 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
689 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
690 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
691 			for (mrange = 0; mrange < nranges; mrange++) {
692 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
693 				    r, mrange) = (size_t *)alloc_base;
694 				alloc_base += sizeof (size_t) *
695 				    colors_per_szc[r];
696 			}
697 			for (i = 0; i < colors_per_szc[r]; i++) {
698 				uint_t color_mask = colors_per_szc[r] - 1;
699 				pfn_t  pfnum = r_base;
700 				size_t idx;
701 				int mrange;
702 
703 				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
704 				    color_mask, color_mask);
705 				idx = PNUM_TO_IDX(mnode, r, pfnum);
706 				idx = (idx >= r_pgcnt) ? 0 : idx;
707 				for (mrange = 0; mrange < nranges; mrange++) {
708 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
709 					    r, i, mrange) = idx;
710 				}
711 			}
712 			PAGE_COUNTERS_COUNTERS(mnode, r) =
713 			    (hpmctr_t *)alloc_base;
714 			/*
715 			 * Round up to make alloc_base always be aligned on
716 			 * a pointer boundary.
717 			 */
718 			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
719 			    sizeof (hpmctr_t *));
720 
721 			/*
722 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
723 			 * satisfy the identity requirement.
724 			 * We should be able to go from one to the other
725 			 * and get consistent values.
726 			 */
727 			ASSERT(PNUM_TO_IDX(mnode, r,
728 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
729 			ASSERT(IDX_TO_PNUM(mnode, r,
730 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
731 		}
732 		/*
733 		 * Roundup the start address of the page_counters to
734 		 * cache aligned boundary for every memory node.
735 		 * page_ctrs_sz() has added some slop for these roundups.
736 		 */
737 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
738 			L2CACHE_ALIGN);
739 	}
740 
741 	/* Initialize other page counter specific data structures. */
742 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
743 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
744 	}
745 
746 	return (alloc_base);
747 }
748 
749 /*
750  * Functions to adjust region counters for each size free list.
751  * Caller is responsible to acquire the ctr_mutex lock if necessary and
752  * thus can be called during startup without locks.
753  */
754 /* ARGSUSED */
755 void
756 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
757 {
758 	ssize_t		r;	/* region size */
759 	ssize_t		idx;
760 	pfn_t		pfnum;
761 	int		lckidx;
762 
763 	ASSERT(mnode == PP_2_MEM_NODE(pp));
764 	ASSERT(mtype == PP_2_MTYPE(pp));
765 
766 	ASSERT(pp->p_szc < mmu_page_sizes);
767 
768 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
769 
770 	/* no counter update needed for largest page size */
771 	if (pp->p_szc >= mmu_page_sizes - 1) {
772 		return;
773 	}
774 
775 	r = pp->p_szc + 1;
776 	pfnum = pp->p_pagenum;
777 	lckidx = PP_CTR_LOCK_INDX(pp);
778 
779 	/*
780 	 * Increment the count of free pages for the current
781 	 * region. Continue looping up in region size incrementing
782 	 * count if the preceeding region is full.
783 	 */
784 	while (r < mmu_page_sizes) {
785 		idx = PNUM_TO_IDX(mnode, r, pfnum);
786 
787 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
788 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
789 
790 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
791 			break;
792 		} else {
793 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
794 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
795 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
796 
797 			cand->pcc_pages_free++;
798 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
799 		}
800 		r++;
801 	}
802 }
803 
804 void
805 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
806 {
807 	int		lckidx = PP_CTR_LOCK_INDX(pp);
808 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
809 
810 	mutex_enter(lock);
811 	page_ctr_add_internal(mnode, mtype, pp, flags);
812 	mutex_exit(lock);
813 }
814 
815 void
816 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
817 {
818 	int		lckidx;
819 	ssize_t		r;	/* region size */
820 	ssize_t		idx;
821 	pfn_t		pfnum;
822 
823 	ASSERT(mnode == PP_2_MEM_NODE(pp));
824 	ASSERT(mtype == PP_2_MTYPE(pp));
825 
826 	ASSERT(pp->p_szc < mmu_page_sizes);
827 
828 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
829 
830 	/* no counter update needed for largest page size */
831 	if (pp->p_szc >= mmu_page_sizes - 1) {
832 		return;
833 	}
834 
835 	r = pp->p_szc + 1;
836 	pfnum = pp->p_pagenum;
837 	lckidx = PP_CTR_LOCK_INDX(pp);
838 
839 	/*
840 	 * Decrement the count of free pages for the current
841 	 * region. Continue looping up in region size decrementing
842 	 * count if the preceeding region was full.
843 	 */
844 	while (r < mmu_page_sizes) {
845 		idx = PNUM_TO_IDX(mnode, r, pfnum);
846 
847 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
848 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
849 
850 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
851 			break;
852 		} else {
853 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
854 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
855 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
856 
857 			ASSERT(cand->pcc_pages_free != 0);
858 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
859 
860 			cand->pcc_pages_free--;
861 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
862 		}
863 		r++;
864 	}
865 }
866 
867 void
868 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
869 {
870 	int		lckidx = PP_CTR_LOCK_INDX(pp);
871 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
872 
873 	mutex_enter(lock);
874 	page_ctr_sub_internal(mnode, mtype, pp, flags);
875 	mutex_exit(lock);
876 }
877 
878 /*
879  * Adjust page counters following a memory attach, since typically the
880  * size of the array needs to change, and the PFN to counter index
881  * mapping needs to change.
882  *
883  * It is possible this mnode did not exist at startup. In that case
884  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
885  * to change (a theoretical possibility on x86), which means pcc_color_free
886  * arrays must be extended.
887  */
888 uint_t
889 page_ctrs_adjust(int mnode)
890 {
891 	pgcnt_t npgs;
892 	int	r;		/* region size */
893 	int	i;
894 	size_t	pcsz, old_csz;
895 	hpmctr_t *new_ctr, *old_ctr;
896 	pfn_t	oldbase, newbase;
897 	size_t	old_npgs;
898 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
899 	size_t	size_cache[MMU_PAGE_SIZES];
900 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
901 	size_t	*old_color_array[MAX_MNODE_MRANGES];
902 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
903 	pcc_info_t **cands_cache;
904 	pcc_info_t *old_pi, *pi;
905 	pgcnt_t *pgcntp;
906 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
907 	int cands_cache_nranges;
908 	int old_maxmrange, new_maxmrange;
909 	int rc = 0;
910 
911 	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
912 	npgs = roundup(mem_node_config[mnode].physmax,
913 	    PC_BASE_ALIGN) - newbase;
914 
915 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
916 	    MMU_PAGE_SIZES, KM_NOSLEEP);
917 	if (cands_cache == NULL)
918 		return (ENOMEM);
919 
920 	/* prepare to free non-null pointers on the way out */
921 	cands_cache_nranges = nranges;
922 	bzero(ctr_cache, sizeof (ctr_cache));
923 	bzero(color_cache, sizeof (color_cache));
924 
925 	/*
926 	 * We need to determine how many page colors there are for each
927 	 * page size in order to allocate memory for any color specific
928 	 * arrays.
929 	 */
930 	for (r = 0; r < mmu_page_sizes; r++) {
931 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
932 	}
933 
934 	/*
935 	 * Preallocate all of the new hpm_counters arrays as we can't
936 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
937 	 * If we can't allocate all of the arrays, undo our work so far
938 	 * and return failure.
939 	 */
940 	for (r = 1; r < mmu_page_sizes; r++) {
941 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
942 		size_cache[r] = pcsz;
943 		ctr_cache[r] = kmem_zalloc(pcsz *
944 		    sizeof (hpmctr_t), KM_NOSLEEP);
945 		if (ctr_cache[r] == NULL) {
946 			rc = ENOMEM;
947 			goto cleanup;
948 		}
949 	}
950 
951 	/*
952 	 * Preallocate all of the new color current arrays as we can't
953 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
954 	 * If we can't allocate all of the arrays, undo our work so far
955 	 * and return failure.
956 	 */
957 	for (r = 1; r < mmu_page_sizes; r++) {
958 		for (mrange = 0; mrange < nranges; mrange++) {
959 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
960 			    colors_per_szc[r], KM_NOSLEEP);
961 			if (color_cache[r][mrange] == NULL) {
962 				rc = ENOMEM;
963 				goto cleanup;
964 			}
965 		}
966 	}
967 
968 	/*
969 	 * Preallocate all of the new pcc_info_t arrays as we can't
970 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
971 	 * If we can't allocate all of the arrays, undo our work so far
972 	 * and return failure.
973 	 */
974 	for (r = 1; r < mmu_page_sizes; r++) {
975 		for (i = 0; i < NPC_MUTEX; i++) {
976 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
977 			    KM_NOSLEEP);
978 			if (pi == NULL) {
979 				rc = ENOMEM;
980 				goto cleanup;
981 			}
982 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
983 
984 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
985 				pgcntp = kmem_zalloc(colors_per_szc[r] *
986 				    sizeof (pgcnt_t), KM_NOSLEEP);
987 				if (pgcntp == NULL) {
988 					rc = ENOMEM;
989 					goto cleanup;
990 				}
991 				pi->pcc_color_free = pgcntp;
992 			}
993 		}
994 	}
995 
996 	/*
997 	 * Grab the write lock to prevent others from walking these arrays
998 	 * while we are modifying them.
999 	 */
1000 	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
1001 	page_freelist_lock(mnode);
1002 
1003 	old_nranges = mnode_nranges[mnode];
1004 	cands_cache_nranges = old_nranges;
1005 	mnode_nranges[mnode] = nranges;
1006 	old_maxmrange = mnode_maxmrange[mnode];
1007 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1008 	new_maxmrange = mnode_maxmrange[mnode];
1009 
1010 	for (r = 1; r < mmu_page_sizes; r++) {
1011 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1012 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
1013 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
1014 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
1015 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
1016 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1017 			old_color_array[mrange] =
1018 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1019 				r, mrange);
1020 		}
1021 
1022 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1023 		new_ctr = ctr_cache[r];
1024 		ctr_cache[r] = NULL;
1025 		if (old_ctr != NULL &&
1026 		    (oldbase + old_npgs > newbase) &&
1027 		    (newbase + npgs > oldbase)) {
1028 			/*
1029 			 * Map the intersection of the old and new
1030 			 * counters into the new array.
1031 			 */
1032 			size_t offset;
1033 			if (newbase > oldbase) {
1034 				offset = (newbase - oldbase) >>
1035 				    PAGE_COUNTERS_SHIFT(mnode, r);
1036 				bcopy(old_ctr + offset, new_ctr,
1037 				    MIN(pcsz, (old_csz - offset)) *
1038 				    sizeof (hpmctr_t));
1039 			} else {
1040 				offset = (oldbase - newbase) >>
1041 				    PAGE_COUNTERS_SHIFT(mnode, r);
1042 				bcopy(old_ctr, new_ctr + offset,
1043 				    MIN(pcsz - offset, old_csz) *
1044 				    sizeof (hpmctr_t));
1045 			}
1046 		}
1047 
1048 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1049 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1050 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1051 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1052 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1053 			    color_cache[r][mrange];
1054 			color_cache[r][mrange] = NULL;
1055 		}
1056 		/*
1057 		 * for now, just reset on these events as it's probably
1058 		 * not worthwhile to try and optimize this.
1059 		 */
1060 		for (i = 0; i < colors_per_szc[r]; i++) {
1061 			uint_t color_mask = colors_per_szc[r] - 1;
1062 			pfn_t  pfnum = newbase;
1063 			size_t idx;
1064 
1065 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
1066 			    color_mask);
1067 			idx = PNUM_TO_IDX(mnode, r, pfnum);
1068 			idx = (idx < pcsz) ? idx : 0;
1069 			for (mrange = 0; mrange < nranges; mrange++) {
1070 				PAGE_COUNTERS_CURRENT_COLOR(mnode,
1071 				    r, i, mrange) = idx;
1072 			}
1073 		}
1074 
1075 		/* cache info for freeing out of the critical path */
1076 		if ((caddr_t)old_ctr >= kernelheap &&
1077 		    (caddr_t)old_ctr < ekernelheap) {
1078 			ctr_cache[r] = old_ctr;
1079 			size_cache[r] = old_csz;
1080 		}
1081 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1082 			size_t *tmp = old_color_array[mrange];
1083 			if ((caddr_t)tmp >= kernelheap &&
1084 			    (caddr_t)tmp < ekernelheap) {
1085 				color_cache[r][mrange] = tmp;
1086 			}
1087 		}
1088 		/*
1089 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1090 		 * satisfy the identity requirement.
1091 		 * We should be able to go from one to the other
1092 		 * and get consistent values.
1093 		 */
1094 		ASSERT(PNUM_TO_IDX(mnode, r,
1095 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
1096 		ASSERT(IDX_TO_PNUM(mnode, r,
1097 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1098 
1099 		/* pcc_info_t and pcc_color_free */
1100 		for (i = 0; i < NPC_MUTEX; i++) {
1101 			pcc_info_t *epi;
1102 			pcc_info_t *eold_pi;
1103 
1104 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1105 			old_pi = page_ctrs_cands[i][r][mnode];
1106 			page_ctrs_cands[i][r][mnode] = pi;
1107 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1108 
1109 			/* preserve old pcc_color_free values, if any */
1110 			if (old_pi == NULL)
1111 				continue;
1112 
1113 			/*
1114 			 * when/if x86 does DR, must account for
1115 			 * possible change in range index when
1116 			 * preserving pcc_info
1117 			 */
1118 			epi = &pi[nranges];
1119 			eold_pi = &old_pi[old_nranges];
1120 			if (new_maxmrange > old_maxmrange) {
1121 				pi += new_maxmrange - old_maxmrange;
1122 			} else if (new_maxmrange < old_maxmrange) {
1123 				old_pi += old_maxmrange - new_maxmrange;
1124 			}
1125 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1126 				pcc_info_t tmp = *pi;
1127 				*pi = *old_pi;
1128 				*old_pi = tmp;
1129 			}
1130 		}
1131 	}
1132 	page_freelist_unlock(mnode);
1133 	rw_exit(&page_ctrs_rwlock[mnode]);
1134 
1135 	/*
1136 	 * Now that we have dropped the write lock, it is safe to free all
1137 	 * of the memory we have cached above.
1138 	 * We come thru here to free memory when pre-alloc fails, and also to
1139 	 * free old pointers which were recorded while locked.
1140 	 */
1141 cleanup:
1142 	for (r = 1; r < mmu_page_sizes; r++) {
1143 		if (ctr_cache[r] != NULL) {
1144 			kmem_free(ctr_cache[r],
1145 			    size_cache[r] * sizeof (hpmctr_t));
1146 		}
1147 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1148 			if (color_cache[r][mrange] != NULL) {
1149 				kmem_free(color_cache[r][mrange],
1150 				    colors_per_szc[r] * sizeof (size_t));
1151 			}
1152 		}
1153 		for (i = 0; i < NPC_MUTEX; i++) {
1154 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1155 			if (pi == NULL)
1156 				continue;
1157 			nr = cands_cache_nranges;
1158 			for (mrange = 0; mrange < nr; mrange++, pi++) {
1159 				pgcntp = pi->pcc_color_free;
1160 				if (pgcntp == NULL)
1161 					continue;
1162 				if ((caddr_t)pgcntp >= kernelheap &&
1163 				    (caddr_t)pgcntp < ekernelheap) {
1164 					kmem_free(pgcntp,
1165 					    colors_per_szc[r] *
1166 					    sizeof (pgcnt_t));
1167 				}
1168 			}
1169 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1170 			if ((caddr_t)pi >= kernelheap &&
1171 			    (caddr_t)pi < ekernelheap) {
1172 				kmem_free(pi, nr * sizeof (pcc_info_t));
1173 			}
1174 		}
1175 	}
1176 
1177 	kmem_free(cands_cache,
1178 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1179 	return (rc);
1180 }
1181 
1182 
1183 #ifdef DEBUG
1184 
1185 /*
1186  * confirm pp is a large page corresponding to szc
1187  */
1188 void
1189 chk_lpg(page_t *pp, uchar_t szc)
1190 {
1191 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1192 	uint_t noreloc;
1193 
1194 	if (npgs == 1) {
1195 		ASSERT(pp->p_szc == 0);
1196 		ASSERT(pp->p_next == pp);
1197 		ASSERT(pp->p_prev == pp);
1198 		return;
1199 	}
1200 
1201 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1202 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1203 
1204 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1205 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1206 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1207 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1208 
1209 	/*
1210 	 * Check list of pages.
1211 	 */
1212 	noreloc = PP_ISNORELOC(pp);
1213 	while (npgs--) {
1214 		if (npgs != 0) {
1215 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1216 			ASSERT(pp->p_next == (pp + 1));
1217 		}
1218 		ASSERT(pp->p_szc == szc);
1219 		ASSERT(PP_ISFREE(pp));
1220 		ASSERT(PP_ISAGED(pp));
1221 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1222 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1223 		ASSERT(pp->p_vnode  == NULL);
1224 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1225 
1226 		pp = pp->p_next;
1227 	}
1228 }
1229 #endif /* DEBUG */
1230 
1231 void
1232 page_freelist_lock(int mnode)
1233 {
1234 	int i;
1235 	for (i = 0; i < NPC_MUTEX; i++) {
1236 		mutex_enter(FPC_MUTEX(mnode, i));
1237 		mutex_enter(CPC_MUTEX(mnode, i));
1238 	}
1239 }
1240 
1241 void
1242 page_freelist_unlock(int mnode)
1243 {
1244 	int i;
1245 	for (i = 0; i < NPC_MUTEX; i++) {
1246 		mutex_exit(FPC_MUTEX(mnode, i));
1247 		mutex_exit(CPC_MUTEX(mnode, i));
1248 	}
1249 }
1250 
1251 /*
1252  * add pp to the specified page list. Defaults to head of the page list
1253  * unless PG_LIST_TAIL is specified.
1254  */
1255 void
1256 page_list_add(page_t *pp, int flags)
1257 {
1258 	page_t		**ppp;
1259 	kmutex_t	*pcm;
1260 	uint_t		bin, mtype;
1261 	int		mnode;
1262 
1263 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1264 	ASSERT(PP_ISFREE(pp));
1265 	ASSERT(!hat_page_is_mapped(pp));
1266 	ASSERT(hat_page_getshare(pp) == 0);
1267 
1268 	/*
1269 	 * Large pages should be freed via page_list_add_pages().
1270 	 */
1271 	ASSERT(pp->p_szc == 0);
1272 
1273 	/*
1274 	 * Don't need to lock the freelist first here
1275 	 * because the page isn't on the freelist yet.
1276 	 * This means p_szc can't change on us.
1277 	 */
1278 
1279 	bin = PP_2_BIN(pp);
1280 	mnode = PP_2_MEM_NODE(pp);
1281 	mtype = PP_2_MTYPE(pp);
1282 
1283 	if (flags & PG_LIST_ISINIT) {
1284 		/*
1285 		 * PG_LIST_ISINIT is set during system startup (ie. single
1286 		 * threaded), add a page to the free list and add to the
1287 		 * the free region counters w/o any locking
1288 		 */
1289 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1290 
1291 		/* inline version of page_add() */
1292 		if (*ppp != NULL) {
1293 			pp->p_next = *ppp;
1294 			pp->p_prev = (*ppp)->p_prev;
1295 			(*ppp)->p_prev = pp;
1296 			pp->p_prev->p_next = pp;
1297 		} else
1298 			*ppp = pp;
1299 
1300 		page_ctr_add_internal(mnode, mtype, pp, flags);
1301 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1302 	} else {
1303 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1304 
1305 		if (flags & PG_FREE_LIST) {
1306 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1307 			ASSERT(PP_ISAGED(pp));
1308 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1309 
1310 		} else {
1311 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1312 			ASSERT(pp->p_vnode);
1313 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1314 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1315 		}
1316 		mutex_enter(pcm);
1317 		page_add(ppp, pp);
1318 
1319 		if (flags & PG_LIST_TAIL)
1320 			*ppp = (*ppp)->p_next;
1321 		/*
1322 		 * Add counters before releasing pcm mutex to avoid a race with
1323 		 * page_freelist_coalesce and page_freelist_split.
1324 		 */
1325 		page_ctr_add(mnode, mtype, pp, flags);
1326 		mutex_exit(pcm);
1327 	}
1328 
1329 
1330 #if defined(__sparc)
1331 	if (PP_ISNORELOC(pp)) {
1332 		kcage_freemem_add(1);
1333 	}
1334 #endif
1335 	/*
1336 	 * It is up to the caller to unlock the page!
1337 	 */
1338 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1339 }
1340 
1341 
1342 #ifdef __sparc
1343 /*
1344  * This routine is only used by kcage_init during system startup.
1345  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1346  * without the overhead of taking locks and updating counters.
1347  */
1348 void
1349 page_list_noreloc_startup(page_t *pp)
1350 {
1351 	page_t		**ppp;
1352 	uint_t		bin;
1353 	int		mnode;
1354 	int		mtype;
1355 	int		flags = 0;
1356 
1357 	/*
1358 	 * If this is a large page on the freelist then
1359 	 * break it up into smaller pages.
1360 	 */
1361 	if (pp->p_szc != 0)
1362 		page_boot_demote(pp);
1363 
1364 	/*
1365 	 * Get list page is currently on.
1366 	 */
1367 	bin = PP_2_BIN(pp);
1368 	mnode = PP_2_MEM_NODE(pp);
1369 	mtype = PP_2_MTYPE(pp);
1370 	ASSERT(mtype == MTYPE_RELOC);
1371 	ASSERT(pp->p_szc == 0);
1372 
1373 	if (PP_ISAGED(pp)) {
1374 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1375 		flags |= PG_FREE_LIST;
1376 	} else {
1377 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1378 		flags |= PG_CACHE_LIST;
1379 	}
1380 
1381 	ASSERT(*ppp != NULL);
1382 
1383 	/*
1384 	 * Delete page from current list.
1385 	 */
1386 	if (*ppp == pp)
1387 		*ppp = pp->p_next;		/* go to next page */
1388 	if (*ppp == pp) {
1389 		*ppp = NULL;			/* page list is gone */
1390 	} else {
1391 		pp->p_prev->p_next = pp->p_next;
1392 		pp->p_next->p_prev = pp->p_prev;
1393 	}
1394 
1395 	/*
1396 	 * Decrement page counters
1397 	 */
1398 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1399 
1400 	/*
1401 	 * Set no reloc for cage initted pages.
1402 	 */
1403 	PP_SETNORELOC(pp);
1404 
1405 	mtype = PP_2_MTYPE(pp);
1406 	ASSERT(mtype == MTYPE_NORELOC);
1407 
1408 	/*
1409 	 * Get new list for page.
1410 	 */
1411 	if (PP_ISAGED(pp)) {
1412 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1413 	} else {
1414 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1415 	}
1416 
1417 	/*
1418 	 * Insert page on new list.
1419 	 */
1420 	if (*ppp == NULL) {
1421 		*ppp = pp;
1422 		pp->p_next = pp->p_prev = pp;
1423 	} else {
1424 		pp->p_next = *ppp;
1425 		pp->p_prev = (*ppp)->p_prev;
1426 		(*ppp)->p_prev = pp;
1427 		pp->p_prev->p_next = pp;
1428 	}
1429 
1430 	/*
1431 	 * Increment page counters
1432 	 */
1433 	page_ctr_add_internal(mnode, mtype, pp, flags);
1434 
1435 	/*
1436 	 * Update cage freemem counter
1437 	 */
1438 	atomic_add_long(&kcage_freemem, 1);
1439 }
1440 #else	/* __sparc */
1441 
1442 /* ARGSUSED */
1443 void
1444 page_list_noreloc_startup(page_t *pp)
1445 {
1446 	panic("page_list_noreloc_startup: should be here only for sparc");
1447 }
1448 #endif
1449 
1450 void
1451 page_list_add_pages(page_t *pp, int flags)
1452 {
1453 	kmutex_t *pcm;
1454 	pgcnt_t	pgcnt;
1455 	uint_t	bin, mtype, i;
1456 	int	mnode;
1457 
1458 	/* default to freelist/head */
1459 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1460 
1461 	CHK_LPG(pp, pp->p_szc);
1462 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1463 
1464 	bin = PP_2_BIN(pp);
1465 	mnode = PP_2_MEM_NODE(pp);
1466 	mtype = PP_2_MTYPE(pp);
1467 
1468 	if (flags & PG_LIST_ISINIT) {
1469 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1470 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1471 		ASSERT(!PP_ISNORELOC(pp));
1472 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1473 	} else {
1474 
1475 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1476 
1477 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1478 
1479 		mutex_enter(pcm);
1480 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1481 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1482 		mutex_exit(pcm);
1483 
1484 		pgcnt = page_get_pagecnt(pp->p_szc);
1485 #if defined(__sparc)
1486 		if (PP_ISNORELOC(pp))
1487 			kcage_freemem_add(pgcnt);
1488 #endif
1489 		for (i = 0; i < pgcnt; i++, pp++)
1490 			page_unlock_nocapture(pp);
1491 	}
1492 }
1493 
1494 /*
1495  * During boot, need to demote a large page to base
1496  * pagesize pages for seg_kmem for use in boot_alloc()
1497  */
1498 void
1499 page_boot_demote(page_t *pp)
1500 {
1501 	ASSERT(pp->p_szc != 0);
1502 	ASSERT(PP_ISFREE(pp));
1503 	ASSERT(PP_ISAGED(pp));
1504 
1505 	(void) page_demote(PP_2_MEM_NODE(pp),
1506 	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
1507 	    PC_FREE);
1508 
1509 	ASSERT(PP_ISFREE(pp));
1510 	ASSERT(PP_ISAGED(pp));
1511 	ASSERT(pp->p_szc == 0);
1512 }
1513 
1514 /*
1515  * Take a particular page off of whatever freelist the page
1516  * is claimed to be on.
1517  *
1518  * NOTE: Only used for PAGESIZE pages.
1519  */
1520 void
1521 page_list_sub(page_t *pp, int flags)
1522 {
1523 	int		bin;
1524 	uint_t		mtype;
1525 	int		mnode;
1526 	kmutex_t	*pcm;
1527 	page_t		**ppp;
1528 
1529 	ASSERT(PAGE_EXCL(pp));
1530 	ASSERT(PP_ISFREE(pp));
1531 
1532 	/*
1533 	 * The p_szc field can only be changed by page_promote()
1534 	 * and page_demote(). Only free pages can be promoted and
1535 	 * demoted and the free list MUST be locked during these
1536 	 * operations. So to prevent a race in page_list_sub()
1537 	 * between computing which bin of the freelist lock to
1538 	 * grab and actually grabing the lock we check again that
1539 	 * the bin we locked is still the correct one. Notice that
1540 	 * the p_szc field could have actually changed on us but
1541 	 * if the bin happens to still be the same we are safe.
1542 	 */
1543 try_again:
1544 	bin = PP_2_BIN(pp);
1545 	mnode = PP_2_MEM_NODE(pp);
1546 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1547 	mutex_enter(pcm);
1548 	if (PP_2_BIN(pp) != bin) {
1549 		mutex_exit(pcm);
1550 		goto try_again;
1551 	}
1552 	mtype = PP_2_MTYPE(pp);
1553 
1554 	if (flags & PG_FREE_LIST) {
1555 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1556 		ASSERT(PP_ISAGED(pp));
1557 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1558 	} else {
1559 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1560 		ASSERT(!PP_ISAGED(pp));
1561 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1562 	}
1563 
1564 	/*
1565 	 * Common PAGESIZE case.
1566 	 *
1567 	 * Note that we locked the freelist. This prevents
1568 	 * any page promotion/demotion operations. Therefore
1569 	 * the p_szc will not change until we drop pcm mutex.
1570 	 */
1571 	if (pp->p_szc == 0) {
1572 		page_sub(ppp, pp);
1573 		/*
1574 		 * Subtract counters before releasing pcm mutex
1575 		 * to avoid race with page_freelist_coalesce.
1576 		 */
1577 		page_ctr_sub(mnode, mtype, pp, flags);
1578 		mutex_exit(pcm);
1579 
1580 #if defined(__sparc)
1581 		if (PP_ISNORELOC(pp)) {
1582 			kcage_freemem_sub(1);
1583 		}
1584 #endif
1585 		return;
1586 	}
1587 
1588 	/*
1589 	 * Large pages on the cache list are not supported.
1590 	 */
1591 	if (flags & PG_CACHE_LIST)
1592 		panic("page_list_sub: large page on cachelist");
1593 
1594 	/*
1595 	 * Slow but rare.
1596 	 *
1597 	 * Somebody wants this particular page which is part
1598 	 * of a large page. In this case we just demote the page
1599 	 * if it's on the freelist.
1600 	 *
1601 	 * We have to drop pcm before locking the entire freelist.
1602 	 * Once we have re-locked the freelist check to make sure
1603 	 * the page hasn't already been demoted or completely
1604 	 * freed.
1605 	 */
1606 	mutex_exit(pcm);
1607 	page_freelist_lock(mnode);
1608 	if (pp->p_szc != 0) {
1609 		/*
1610 		 * Large page is on freelist.
1611 		 */
1612 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1613 		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1614 	}
1615 	ASSERT(PP_ISFREE(pp));
1616 	ASSERT(PP_ISAGED(pp));
1617 	ASSERT(pp->p_szc == 0);
1618 
1619 	/*
1620 	 * Subtract counters before releasing pcm mutex
1621 	 * to avoid race with page_freelist_coalesce.
1622 	 */
1623 	bin = PP_2_BIN(pp);
1624 	mtype = PP_2_MTYPE(pp);
1625 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1626 
1627 	page_sub(ppp, pp);
1628 	page_ctr_sub(mnode, mtype, pp, flags);
1629 	page_freelist_unlock(mnode);
1630 
1631 #if defined(__sparc)
1632 	if (PP_ISNORELOC(pp)) {
1633 		kcage_freemem_sub(1);
1634 	}
1635 #endif
1636 }
1637 
1638 void
1639 page_list_sub_pages(page_t *pp, uint_t szc)
1640 {
1641 	kmutex_t *pcm;
1642 	uint_t	bin, mtype;
1643 	int	mnode;
1644 
1645 	ASSERT(PAGE_EXCL(pp));
1646 	ASSERT(PP_ISFREE(pp));
1647 	ASSERT(PP_ISAGED(pp));
1648 
1649 	/*
1650 	 * See comment in page_list_sub().
1651 	 */
1652 try_again:
1653 	bin = PP_2_BIN(pp);
1654 	mnode = PP_2_MEM_NODE(pp);
1655 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1656 	mutex_enter(pcm);
1657 	if (PP_2_BIN(pp) != bin) {
1658 		mutex_exit(pcm);
1659 		goto	try_again;
1660 	}
1661 
1662 	/*
1663 	 * If we're called with a page larger than szc or it got
1664 	 * promoted above szc before we locked the freelist then
1665 	 * drop pcm and re-lock entire freelist. If page still larger
1666 	 * than szc then demote it.
1667 	 */
1668 	if (pp->p_szc > szc) {
1669 		mutex_exit(pcm);
1670 		pcm = NULL;
1671 		page_freelist_lock(mnode);
1672 		if (pp->p_szc > szc) {
1673 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1674 			(void) page_demote(mnode,
1675 			    PFN_BASE(pp->p_pagenum, pp->p_szc),
1676 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1677 		}
1678 		bin = PP_2_BIN(pp);
1679 	}
1680 	ASSERT(PP_ISFREE(pp));
1681 	ASSERT(PP_ISAGED(pp));
1682 	ASSERT(pp->p_szc <= szc);
1683 	ASSERT(pp == PP_PAGEROOT(pp));
1684 
1685 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1686 
1687 	mtype = PP_2_MTYPE(pp);
1688 	if (pp->p_szc != 0) {
1689 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1690 		CHK_LPG(pp, pp->p_szc);
1691 	} else {
1692 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1693 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1694 	}
1695 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1696 
1697 	if (pcm != NULL) {
1698 		mutex_exit(pcm);
1699 	} else {
1700 		page_freelist_unlock(mnode);
1701 	}
1702 
1703 #if defined(__sparc)
1704 	if (PP_ISNORELOC(pp)) {
1705 		pgcnt_t	pgcnt;
1706 
1707 		pgcnt = page_get_pagecnt(pp->p_szc);
1708 		kcage_freemem_sub(pgcnt);
1709 	}
1710 #endif
1711 }
1712 
1713 /*
1714  * Add the page to the front of a linked list of pages
1715  * using the p_next & p_prev pointers for the list.
1716  * The caller is responsible for protecting the list pointers.
1717  */
1718 void
1719 mach_page_add(page_t **ppp, page_t *pp)
1720 {
1721 	if (*ppp == NULL) {
1722 		pp->p_next = pp->p_prev = pp;
1723 	} else {
1724 		pp->p_next = *ppp;
1725 		pp->p_prev = (*ppp)->p_prev;
1726 		(*ppp)->p_prev = pp;
1727 		pp->p_prev->p_next = pp;
1728 	}
1729 	*ppp = pp;
1730 }
1731 
1732 /*
1733  * Remove this page from a linked list of pages
1734  * using the p_next & p_prev pointers for the list.
1735  *
1736  * The caller is responsible for protecting the list pointers.
1737  */
1738 void
1739 mach_page_sub(page_t **ppp, page_t *pp)
1740 {
1741 	ASSERT(PP_ISFREE(pp));
1742 
1743 	if (*ppp == NULL || pp == NULL)
1744 		panic("mach_page_sub");
1745 
1746 	if (*ppp == pp)
1747 		*ppp = pp->p_next;		/* go to next page */
1748 
1749 	if (*ppp == pp)
1750 		*ppp = NULL;			/* page list is gone */
1751 	else {
1752 		pp->p_prev->p_next = pp->p_next;
1753 		pp->p_next->p_prev = pp->p_prev;
1754 	}
1755 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1756 }
1757 
1758 /*
1759  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1760  */
1761 void
1762 page_promote_size(page_t *pp, uint_t cur_szc)
1763 {
1764 	pfn_t pfn;
1765 	int mnode;
1766 	int idx;
1767 	int new_szc = cur_szc + 1;
1768 	int full = FULL_REGION_CNT(new_szc);
1769 
1770 	pfn = page_pptonum(pp);
1771 	mnode = PFN_2_MEM_NODE(pfn);
1772 
1773 	page_freelist_lock(mnode);
1774 
1775 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1776 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1777 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1778 
1779 	page_freelist_unlock(mnode);
1780 }
1781 
1782 static uint_t page_promote_err;
1783 static uint_t page_promote_noreloc_err;
1784 
1785 /*
1786  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1787  * for the given mnode starting at pfnum. Pages involved are on the freelist
1788  * before the call and may be returned to the caller if requested, otherwise
1789  * they will be placed back on the freelist.
1790  * If flags is PC_ALLOC, then the large page will be returned to the user in
1791  * a state which is consistent with a page being taken off the freelist.  If
1792  * we failed to lock the new large page, then we will return NULL to the
1793  * caller and put the large page on the freelist instead.
1794  * If flags is PC_FREE, then the large page will be placed on the freelist,
1795  * and NULL will be returned.
1796  * The caller is responsible for locking the freelist as well as any other
1797  * accounting which needs to be done for a returned page.
1798  *
1799  * RFE: For performance pass in pp instead of pfnum so
1800  * 	we can avoid excessive calls to page_numtopp_nolock().
1801  *	This would depend on an assumption that all contiguous
1802  *	pages are in the same memseg so we can just add/dec
1803  *	our pp.
1804  *
1805  * Lock ordering:
1806  *
1807  *	There is a potential but rare deadlock situation
1808  *	for page promotion and demotion operations. The problem
1809  *	is there are two paths into the freelist manager and
1810  *	they have different lock orders:
1811  *
1812  *	page_create()
1813  *		lock freelist
1814  *		page_lock(EXCL)
1815  *		unlock freelist
1816  *		return
1817  *		caller drops page_lock
1818  *
1819  *	page_free() and page_reclaim()
1820  *		caller grabs page_lock(EXCL)
1821  *
1822  *		lock freelist
1823  *		unlock freelist
1824  *		drop page_lock
1825  *
1826  *	What prevents a thread in page_create() from deadlocking
1827  *	with a thread freeing or reclaiming the same page is the
1828  *	page_trylock() in page_get_freelist(). If the trylock fails
1829  *	it skips the page.
1830  *
1831  *	The lock ordering for promotion and demotion is the same as
1832  *	for page_create(). Since the same deadlock could occur during
1833  *	page promotion and freeing or reclaiming of a page on the
1834  *	cache list we might have to fail the operation and undo what
1835  *	have done so far. Again this is rare.
1836  */
1837 page_t *
1838 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1839 {
1840 	page_t		*pp, *pplist, *tpp, *start_pp;
1841 	pgcnt_t		new_npgs, npgs;
1842 	uint_t		bin;
1843 	pgcnt_t		tmpnpgs, pages_left;
1844 	uint_t		noreloc;
1845 	int 		which_list;
1846 	ulong_t		index;
1847 	kmutex_t	*phm;
1848 
1849 	/*
1850 	 * General algorithm:
1851 	 * Find the starting page
1852 	 * Walk each page struct removing it from the freelist,
1853 	 * and linking it to all the other pages removed.
1854 	 * Once all pages are off the freelist,
1855 	 * walk the list, modifying p_szc to new_szc and what
1856 	 * ever other info needs to be done to create a large free page.
1857 	 * According to the flags, either return the page or put it
1858 	 * on the freelist.
1859 	 */
1860 
1861 	start_pp = page_numtopp_nolock(pfnum);
1862 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1863 	new_npgs = page_get_pagecnt(new_szc);
1864 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1865 
1866 	/* don't return page of the wrong mtype */
1867 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1868 			return (NULL);
1869 
1870 	/*
1871 	 * Loop through smaller pages to confirm that all pages
1872 	 * give the same result for PP_ISNORELOC().
1873 	 * We can check this reliably here as the protocol for setting
1874 	 * P_NORELOC requires pages to be taken off the free list first.
1875 	 */
1876 	noreloc = PP_ISNORELOC(start_pp);
1877 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1878 		if (noreloc != PP_ISNORELOC(pp)) {
1879 			page_promote_noreloc_err++;
1880 			page_promote_err++;
1881 			return (NULL);
1882 		}
1883 	}
1884 
1885 	pages_left = new_npgs;
1886 	pplist = NULL;
1887 	pp = start_pp;
1888 
1889 	/* Loop around coalescing the smaller pages into a big page. */
1890 	while (pages_left) {
1891 		/*
1892 		 * Remove from the freelist.
1893 		 */
1894 		ASSERT(PP_ISFREE(pp));
1895 		bin = PP_2_BIN(pp);
1896 		ASSERT(mnode == PP_2_MEM_NODE(pp));
1897 		mtype = PP_2_MTYPE(pp);
1898 		if (PP_ISAGED(pp)) {
1899 
1900 			/*
1901 			 * PG_FREE_LIST
1902 			 */
1903 			if (pp->p_szc) {
1904 				page_vpsub(&PAGE_FREELISTS(mnode,
1905 				    pp->p_szc, bin, mtype), pp);
1906 			} else {
1907 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1908 				    bin, mtype), pp);
1909 			}
1910 			which_list = PG_FREE_LIST;
1911 		} else {
1912 			ASSERT(pp->p_szc == 0);
1913 
1914 			/*
1915 			 * PG_CACHE_LIST
1916 			 *
1917 			 * Since this page comes from the
1918 			 * cachelist, we must destroy the
1919 			 * vnode association.
1920 			 */
1921 			if (!page_trylock(pp, SE_EXCL)) {
1922 				goto fail_promote;
1923 			}
1924 
1925 			/*
1926 			 * We need to be careful not to deadlock
1927 			 * with another thread in page_lookup().
1928 			 * The page_lookup() thread could be holding
1929 			 * the same phm that we need if the two
1930 			 * pages happen to hash to the same phm lock.
1931 			 * At this point we have locked the entire
1932 			 * freelist and page_lookup() could be trying
1933 			 * to grab a freelist lock.
1934 			 */
1935 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
1936 			phm = PAGE_HASH_MUTEX(index);
1937 			if (!mutex_tryenter(phm)) {
1938 				page_unlock_nocapture(pp);
1939 				goto fail_promote;
1940 			}
1941 
1942 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
1943 			page_hashout(pp, phm);
1944 			mutex_exit(phm);
1945 			PP_SETAGED(pp);
1946 			page_unlock_nocapture(pp);
1947 			which_list = PG_CACHE_LIST;
1948 		}
1949 		page_ctr_sub(mnode, mtype, pp, which_list);
1950 
1951 		/*
1952 		 * Concatenate the smaller page(s) onto
1953 		 * the large page list.
1954 		 */
1955 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
1956 		pages_left -= npgs;
1957 		tpp = pp;
1958 		while (npgs--) {
1959 			tpp->p_szc = new_szc;
1960 			tpp = tpp->p_next;
1961 		}
1962 		page_list_concat(&pplist, &pp);
1963 		pp += tmpnpgs;
1964 	}
1965 	CHK_LPG(pplist, new_szc);
1966 
1967 	/*
1968 	 * return the page to the user if requested
1969 	 * in the properly locked state.
1970 	 */
1971 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
1972 		return (pplist);
1973 	}
1974 
1975 	/*
1976 	 * Otherwise place the new large page on the freelist
1977 	 */
1978 	bin = PP_2_BIN(pplist);
1979 	mnode = PP_2_MEM_NODE(pplist);
1980 	mtype = PP_2_MTYPE(pplist);
1981 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
1982 
1983 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
1984 	return (NULL);
1985 
1986 fail_promote:
1987 	/*
1988 	 * A thread must have still been freeing or
1989 	 * reclaiming the page on the cachelist.
1990 	 * To prevent a deadlock undo what we have
1991 	 * done sofar and return failure. This
1992 	 * situation can only happen while promoting
1993 	 * PAGESIZE pages.
1994 	 */
1995 	page_promote_err++;
1996 	while (pplist) {
1997 		pp = pplist;
1998 		mach_page_sub(&pplist, pp);
1999 		pp->p_szc = 0;
2000 		bin = PP_2_BIN(pp);
2001 		mtype = PP_2_MTYPE(pp);
2002 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2003 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2004 	}
2005 	return (NULL);
2006 
2007 }
2008 
2009 /*
2010  * Break up a large page into smaller size pages.
2011  * Pages involved are on the freelist before the call and may
2012  * be returned to the caller if requested, otherwise they will
2013  * be placed back on the freelist.
2014  * The caller is responsible for locking the freelist as well as any other
2015  * accounting which needs to be done for a returned page.
2016  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2017  * technically, any value may be passed in but PC_NO_COLOR is the standard
2018  * which should be followed for clarity's sake.
2019  */
2020 page_t *
2021 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
2022     int color, int flags)
2023 {
2024 	page_t	*pp, *pplist, *npplist;
2025 	pgcnt_t	npgs, n;
2026 	uint_t	bin;
2027 	uint_t	mtype;
2028 	page_t	*ret_pp = NULL;
2029 
2030 	ASSERT(cur_szc != 0);
2031 	ASSERT(new_szc < cur_szc);
2032 
2033 	pplist = page_numtopp_nolock(pfnum);
2034 	ASSERT(pplist != NULL);
2035 
2036 	ASSERT(pplist->p_szc == cur_szc);
2037 
2038 	bin = PP_2_BIN(pplist);
2039 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2040 	mtype = PP_2_MTYPE(pplist);
2041 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2042 
2043 	CHK_LPG(pplist, cur_szc);
2044 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2045 
2046 	/*
2047 	 * Number of PAGESIZE pages for smaller new_szc
2048 	 * page.
2049 	 */
2050 	npgs = page_get_pagecnt(new_szc);
2051 
2052 	while (pplist) {
2053 		pp = pplist;
2054 
2055 		ASSERT(pp->p_szc == cur_szc);
2056 
2057 		/*
2058 		 * We either break it up into PAGESIZE pages or larger.
2059 		 */
2060 		if (npgs == 1) {	/* PAGESIZE case */
2061 			mach_page_sub(&pplist, pp);
2062 			ASSERT(pp->p_szc == cur_szc);
2063 			ASSERT(new_szc == 0);
2064 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2065 			pp->p_szc = new_szc;
2066 			bin = PP_2_BIN(pp);
2067 			if ((bin == color) && (flags == PC_ALLOC) &&
2068 			    (ret_pp == NULL) &&
2069 			    page_trylock_cons(pp, SE_EXCL)) {
2070 				ret_pp = pp;
2071 			} else {
2072 				mtype = PP_2_MTYPE(pp);
2073 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2074 				    mtype), pp);
2075 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2076 			}
2077 		} else {
2078 
2079 			/*
2080 			 * Break down into smaller lists of pages.
2081 			 */
2082 			page_list_break(&pplist, &npplist, npgs);
2083 
2084 			pp = pplist;
2085 			n = npgs;
2086 			while (n--) {
2087 				ASSERT(pp->p_szc == cur_szc);
2088 				pp->p_szc = new_szc;
2089 				pp = pp->p_next;
2090 			}
2091 
2092 			CHK_LPG(pplist, new_szc);
2093 
2094 			bin = PP_2_BIN(pplist);
2095 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2096 			if ((bin == color) && (flags == PC_ALLOC) &&
2097 			    (ret_pp == NULL) &&
2098 			    page_trylock_cons(pp, SE_EXCL)) {
2099 				ret_pp = pp;
2100 			} else {
2101 				mtype = PP_2_MTYPE(pp);
2102 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2103 				    bin, mtype), pplist);
2104 
2105 				page_ctr_add(mnode, mtype, pplist,
2106 				    PG_FREE_LIST);
2107 			}
2108 			pplist = npplist;
2109 		}
2110 	}
2111 	return (ret_pp);
2112 }
2113 
/*
 * When non-zero, page_freelist_coalesce() returns NULL without searching
 * and page_freelist_coalesce_all() returns without promoting any pages.
 */
int mpss_coalesce_disable = 0;
2115 
/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * Arguments:
 *	mnode	 - memory node whose page counters and freelists are used
 *	szc	 - size code of the large page wanted; it is also used as the
 *		   page counter region size (r)
 *	color	 - wanted color; only the bits selected by ceq_mask are kept
 *	ceq_mask - color bits a candidate pfn has to match
 *	mtype	 - memory type; converted to a counter range with
 *		   MTYPE_2_MRANGE()
 *	pfnhi	 - if not PFNNULL, search for large page with pfn range less
 *		   than pfnhi
 *
 * page_ctrs_rwlock[mnode] is held as reader for the whole search and
 * page_freelist_lock(mnode) is taken around each promotion attempt.  Both
 * are dropped before returning.
 */
page_t *
page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
    int mtype, pfn_t pfnhi)
{
	int 	r = szc;		/* region size */
	int	mrange;
	uint_t 	full, bin, color_mask, wrap = 0;
	pfn_t	pfnum, lo, hi;
	size_t	len, idx, idx0;
	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
	page_t	*ret_pp;
#if defined(__sparc)
	pfn_t pfnum0, nlo, nhi;
#endif

	/* Coalescing switched off: only count the request and give up. */
	if (mpss_coalesce_disable) {
		ASSERT(szc < MMU_PAGE_SIZES);
		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
		return (NULL);
	}

	ASSERT(szc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
	ASSERT(ceq_mask <= color_mask);
	ASSERT(color <= color_mask);
	color &= ceq_mask;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	mrange = MTYPE_2_MRANGE(mnode, mtype);
	ASSERT(mrange < mnode_nranges[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);

	/* get pfn range for mtype */
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
#if defined(__sparc)
	lo = PAGE_COUNTERS_BASE(mnode, r);
	hi = IDX_TO_PNUM(mnode, r, len);
#else
	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	hi++;
#endif

	/* use lower limit if given */
	if (pfnhi != PFNNULL && pfnhi < hi)
		hi = pfnhi;

	/* round to szcpgcnt boundaries */
	lo = P2ROUNDUP(lo, szcpgcnt);
	hi = hi & ~(szcpgcnt - 1);

	/* set lo to the closest pfn of the right color */
	if ((PFN_2_COLOR(lo, szc) ^ color) & ceq_mask) {
		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask);
	}

	/* nothing left to search once the range has been trimmed */
	if (hi <= lo) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	full = FULL_REGION_CNT(r);

	/*
	 * calculate the number of page candidates and initial search index:
	 * every bin visited by the loop below adds its candidate count to
	 * cands, and the smallest per-color saved index among the bins that
	 * have candidates becomes idx0.
	 */
	bin = color;
	idx0 = (size_t)(-1);
	do {
		pgcnt_t acand;

		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
		if (acand) {
			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
			    r, bin, mrange);
			idx0 = MIN(idx0, idx);
			cands += acand;
		}
		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
	} while (bin != color);

	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	/* fall back to lo when the saved position lies outside [lo, hi) */
	pfnum = IDX_TO_PNUM(mnode, r, idx0);
	if (pfnum < lo || pfnum >= hi) {
		pfnum = lo;
	} else if ((PFN_2_COLOR(pfnum, szc) ^ color) & ceq_mask) {
		/* pfnum has invalid color get the closest correct pfn */
		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
		    color_mask);
		pfnum = (pfnum >= hi) ? lo : pfnum;
	}

	/* set starting index */
	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
	ASSERT(idx0 < len);

#if defined(__sparc)
	pfnum0 = pfnum;		/* page corresponding to idx0 */
	nhi = 0;		/* search kcage ranges */
#endif

	/*
	 * Scan from idx0 towards the end of the range; after the first wrap
	 * continue from lo for as long as idx stays below idx0.  A second
	 * wrap ends the search.
	 */
	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {

#if defined(__sparc)
		/*
		 * Find lowest intersection of kcage ranges and mnode.
		 * MTYPE_NORELOC means look in the cage, otherwise outside.
		 */
		if (nhi <= pfnum) {
			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
				goto wrapit;

			/* jump to the next page in the range */
			if (pfnum < nlo) {
				pfnum = P2ROUNDUP(nlo, szcpgcnt);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				if (idx >= len || pfnum >= hi)
					goto wrapit;
				if ((PFN_2_COLOR(pfnum, szc) ^ color) &
				    ceq_mask)
					goto next;
			}
		}
#endif

		/* unlocked peek; the counter is checked again under the lock */
		if (PAGE_COUNTERS(mnode, r, idx) != full)
			goto next;

		/*
		 * RFE: For performance maybe we can do something less
		 *	brutal than locking the entire freelist. So far
		 *	this doesn't seem to be a performance problem?
		 */
		page_freelist_lock(mnode);
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			ret_pp =
			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
			if (ret_pp != NULL) {
				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				/* remember where this color's search ended */
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
				    PFN_2_COLOR(pfnum, szc), mrange) = idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
		} else {
			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
		}

		page_freelist_unlock(mnode);
		/*
		 * No point looking for another page if we've
		 * already tried all of the ones that
		 * page_ctr_cands indicated.  Stash off where we left
		 * off.
		 * Note: this is not exact since we don't hold the
		 * page_freelist_locks before we initially get the
		 * value of cands for performance reasons, but should
		 * be a decent approximation.
		 */
		if (--cands == 0) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
			    idx;
			break;
		}
next:
		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
			    color_mask);
		idx = PNUM_TO_IDX(mnode, r, pfnum);
		if (idx >= len || pfnum >= hi) {
wrapit:
			/* ran off the end of the range: restart from lo */
			pfnum = lo;
			idx = PNUM_TO_IDX(mnode, r, pfnum);
			wrap++;
#if defined(__sparc)
			nhi = 0;	/* search kcage ranges */
#endif
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
	return (NULL);
}
2319 
2320 /*
2321  * For the given mnode, promote as many small pages to large pages as possible.
2322  */
2323 void
2324 page_freelist_coalesce_all(int mnode)
2325 {
2326 	int 	r;		/* region size */
2327 	int 	idx, full;
2328 	pfn_t	pfnum;
2329 	size_t	len;
2330 
2331 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2332 
2333 	if (mpss_coalesce_disable) {
2334 		return;
2335 	}
2336 
2337 	/*
2338 	 * Lock the entire freelist and coalesce what we can.
2339 	 *
2340 	 * Always promote to the largest page possible
2341 	 * first to reduce the number of page promotions.
2342 	 */
2343 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2344 	page_freelist_lock(mnode);
2345 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2346 		pgcnt_t cands = 0;
2347 		int mrange, nranges = mnode_nranges[mnode];
2348 
2349 		for (mrange = 0; mrange < nranges; mrange++) {
2350 			PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2351 			if (cands != 0)
2352 				break;
2353 		}
2354 		if (cands == 0) {
2355 			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
2356 			continue;
2357 		}
2358 
2359 		full = FULL_REGION_CNT(r);
2360 		len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2361 
2362 		for (idx = 0; idx < len; idx++) {
2363 			if (PAGE_COUNTERS(mnode, r, idx) == full) {
2364 				pfnum = IDX_TO_PNUM(mnode, r, idx);
2365 				ASSERT(pfnum >=
2366 				    mem_node_config[mnode].physbase &&
2367 				    pfnum <
2368 				    mem_node_config[mnode].physmax);
2369 				(void) page_promote(mnode,
2370 				    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2371 			}
2372 		}
2373 	}
2374 	page_freelist_unlock(mnode);
2375 	rw_exit(&page_ctrs_rwlock[mnode]);
2376 }
2377 
/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 * Returns a page of size szc obtained by demoting a larger free page
 * (the page is returned by page_demote() called with PC_ALLOC), or NULL
 * if no such page could be obtained.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * This routine itself only performs step 1; page_get_mnode_freelist()
 * calls page_freelist_coalesce() after this routine has failed.
 *
 * If pfnhi is not PFNNULL, only large pages whose first pfn is below pfnhi
 * are considered for demotion.
 *
 * plw supplies, per size code, the number of bins still to be examined
 * (plw_bins[]) and the color equivalence masks (plw_ceq_mask[]);
 * plw_bins[] is decremented here as bins are visited.
 */

page_t *
page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
    pfn_t pfnhi, page_list_walker_t *plw)
{
	uchar_t nszc = szc + 1;
	uint_t 	bin, sbin, bin_prev;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;
	uint_t  color_mask;

	/* szc is already the largest size: there is nothing to split */
	if (nszc == mmu_page_sizes)
		return (NULL);

	ASSERT(nszc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);

	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
	/*
	 * First try to break up a larger page to fill current size freelist.
	 */
	while (plw->plw_bins[nszc] != 0) {

		ASSERT(nszc < mmu_page_sizes);

		/*
		 * If page found then demote it.
		 * The list head is peeked at without the lock and read
		 * again once the freelist lock is held.
		 */
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 */
			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while (pp->p_pagenum >= pfnhi);
			}
			if (pp) {
				uint_t ccolor = page_correct_color(szc, nszc,
				    color, bin, plw->plw_ceq_mask[szc]);

				ASSERT(pp->p_szc == nszc);
				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pp->p_szc, szc, ccolor, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}

		/* loop through next size bins */
		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
		plw->plw_bins[nszc]--;

		if (bin == sbin) {
			uchar_t nnszc = nszc + 1;

			/* we are done with this page size - check next */
			if (plw->plw_bins[nnszc] == 0)
				/* we have already checked next size bins */
				break;

			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
			if (bin_prev != INVALID_COLOR) {
				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
				/*
				 * stop when this bin and the previous split
				 * bin agree in all the nnszc color bits that
				 * are compared
				 */
				if (!((bin ^ bin_prev) &
				    plw->plw_ceq_mask[nnszc]))
					break;
			}
			ASSERT(nnszc < mmu_page_sizes);
			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
			nszc = nnszc;
			ASSERT(nszc < mmu_page_sizes);
		}
	}

	return (ret_pp);
}
2498 
2499 /*
2500  * Helper routine used only by the freelist code to lock
2501  * a page. If the page is a large page then it succeeds in
2502  * locking all the constituent pages or none at all.
2503  * Returns 1 on sucess, 0 on failure.
2504  */
2505 static int
2506 page_trylock_cons(page_t *pp, se_t se)
2507 {
2508 	page_t	*tpp, *first_pp = pp;
2509 
2510 	/*
2511 	 * Fail if can't lock first or only page.
2512 	 */
2513 	if (!page_trylock(pp, se)) {
2514 		return (0);
2515 	}
2516 
2517 	/*
2518 	 * PAGESIZE: common case.
2519 	 */
2520 	if (pp->p_szc == 0) {
2521 		return (1);
2522 	}
2523 
2524 	/*
2525 	 * Large page case.
2526 	 */
2527 	tpp = pp->p_next;
2528 	while (tpp != pp) {
2529 		if (!page_trylock(tpp, se)) {
2530 			/*
2531 			 * On failure unlock what we have locked so far.
2532 			 * We want to avoid attempting to capture these
2533 			 * pages as the pcm mutex may be held which could
2534 			 * lead to a recursive mutex panic.
2535 			 */
2536 			while (first_pp != tpp) {
2537 				page_unlock_nocapture(first_pp);
2538 				first_pp = first_pp->p_next;
2539 			}
2540 			return (0);
2541 		}
2542 		tpp = tpp->p_next;
2543 	}
2544 	return (1);
2545 }
2546 
2547 /*
2548  * init context for walking page lists
2549  * Called when a page of the given szc in unavailable. Sets markers
2550  * for the beginning of the search to detect when search has
2551  * completed a full cycle. Sets flags for splitting larger pages
2552  * and coalescing smaller pages. Page walking procedes until a page
2553  * of the desired equivalent color is found.
2554  */
2555 void
2556 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2557     int use_ceq, page_list_walker_t *plw)
2558 {
2559 	uint_t  nszc, ceq_mask, colors;
2560 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2561 
2562 	ASSERT(szc < mmu_page_sizes);
2563 	colors = PAGE_GET_PAGECOLORS(szc);
2564 
2565 	plw->plw_colors = colors;
2566 	plw->plw_color_mask = colors - 1;
2567 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2568 	plw->plw_bin_split_prev = bin;
2569 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2570 
2571 	/*
2572 	 * if vac aliasing is possible make sure lower order color
2573 	 * bits are never ignored
2574 	 */
2575 	if (vac_colors > 1)
2576 		ceq &= 0xf0;
2577 
2578 	/*
2579 	 * calculate the number of non-equivalent colors and
2580 	 * color equivalency mask
2581 	 */
2582 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2583 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2584 	ASSERT(plw->plw_ceq_dif > 0);
2585 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2586 
2587 	if (flags & PG_MATCH_COLOR) {
2588 		if (cpu_page_colors <  0) {
2589 			/*
2590 			 * this is a heterogeneous machine with different CPUs
2591 			 * having different size e$ (not supported for ni2/rock
2592 			 */
2593 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2594 			cpucolors = MAX(cpucolors, 1);
2595 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2596 			plw->plw_ceq_mask[szc] =
2597 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2598 		}
2599 		plw->plw_ceq_dif = 1;
2600 	}
2601 
2602 	/* we can split pages in the freelist, but not the cachelist */
2603 	if (can_split) {
2604 	    plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2605 
2606 	    /* calculate next sizes color masks and number of free list bins */
2607 	    for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2608 		plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2609 		    plw->plw_ceq_mask[szc]);
2610 		plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2611 	    }
2612 	    plw->plw_ceq_mask[nszc] = INVALID_MASK;
2613 	    plw->plw_bins[nszc] = 0;
2614 
2615 	} else {
2616 	    ASSERT(szc == 0);
2617 	    plw->plw_do_split = 0;
2618 	    plw->plw_bins[1] = 0;
2619 	    plw->plw_ceq_mask[1] = INVALID_MASK;
2620 	}
2621 }
2622 
/*
 * set mark to flag where next split should occur
 *
 * Stores in plw->plw_split_next the next-size color that follows the
 * next-size color of bin (as produced by INC_MASKED()).  If that value
 * matches the next-size color of plw->plw_bin0 in every bit selected by
 * plw->plw_ceq_mask[nszc], it is advanced one more step.
 *
 * The plw and nszc arguments are parenthesized so that expressions such
 * as "&walker" can be passed.  The macro expands to a brace-enclosed
 * block that declares the locals bin_nsz, bin0_nsz and neq_mask.
 */
#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, (plw)->plw_bin0);	     \
	uint_t neq_mask = ~(plw)->plw_ceq_mask[(nszc)] &		     \
	    (plw)->plw_color_mask;					     \
	(plw)->plw_split_next =						     \
	    INC_MASKED(bin_nsz, neq_mask, (plw)->plw_color_mask);	     \
	if (!(((plw)->plw_split_next ^ bin0_nsz) &			     \
	    (plw)->plw_ceq_mask[(nszc)])) {				     \
		(plw)->plw_split_next =					     \
		    INC_MASKED((plw)->plw_split_next,			     \
		    neq_mask, (plw)->plw_color_mask);			     \
	}								     \
}
2638 
/*
 * Return the bin to examine after 'bin' for pages of size szc, updating
 * the walker state in plw along the way.
 *
 * The basic step is ADD_MASKED(bin, plw_bin_step, ...) using the
 * complement of the szc color equivalence mask.  If plw_do_split is set
 * on entry, bin is recorded in plw_bin_split_prev, the next split marker
 * is computed and the flag is cleared.  For szc == 0 extra handling is
 * applied around plw_bin0 and plw_bin_marker.  Before the final return,
 * plw_do_split is set when larger-size bins remain (plw_bins[szc + 1] != 0)
 * and the next-size color of the returned bin matches plw_split_next
 * in the bits selected by the next-size equivalence mask.
 *
 * NOTE(review): the exact stepping semantics of ADD_MASKED/INC_MASKED are
 * defined outside this function - confirm against their definitions.
 */
uint_t
page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
{
	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
	uchar_t nszc = szc + 1;

	nbin = ADD_MASKED(bin,
	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);

	/* a split was requested for 'bin': remember it and set the marker */
	if (plw->plw_do_split) {
		plw->plw_bin_split_prev = bin;
		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
		plw->plw_do_split = 0;
	}

	if (szc == 0) {
		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
			/* step over the starting bin when we land on it */
			if (nbin == plw->plw_bin0 &&
			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
				    neq_mask, plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
			}

			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
				plw->plw_bin_marker =
				    nbin = INC_MASKED(nbin, neq_mask,
					plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
				/*
				 * large pages all have the same vac color
				 * so by now we should be done with next
				 * size page splitting process
				 */
				ASSERT(plw->plw_bins[1] == 0);
				plw->plw_do_split = 0;
				return (nbin);
			}

		} else {
			/*
			 * First call of the walk (plw_count == 0): jump
			 * from bin0 by bin_jump, which is derived from
			 * BIN_STEP, instead of taking a single step.
			 */
			uint_t bin_jump = (vac_colors == 1) ?
			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;

			bin_jump &= ~(vac_colors - 1);

			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
			    plw->plw_color_mask);

			/* only use the jump if it lands on a different color */
			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {

				plw->plw_bin_marker = nbin = nbin0;

				if (plw->plw_bins[nszc] != 0) {
					/*
					 * check if next page size bin is the
					 * same as the next page size bin for
					 * bin0
					 */
					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
					    nbin);
					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
					    plw->plw_bin0);

					if ((bin0_nsz ^ nbin_nsz) &
					    plw->plw_ceq_mask[nszc])
						plw->plw_do_split = 1;
				}
				return (nbin);
			}
		}
	}

	/* request a split when nbin reaches the split marker */
	if (plw->plw_bins[nszc] != 0) {
	    nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
	    if (!((plw->plw_split_next ^ nbin_nsz) &
		plw->plw_ceq_mask[nszc]))
		plw->plw_do_split = 1;
	}

	return (nbin);
}
2721 
/*
 * Try to take a free page of size code szc from the freelists of mnode,
 * starting with color 'bin' and memory type 'mtype'.
 *
 * On success the page has been locked SE_EXCL with page_trylock_cons(),
 * removed from its freelist, and the page counters have been adjusted;
 * the page is returned.  NULL is returned if no page could be obtained.
 *
 * The search first tries every bin that is equivalent to the requested
 * one.  When all of them are empty it tries to split a larger page
 * (page_freelist_split()) and, for szc > 0, to coalesce smaller pages
 * (page_freelist_coalesce()).  If the walker reports more than one
 * non-equivalent color (plw_ceq_dif > 1) the search is repeated from the
 * next bin.  Finally MTYPE_NEXT() may select another mtype and the whole
 * search is restarted.
 */
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
		return (NULL);
	}
try_again:

	/*
	 * The walker is initialized lazily, the first time a bin turns out
	 * to be empty.  Until then plw_ceq_dif == 1 keeps the outer loop
	 * to a single pass.
	 */
	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one freelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {
			/* unlocked peek; re-read below under the bin mutex */
			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
				goto bin_empty_1;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			/*
			 * These were set before the page
			 * was put on the free list,
			 * they must still be set.
			 */
			ASSERT(PP_ISFREE(pp));
			ASSERT(PP_ISAGED(pp));
			ASSERT(pp->p_vnode == NULL);
			ASSERT(pp->p_hash == NULL);
			ASSERT(pp->p_offset == (u_offset_t)-1);
			ASSERT(pp->p_szc == szc);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

			/*
			 * Walk down the hash chain.
			 * 8k pages are linked on p_next
			 * and p_prev fields. Large pages
			 * are a contiguous group of
			 * constituent pages linked together
			 * on their p_next and p_prev fields.
			 * The large pages are linked together
			 * on the hash chain using p_vpnext
			 * p_vpprev of the base constituent
			 * page of each large page.
			 */
			first_pp = pp;
			while (!page_trylock_cons(pp, SE_EXCL)) {
				if (szc == 0) {
					pp = pp->p_next;
				} else {
					pp = pp->p_vpnext;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* back at the head: nothing here is lockable */
				if (pp == first_pp)
					goto bin_empty_0;
			}

			/* pp is locked: take it off the list and account */
			ASSERT(pp != NULL);
			ASSERT(mtype == PP_2_MTYPE(pp));
			ASSERT(pp->p_szc == szc);
			if (szc == 0) {
				page_sub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
			} else {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
				CHK_LPG(pp, szc);
			}
			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
				panic("free page is not. pp %p", (void *)pp);
			mutex_exit(pcm);

#if defined(__sparc)
			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
			    (flags & PG_NORELOC) == 0);

			if (PP_ISNORELOC(pp))
				kcage_freemem_sub(page_get_pagecnt(szc));
#endif
			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
			return (pp);

bin_empty_0:
			/* reached with pcm held */
			mutex_exit(pcm);
bin_empty_1:
			/* reached without pcm held */
			if (plw_initialized == 0) {
				page_list_walk_init(szc, flags, bin, 1, 1,
				    &plw);
				plw_initialized = 1;
				ASSERT(plw.plw_colors <=
				    PAGE_GET_PAGECOLORS(szc));
				ASSERT(plw.plw_colors > 0);
				ASSERT((plw.plw_colors &
				    (plw.plw_colors - 1)) == 0);
				ASSERT(bin < plw.plw_colors);
				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
		} while (sbin != bin);

		/*
		 * color bins are all empty if color match. Try and
		 * satisfy the request by breaking up or coalescing
		 * pages from a different size freelist of the correct
		 * color that satisfies the ORIGINAL color requested.
		 * If that fails then try pages of the same size but
		 * different colors assuming we are not called with
		 * PG_MATCH_COLOR.
		 */
		if (plw.plw_do_split &&
		    (pp = page_freelist_split(szc, bin, mnode,
			mtype, PFNNULL, &plw)) != NULL)
		    return (pp);

		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
			return (pp);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(szc, bin, &plw);
	}

	/* if allowed, cycle through additional mtypes */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

	return (NULL);
}
2887 
2888 /*
2889  * Returns the count of free pages for 'pp' with size code 'szc'.
2890  * Note: This function does not return an exact value as the page freelist
2891  * locks are not held and thus the values in the page_counters may be
2892  * changing as we walk through the data.
2893  */
2894 static int
2895 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2896 {
2897 	pgcnt_t	pgfree;
2898 	pgcnt_t cnt;
2899 	ssize_t	r = szc;	/* region size */
2900 	ssize_t	idx;
2901 	int	i;
2902 	int	full, range;
2903 
2904 	/* Make sure pagenum passed in is aligned properly */
2905 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
2906 	ASSERT(szc > 0);
2907 
2908 	/* Prevent page_counters dynamic memory from being freed */
2909 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2910 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2911 	cnt = PAGE_COUNTERS(mnode, r, idx);
2912 	pgfree = cnt << PNUM_SHIFT(r - 1);
2913 	range = FULL_REGION_CNT(szc);
2914 
2915 	/* Check for completely full region */
2916 	if (cnt == range) {
2917 		rw_exit(&page_ctrs_rwlock[mnode]);
2918 		return (pgfree);
2919 	}
2920 
2921 	while (--r > 0) {
2922 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2923 		full = FULL_REGION_CNT(r);
2924 		for (i = 0; i < range; i++, idx++) {
2925 			cnt = PAGE_COUNTERS(mnode, r, idx);
2926 			/*
2927 			 * If cnt here is full, that means we have already
2928 			 * accounted for these pages earlier.
2929 			 */
2930 			if (cnt != full) {
2931 				pgfree += (cnt << PNUM_SHIFT(r - 1));
2932 			}
2933 		}
2934 		range *= full;
2935 	}
2936 	rw_exit(&page_ctrs_rwlock[mnode]);
2937 	return (pgfree);
2938 }
2939 
2940 /*
2941  * Called from page_geti_contig_pages to exclusively lock constituent pages
2942  * starting from 'spp' for page size code 'szc'.
2943  *
2944  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
2945  * region needs to be greater than or equal to the threshold.
2946  */
2947 static int
2948 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
2949 {
2950 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
2951 	pgcnt_t pgfree, i;
2952 	page_t *pp;
2953 
2954 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
2955 
2956 
2957 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
2958 		goto skipptcpcheck;
2959 	/*
2960 	 * check if there are sufficient free pages available before attempting
2961 	 * to trylock. Count is approximate as page counters can change.
2962 	 */
2963 	pgfree = page_freecnt(mnode, spp, szc);
2964 
2965 	/* attempt to trylock if there are sufficient already free pages */
2966 	if (pgfree < pgcnt/ptcpthreshold) {
2967 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
2968 		return (0);
2969 	}
2970 
2971 skipptcpcheck:
2972 
2973 	for (i = 0; i < pgcnt; i++) {
2974 		pp = &spp[i];
2975 		if (!page_trylock(pp, SE_EXCL)) {
2976 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
2977 			while (--i != (pgcnt_t)-1) {
2978 				pp = &spp[i];
2979 				ASSERT(PAGE_EXCL(pp));
2980 				page_unlock_nocapture(pp);
2981 			}
2982 			return (0);
2983 		}
2984 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
2985 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
2986 		    !PP_ISFREE(pp)) {
2987 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
2988 			ASSERT(i == 0);
2989 			page_unlock_nocapture(pp);
2990 			return (0);
2991 		}
2992 		if (PP_ISNORELOC(pp)) {
2993 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
2994 			while (i != (pgcnt_t)-1) {
2995 				pp = &spp[i];
2996 				ASSERT(PAGE_EXCL(pp));
2997 				page_unlock_nocapture(pp);
2998 				i--;
2999 			}
3000 			return (0);
3001 		}
3002 	}
3003 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3004 	return (1);
3005 }
3006 
/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 *
 * Arguments:
 *	pp	- first constituent page; the remaining pages are reached
 *		  by incrementing the page_t pointer
 *	szc	- size code of the large page being assembled
 *	flags	- only PGI_PGCPSZC0 is examined here
 *
 * Returns the list of claimed pages, each with p_szc set to szc, or
 * 'pp' itself if it already was a free page of size szc.  Returns NULL
 * if a page in use could not be replaced or relocated; in that case the
 * pages not yet processed are unlocked and the pages already collected
 * are put back on the freelist as PAGESIZE pages and unlocked.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t pgcnt, npgs, i;
	page_t *targpp, *rpp, *hpp;
	page_t *replpp = NULL;
	page_t *pplist = NULL;

	ASSERT(pp != NULL);

	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				if (pp->p_szc == szc) {
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				/* retag every constituent with the new szc */
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			/* free but not aged: taken from the cachelist */
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, NULL);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		/* the page is in use and has to be relocated */
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock_nocapture(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		/* LINTED */
		ASSERT(hpp = pp); /* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		/*
		 * Mark each relocated target page free and tag it with szc;
		 * for each one, take a page off the replacement list and
		 * unlock it.
		 */
		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock_nocapture(rpp);
		}
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}
3148 
3149 /*
3150  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3151  * of 0 means nothing left after trim.
3152  */
3153 int
3154 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3155 {
3156 	pfn_t	kcagepfn;
3157 	int	decr;
3158 	int	rc = 0;
3159 
3160 	if (PP_ISNORELOC(mseg->pages)) {
3161 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3162 
3163 			/* lower part of this mseg inside kernel cage */
3164 			decr = kcage_current_pfn(&kcagepfn);
3165 
3166 			/* kernel cage may have transitioned past mseg */
3167 			if (kcagepfn >= mseg->pages_base &&
3168 			    kcagepfn < mseg->pages_end) {
3169 				ASSERT(decr == 0);
3170 				*lo = kcagepfn;
3171 				*hi = MIN(pfnhi,
3172 				    (mseg->pages_end - 1));
3173 				rc = 1;
3174 			}
3175 		}
3176 		/* else entire mseg in the cage */
3177 	} else {
3178 		if (PP_ISNORELOC(mseg->epages - 1)) {
3179 
3180 			/* upper part of this mseg inside kernel cage */
3181 			decr = kcage_current_pfn(&kcagepfn);
3182 
3183 			/* kernel cage may have transitioned past mseg */
3184 			if (kcagepfn >= mseg->pages_base &&
3185 			    kcagepfn < mseg->pages_end) {
3186 				ASSERT(decr);
3187 				*hi = kcagepfn;
3188 				*lo = MAX(pfnlo, mseg->pages_base);
3189 				rc = 1;
3190 			}
3191 		} else {
3192 			/* entire mseg outside of kernel cage */
3193 			*lo = MAX(pfnlo, mseg->pages_base);
3194 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3195 			rc = 1;
3196 		}
3197 	}
3198 	return (rc);
3199 }
3200 
3201 /*
3202  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3203  * page with size code 'szc'. Claiming such a page requires acquiring
3204  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3205  * relocating pages in use and concatenating these constituent pages into a
3206  * large page.
3207  *
3208  * The page lists do not have such a large page and page_freelist_split has
3209  * already failed to demote larger pages and/or coalesce smaller free pages.
3210  *
3211  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3212  * pages with the same color as 'bin'.
3213  *
3214  * 'pfnflag' specifies the subset of the pfn range to search.
3215  */
3216 
3217 
3218 static page_t *
3219 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3220     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3221 {
3222 	struct memseg *mseg;
3223 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3224 	pgcnt_t szcpgmask = szcpgcnt - 1;
3225 	pfn_t	randpfn;
3226 	page_t *pp, *randpp, *endpp;
3227 	uint_t colors, ceq_mask;
3228 	/* LINTED : set but not used in function */
3229 	uint_t color_mask;
3230 	pfn_t hi, lo;
3231 	uint_t skip;
3232 
3233 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3234 
3235 	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
3236 		return (NULL);
3237 
3238 	ASSERT(szc < mmu_page_sizes);
3239 
3240 	colors = PAGE_GET_PAGECOLORS(szc);
3241 	color_mask = colors - 1;
3242 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3243 		uchar_t ceq = colorequivszc[szc];
3244 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3245 
3246 		ASSERT(ceq_dif > 0);
3247 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3248 	} else {
3249 		ceq_mask = 0;
3250 	}
3251 
3252 	ASSERT(bin < colors);
3253 
3254 	/* clear "non-significant" color bits */
3255 	bin &= ceq_mask;
3256 
3257 	/*
3258 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3259 	 * when there have been previous page_get_contig_page failures to
3260 	 * limit the search.
3261 	 *
3262 	 * The high bit in pfnflag specifies the number of 'slots' in the
3263 	 * pfn range and the remainder of pfnflag specifies which slot.
3264 	 * For example, a value of 1010b would mean the second slot of
3265 	 * the pfn range that has been divided into 8 slots.
3266 	 */
3267 	if (pfnflag > 1) {
3268 		int	slots = 1 << (highbit(pfnflag) - 1);
3269 		int	slotid = pfnflag & (slots - 1);
3270 		pgcnt_t	szcpages;
3271 		int	slotlen;
3272 
3273 		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3274 		pfnhi = pfnhi & ~(szcpgcnt - 1);
3275 
3276 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3277 		slotlen = howmany(szcpages, slots);
3278 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3279 		ASSERT(pfnlo < pfnhi);
3280 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3281 			pfnhi = pfnlo + (slotlen * szcpgcnt);
3282 	}
3283 
3284 	memsegs_lock(0);
3285 
3286 	/*
3287 	 * loop through memsegs to look for contig page candidates
3288 	 */
3289 
3290 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3291 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3292 			/* no overlap */
3293 			continue;
3294 		}
3295 
3296 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3297 			/* mseg too small */
3298 			continue;
3299 
3300 		/* trim off kernel cage pages from pfn range */
3301 		if (kcage_on) {
3302 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
3303 				continue;
3304 		} else {
3305 			lo = MAX(pfnlo, mseg->pages_base);
3306 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3307 		}
3308 
3309 		/* round to szcpgcnt boundaries */
3310 		lo = P2ROUNDUP(lo, szcpgcnt);
3311 		hi = hi & ~(szcpgcnt - 1);
3312 
3313 		if (hi <= lo)
3314 			continue;
3315 
3316 		/*
3317 		 * set lo to point to the pfn for the desired bin. Large
3318 		 * page sizes may only have a single page color
3319 		 */
3320 		skip = szcpgcnt;
3321 		if (ceq_mask > 0) {
3322 			/* set lo to point at appropriate color */
3323 			PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3324 			    color_mask);
3325 			if (hi <= lo)
3326 				/* mseg cannot satisfy color request */
3327 				continue;
3328 		}
3329 
3330 		/* randomly choose a point between lo and hi to begin search */
3331 
3332 		randpfn = (pfn_t)GETTICK();
3333 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3334 		if (ceq_mask) {
3335 			PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, ceq_mask,
3336 			    color_mask);
3337 			randpfn = (randpfn >= hi) ? lo : randpfn;
3338 		}
3339 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3340 
3341 		ASSERT(randpp->p_pagenum == randpfn);
3342 
3343 		pp = randpp;
3344 		endpp =  mseg->pages + (hi - mseg->pages_base);
3345 
3346 		ASSERT(randpp + szcpgcnt <= endpp);
3347 
3348 		do {
3349 			ASSERT(!(pp->p_pagenum & szcpgmask));
3350 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3351 
3352 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3353 				/* pages unlocked by page_claim on failure */
3354 				if (page_claim_contig_pages(pp, szc, flags)) {
3355 					memsegs_unlock(0);
3356 					return (pp);
3357 				}
3358 			}
3359 
3360 			if (ceq_mask == 0) {
3361 				pp += skip;
3362 			} else {
3363 				pfn_t pfn = pp->p_pagenum;
3364 
3365 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3366 				    ceq_mask, color_mask);
3367 				pp = mseg->pages + (pfn - mseg->pages_base);
3368 			}
3369 			if (pp >= endpp) {
3370 				/* start from the beginning */
3371 				pp = mseg->pages + (lo - mseg->pages_base);
3372 				ASSERT(pp->p_pagenum == lo);
3373 				ASSERT(pp + szcpgcnt <= endpp);
3374 			}
3375 		} while (pp != randpp);
3376 	}
3377 	memsegs_unlock(0);
3378 	return (NULL);
3379 }
3380 
3381 
3382 /*
3383  * controlling routine that searches through physical memory in an attempt to
3384  * claim a large page based on the input parameters.
3385  * on the page free lists.
3386  *
3387  * calls page_geti_contig_pages with an initial pfn range from the mnode
3388  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3389  * that overlaps with the kernel cage or does not match the requested page
3390  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3391  * page_geti_contig_pages may further limit the search range based on
3392  * previous failure counts (pgcpfailcnt[]).
3393  *
3394  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3395  * pagesize page that satisfies mtype.
3396  */
3397 page_t *
3398 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3399     uint_t flags)
3400 {
3401 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3402 	page_t		*pp;
3403 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3404 
3405 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3406 
3407 	/* no allocations from cage */
3408 	flags |= PGI_NOCAGE;
3409 
3410 	/* LINTED */
3411 	MTYPE_START(mnode, mtype, flags);
3412 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3413 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3414 		return (NULL);
3415 	}
3416 
3417 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3418 
3419 	/* do not limit search and ignore color if hi pri */
3420 
3421 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3422 		pfnflag = pgcpfailcnt[szc];
3423 
3424 	/* remove color match to improve chances */
3425 
3426 	if (flags & PGI_PGCPHIPRI || pfnflag)
3427 		flags &= ~PG_MATCH_COLOR;
3428 
3429 	do {
3430 		/* get pfn range based on mnode and mtype */
3431 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3432 
3433 		ASSERT(pfnhi >= pfnlo);
3434 
3435 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3436 		    pfnlo, pfnhi, pfnflag);
3437 
3438 		if (pp != NULL) {
3439 			pfnflag = pgcpfailcnt[szc];
3440 			if (pfnflag) {
3441 				/* double the search size */
3442 				pgcpfailcnt[szc] = pfnflag >> 1;
3443 			}
3444 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3445 			return (pp);
3446 		}
3447 		MTYPE_NEXT(mnode, mtype, flags);
3448 	} while (mtype >= 0);
3449 
3450 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3451 	return (NULL);
3452 }
3453 
3454 
3455 /*
3456  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3457  *
3458  * Does its own locking and accounting.
3459  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3460  * pages of the proper color even if there are pages of a different color.
3461  *
3462  * Finds a page, removes it, THEN locks it.
3463  */
3464 
3465 /*ARGSUSED*/
3466 page_t *
3467 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3468 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3469 {
3470 	struct as	*as = seg->s_as;
3471 	page_t		*pp = NULL;
3472 	ulong_t		bin;
3473 	uchar_t		szc;
3474 	int		mnode;
3475 	int		mtype;
3476 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3477 	lgrp_mnode_cookie_t	lgrp_cookie;
3478 
3479 	page_get_func = page_get_mnode_freelist;
3480 
3481 	/*
3482 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3483 	 * assume we wish to allocate near to the current thread's home.
3484 	 */
3485 	if (!LGRP_EXISTS(lgrp))
3486 		lgrp = lgrp_home_lgrp();
3487 
3488 	if (kcage_on) {
3489 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3490 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3491 		    curthread != kcage_cageout_thread) {
3492 			/*
3493 			 * Set a "reserve" of kcage_throttlefree pages for
3494 			 * PG_PANIC and cageout thread allocations.
3495 			 *
3496 			 * Everybody else has to serialize in
3497 			 * page_create_get_something() to get a cage page, so
3498 			 * that we don't deadlock cageout!
3499 			 */
3500 			return (NULL);
3501 		}
3502 	} else {
3503 		flags &= ~PG_NORELOC;
3504 		flags |= PGI_NOCAGE;
3505 	}
3506 
3507 	/* LINTED */
3508 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3509 
3510 	/*
3511 	 * Convert size to page size code.
3512 	 */
3513 	if ((szc = page_szc(size)) == (uchar_t)-1)
3514 		panic("page_get_freelist: illegal page size request");
3515 	ASSERT(szc < mmu_page_sizes);
3516 
3517 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3518 
3519 	/* LINTED */
3520 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3521 
3522 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3523 
3524 	/*
3525 	 * Try to get a local page first, but try remote if we can't
3526 	 * get a page of the right color.
3527 	 */
3528 pgretry:
3529 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3530 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3531 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3532 		if (pp != NULL) {
3533 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3534 			DTRACE_PROBE4(page__get,
3535 			    lgrp_t *, lgrp,
3536 			    int, mnode,
3537 			    ulong_t, bin,
3538 			    uint_t, flags);
3539 			return (pp);
3540 		}
3541 	}
3542 	ASSERT(pp == NULL);
3543 
3544 	/*
3545 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3546 	 * remote free lists.  Caller expected to call page_get_cachelist which
3547 	 * will check local cache lists and remote free lists.
3548 	 */
3549 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3550 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3551 		return (NULL);
3552 	}
3553 
3554 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3555 
3556 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3557 
3558 	/*
3559 	 * Try to get a non-local freelist page.
3560 	 */
3561 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3562 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3563 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3564 		if (pp != NULL) {
3565 			DTRACE_PROBE4(page__get,
3566 			    lgrp_t *, lgrp,
3567 			    int, mnode,
3568 			    ulong_t, bin,
3569 			    uint_t, flags);
3570 			VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3571 			return (pp);
3572 		}
3573 	}
3574 
3575 	ASSERT(pp == NULL);
3576 
3577 	/*
3578 	 * when the cage is off chances are page_get_contig_pages() will fail
3579 	 * to lock a large page chunk therefore when the cage is off it's not
3580 	 * called by default.  this can be changed via /etc/system.
3581 	 *
3582 	 * page_get_contig_pages() also called to acquire a base pagesize page
3583 	 * for page_create_get_something().
3584 	 */
3585 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3586 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3587 	    (page_get_func != page_get_contig_pages)) {
3588 
3589 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3590 		page_get_func = page_get_contig_pages;
3591 		goto pgretry;
3592 	}
3593 
3594 	if (pgcplimitsearch && page_get_func == page_get_contig_pages)
3595 		SETPGCPFAILCNT(szc);
3596 
3597 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3598 	return (NULL);
3599 }
3600 
3601 /*
3602  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3603  *
3604  * Does its own locking.
3605  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3606  * pages of the proper color even if there are pages of a different color.
3607  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3608  * try to lock one of them.  If no page can be locked, try the
3609  * next bin.  Return NULL if a page can not be found and locked.
3610  *
3611  * Finds a pages, trys to lock it, then removes it.
3612  */
3613 
3614 /*ARGSUSED*/
3615 page_t *
3616 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3617     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3618 {
3619 	page_t		*pp;
3620 	struct as	*as = seg->s_as;
3621 	ulong_t		bin;
3622 	/*LINTED*/
3623 	int		mnode;
3624 	int		mtype;
3625 	lgrp_mnode_cookie_t	lgrp_cookie;
3626 
3627 	/*
3628 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
3629 	 * assume we wish to allocate near to the current thread's home.
3630 	 */
3631 	if (!LGRP_EXISTS(lgrp))
3632 		lgrp = lgrp_home_lgrp();
3633 
3634 	if (!kcage_on) {
3635 		flags &= ~PG_NORELOC;
3636 		flags |= PGI_NOCAGE;
3637 	}
3638 
3639 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3640 	    kcage_freemem <= kcage_throttlefree) {
3641 		/*
3642 		 * Reserve kcage_throttlefree pages for critical kernel
3643 		 * threads.
3644 		 *
3645 		 * Everybody else has to go to page_create_get_something()
3646 		 * to get a cage page, so we don't deadlock cageout.
3647 		 */
3648 		return (NULL);
3649 	}
3650 
3651 	/* LINTED */
3652 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3653 
3654 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3655 
3656 	/* LINTED */
3657 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3658 
3659 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3660 
3661 	/*
3662 	 * Try local cachelists first
3663 	 */
3664 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3665 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3666 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3667 		if (pp != NULL) {
3668 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3669 			DTRACE_PROBE4(page__get,
3670 			    lgrp_t *, lgrp,
3671 			    int, mnode,
3672 			    ulong_t, bin,
3673 			    uint_t, flags);
3674 			return (pp);
3675 		}
3676 	}
3677 
3678 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3679 
3680 	/*
3681 	 * Try freelists/cachelists that are farther away
3682 	 * This is our only chance to allocate remote pages for PAGESIZE
3683 	 * requests.
3684 	 */
3685 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3686 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3687 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3688 		    0, flags);
3689 		if (pp != NULL) {
3690 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3691 			DTRACE_PROBE4(page__get,
3692 			    lgrp_t *, lgrp,
3693 			    int, mnode,
3694 			    ulong_t, bin,
3695 			    uint_t, flags);
3696 			return (pp);
3697 		}
3698 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3699 		if (pp != NULL) {
3700 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3701 			DTRACE_PROBE4(page__get,
3702 			    lgrp_t *, lgrp,
3703 			    int, mnode,
3704 			    ulong_t, bin,
3705 			    uint_t, flags);
3706 			return (pp);
3707 		}
3708 	}
3709 
3710 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3711 	return (NULL);
3712 }
3713 
3714 page_t *
3715 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3716 {
3717 	kmutex_t		*pcm;
3718 	page_t			*pp, *first_pp;
3719 	uint_t			sbin;
3720 	int			plw_initialized;
3721 	page_list_walker_t	plw;
3722 
3723 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3724 
3725 	/* LINTED */
3726 	MTYPE_START(mnode, mtype, flags);
3727 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3728 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3729 		return (NULL);
3730 	}
3731 
3732 try_again:
3733 
3734 	plw_initialized = 0;
3735 	plw.plw_ceq_dif = 1;
3736 
3737 	/*
3738 	 * Only hold one cachelist lock at a time, that way we
3739 	 * can start anywhere and not have to worry about lock
3740 	 * ordering.
3741 	 */
3742 
3743 	for (plw.plw_count = 0;
3744 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3745 		sbin = bin;
3746 		do {
3747 
3748 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
3749 				goto bin_empty_1;
3750 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3751 			mutex_enter(pcm);
3752 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3753 			if (pp == NULL)
3754 				goto bin_empty_0;
3755 
3756 			first_pp = pp;
3757 			ASSERT(pp->p_vnode);
3758 			ASSERT(PP_ISAGED(pp) == 0);
3759 			ASSERT(pp->p_szc == 0);
3760 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3761 			while (!page_trylock(pp, SE_EXCL)) {
3762 				pp = pp->p_next;
3763 				ASSERT(pp->p_szc == 0);
3764 				if (pp == first_pp) {
3765 					/*
3766 					 * We have searched the complete list!
3767 					 * And all of them (might only be one)
3768 					 * are locked. This can happen since
3769 					 * these pages can also be found via
3770 					 * the hash list. When found via the
3771 					 * hash list, they are locked first,
3772 					 * then removed. We give up to let the
3773 					 * other thread run.
3774 					 */
3775 					pp = NULL;
3776 					break;
3777 				}
3778 				ASSERT(pp->p_vnode);
3779 				ASSERT(PP_ISFREE(pp));
3780 				ASSERT(PP_ISAGED(pp) == 0);
3781 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3782 				    mnode);
3783 			}
3784 
3785 			if (pp) {
3786 				page_t	**ppp;
3787 				/*
3788 				 * Found and locked a page.
3789 				 * Pull it off the list.
3790 				 */
3791 				ASSERT(mtype == PP_2_MTYPE(pp));
3792 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3793 				page_sub(ppp, pp);
3794 				/*
3795 				 * Subtract counters before releasing pcm mutex
3796 				 * to avoid a race with page_freelist_coalesce
3797 				 * and page_freelist_split.
3798 				 */
3799 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3800 				mutex_exit(pcm);
3801 				ASSERT(pp->p_vnode);
3802 				ASSERT(PP_ISAGED(pp) == 0);
3803 #if defined(__sparc)
3804 				ASSERT(!kcage_on ||
3805 				    (flags & PG_NORELOC) == 0 ||
3806 				    PP_ISNORELOC(pp));
3807 				if (PP_ISNORELOC(pp)) {
3808 					kcage_freemem_sub(1);
3809 				}
3810 #endif
3811 				VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
3812 				return (pp);
3813 			}
3814 bin_empty_0:
3815 			mutex_exit(pcm);
3816 bin_empty_1:
3817 			if (plw_initialized == 0) {
3818 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
3819 				plw_initialized = 1;
3820 			}
3821 			/* calculate the next bin with equivalent color */
3822 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3823 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
3824 		} while (sbin != bin);
3825 
3826 		if (plw.plw_ceq_dif > 1)
3827 			bin = page_list_walk_next_bin(0, bin, &plw);
3828 	}
3829 
3830 	MTYPE_NEXT(mnode, mtype, flags);
3831 	if (mtype >= 0)
3832 		goto try_again;
3833 
3834 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3835 	return (NULL);
3836 }
3837 
/*
 * Replacement page statistics are collected on DEBUG kernels only.
 */
#if defined(DEBUG)
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#if !defined(REPL_PAGE_STATS)
/* statistics compiled out: the increment expands to nothing */
#define	REPL_STAT_INCR(v)
#else /* REPL_PAGE_STATS */
/*
 * Event counters bumped by page_get_replacement_page() through
 * REPL_STAT_INCR().
 */
struct repl_page_stats {
	uint_t	ngets;		/* calls to page_get_replacement_page */
	uint_t	ngets_noreloc;	/* calls where like_pp was PP_ISNORELOC */
	uint_t	npgr_noreloc;	/* calls made with PGR_NORELOC */
	uint_t	nnopage_first;
	uint_t	nnopage;	/* calls that failed to get all pages */
	uint_t	nhashout;	/* cachelist pages hashed out for reuse */
	uint_t	nnofree;	/* local freelist had no page */
	uint_t	nnext_pp;	/* replacement pages handed out */
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#endif /* REPL_PAGE_STATS */

/*
 * When nonzero, page_get_replacement_page() tries page_get_contig_pages()
 * for large pages even if PGR_SAMESZC was not requested.
 */
int	pgrppgcp;
3859 
3860 /*
3861  * The freemem accounting must be done by the caller.
3862  * First we try to get a replacement page of the same size as like_pp,
3863  * if that is not possible, then we just get a set of discontiguous
3864  * PAGESIZE pages.
3865  */
3866 page_t *
3867 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3868     uint_t pgrflags)
3869 {
3870 	page_t		*like_pp;
3871 	page_t		*pp, *pplist;
3872 	page_t		*pl = NULL;
3873 	ulong_t		bin;
3874 	int		mnode, page_mnode;
3875 	int		szc;
3876 	spgcnt_t	npgs, pg_cnt;
3877 	pfn_t		pfnum;
3878 	int		mtype;
3879 	int		flags = 0;
3880 	lgrp_mnode_cookie_t	lgrp_cookie;
3881 	lgrp_t		*lgrp;
3882 
3883 	REPL_STAT_INCR(ngets);
3884 	like_pp = orig_like_pp;
3885 	ASSERT(PAGE_EXCL(like_pp));
3886 
3887 	szc = like_pp->p_szc;
3888 	npgs = page_get_pagecnt(szc);
3889 	/*
3890 	 * Now we reset like_pp to the base page_t.
3891 	 * That way, we won't walk past the end of this 'szc' page.
3892 	 */
3893 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
3894 	like_pp = page_numtopp_nolock(pfnum);
3895 	ASSERT(like_pp->p_szc == szc);
3896 
3897 	if (PP_ISNORELOC(like_pp)) {
3898 		ASSERT(kcage_on);
3899 		REPL_STAT_INCR(ngets_noreloc);
3900 		flags = PGI_RELOCONLY;
3901 	} else if (pgrflags & PGR_NORELOC) {
3902 		ASSERT(kcage_on);
3903 		REPL_STAT_INCR(npgr_noreloc);
3904 		flags = PG_NORELOC;
3905 	}
3906 
3907 	/*
3908 	 * Kernel pages must always be replaced with the same size
3909 	 * pages, since we cannot properly handle demotion of kernel
3910 	 * pages.
3911 	 */
3912 	if (PP_ISKAS(like_pp))
3913 		pgrflags |= PGR_SAMESZC;
3914 
3915 	/* LINTED */
3916 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
3917 
3918 	while (npgs) {
3919 		pplist = NULL;
3920 		for (;;) {
3921 			pg_cnt = page_get_pagecnt(szc);
3922 			bin = PP_2_BIN(like_pp);
3923 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
3924 			ASSERT(pg_cnt <= npgs);
3925 
3926 			/*
3927 			 * If an lgroup was specified, try to get the
3928 			 * page from that lgroup.
3929 			 * NOTE: Must be careful with code below because
3930 			 *	 lgroup may disappear and reappear since there
3931 			 *	 is no locking for lgroup here.
3932 			 */
3933 			if (LGRP_EXISTS(lgrp_target)) {
3934 				/*
3935 				 * Keep local variable for lgroup separate
3936 				 * from lgroup argument since this code should
3937 				 * only be exercised when lgroup argument
3938 				 * exists....
3939 				 */
3940 				lgrp = lgrp_target;
3941 
3942 				/* Try the lgroup's freelists first */
3943 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3944 				    LGRP_SRCH_LOCAL);
3945 				while ((pplist == NULL) &&
3946 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3947 				    != -1) {
3948 					pplist = page_get_mnode_freelist(
3949 						mnode, bin, mtype, szc,
3950 						    flags);
3951 				}
3952 
3953 				/*
3954 				 * Now try it's cachelists if this is a
3955 				 * small page. Don't need to do it for
3956 				 * larger ones since page_freelist_coalesce()
3957 				 * already failed.
3958 				 */
3959 				if (pplist != NULL || szc != 0)
3960 					break;
3961 
3962 				/* Now try it's cachelists */
3963 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3964 				    LGRP_SRCH_LOCAL);
3965 
3966 				while ((pplist == NULL) &&
3967 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3968 				    != -1) {
3969 					pplist = page_get_mnode_cachelist(
3970 						bin, flags, mnode, mtype);
3971 				}
3972 				if (pplist != NULL) {
3973 					page_hashout(pplist, NULL);
3974 					PP_SETAGED(pplist);
3975 					REPL_STAT_INCR(nhashout);
3976 					break;
3977 				}
3978 				/* Done looking in this lgroup. Bail out. */
3979 				break;
3980 			}
3981 
3982 			/*
3983 			 * No lgroup was specified (or lgroup was removed by
3984 			 * DR, so just try to get the page as close to
3985 			 * like_pp's mnode as possible.
3986 			 * First try the local freelist...
3987 			 */
3988 			mnode = PP_2_MEM_NODE(like_pp);
3989 			pplist = page_get_mnode_freelist(mnode, bin,
3990 			    mtype, szc, flags);
3991 			if (pplist != NULL)
3992 				break;
3993 
3994 			REPL_STAT_INCR(nnofree);
3995 
3996 			/*
3997 			 * ...then the local cachelist. Don't need to do it for
3998 			 * larger pages cause page_freelist_coalesce() already
3999 			 * failed there anyway.
4000 			 */
4001 			if (szc == 0) {
4002 				pplist = page_get_mnode_cachelist(bin, flags,
4003 				    mnode, mtype);
4004 				if (pplist != NULL) {
4005 					page_hashout(pplist, NULL);
4006 					PP_SETAGED(pplist);
4007 					REPL_STAT_INCR(nhashout);
4008 					break;
4009 				}
4010 			}
4011 
4012 			/* Now try remote freelists */
4013 			page_mnode = mnode;
4014 			lgrp =
4015 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4016 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4017 			    LGRP_SRCH_HIER);
4018 			while (pplist == NULL &&
4019 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4020 			    != -1) {
4021 				/*
4022 				 * Skip local mnode.
4023 				 */
4024 				if ((mnode == page_mnode) ||
4025 				    (mem_node_config[mnode].exists == 0))
4026 					continue;
4027 
4028 				pplist = page_get_mnode_freelist(mnode,
4029 				    bin, mtype, szc, flags);
4030 			}
4031 
4032 			if (pplist != NULL)
4033 				break;
4034 
4035 
4036 			/* Now try remote cachelists */
4037 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4038 			    LGRP_SRCH_HIER);
4039 			while (pplist == NULL && szc == 0) {
4040 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4041 				if (mnode == -1)
4042 					break;
4043 				/*
4044 				 * Skip local mnode.
4045 				 */
4046 				if ((mnode == page_mnode) ||
4047 				    (mem_node_config[mnode].exists == 0))
4048 					continue;
4049 
4050 				pplist = page_get_mnode_cachelist(bin,
4051 				    flags, mnode, mtype);
4052 
4053 				if (pplist != NULL) {
4054 					page_hashout(pplist, NULL);
4055 					PP_SETAGED(pplist);
4056 					REPL_STAT_INCR(nhashout);
4057 					break;
4058 				}
4059 			}
4060 
4061 			/*
4062 			 * Break out of while loop under the following cases:
4063 			 * - If we successfully got a page.
4064 			 * - If pgrflags specified only returning a specific
4065 			 *   page size and we could not find that page size.
4066 			 * - If we could not satisfy the request with PAGESIZE
4067 			 *   or larger pages.
4068 			 */
4069 			if (pplist != NULL || szc == 0)
4070 				break;
4071 
4072 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4073 				/* try to find contig page */
4074 
4075 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4076 				    LGRP_SRCH_HIER);
4077 
4078 				while ((pplist == NULL) &&
4079 				    (mnode =
4080 					lgrp_memnode_choose(&lgrp_cookie))
4081 				    != -1) {
4082 					pplist = page_get_contig_pages(
4083 						mnode, bin, mtype, szc,
4084 						    flags | PGI_PGCPHIPRI);
4085 				}
4086 				break;
4087 			}
4088 
4089 			/*
4090 			 * The correct thing to do here is try the next
4091 			 * page size down using szc--. Due to a bug
4092 			 * with the processing of HAT_RELOAD_SHARE
4093 			 * where the sfmmu_ttecnt arrays of all
4094 			 * hats sharing an ISM segment don't get updated,
4095 			 * using intermediate size pages for relocation
4096 			 * can lead to continuous page faults.
4097 			 */
4098 			szc = 0;
4099 		}
4100 
4101 		if (pplist != NULL) {
4102 			DTRACE_PROBE4(page__get,
4103 			    lgrp_t *, lgrp,
4104 			    int, mnode,
4105 			    ulong_t, bin,
4106 			    uint_t, flags);
4107 
4108 			while (pplist != NULL && pg_cnt--) {
4109 				ASSERT(pplist != NULL);
4110 				pp = pplist;
4111 				page_sub(&pplist, pp);
4112 				PP_CLRFREE(pp);
4113 				PP_CLRAGED(pp);
4114 				page_list_concat(&pl, &pp);
4115 				npgs--;
4116 				like_pp = like_pp + 1;
4117 				REPL_STAT_INCR(nnext_pp);
4118 			}
4119 			ASSERT(pg_cnt == 0);
4120 		} else {
4121 			break;
4122 		}
4123 	}
4124 
4125 	if (npgs) {
4126 		/*
4127 		 * We were unable to allocate the necessary number
4128 		 * of pages.
4129 		 * We need to free up any pl.
4130 		 */
4131 		REPL_STAT_INCR(nnopage);
4132 		page_free_replacement_page(pl);
4133 		return (NULL);
4134 	} else {
4135 		return (pl);
4136 	}
4137 }
4138 
4139 /*
4140  * demote a free large page to it's constituent pages
4141  */
4142 void
4143 page_demote_free_pages(page_t *pp)
4144 {
4145 
4146 	int mnode;
4147 
4148 	ASSERT(pp != NULL);
4149 	ASSERT(PAGE_LOCKED(pp));
4150 	ASSERT(PP_ISFREE(pp));
4151 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4152 
4153 	mnode = PP_2_MEM_NODE(pp);
4154 	page_freelist_lock(mnode);
4155 	if (pp->p_szc != 0) {
4156 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4157 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4158 	}
4159 	page_freelist_unlock(mnode);
4160 	ASSERT(pp->p_szc == 0);
4161 }
4162