xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision affbd3ccca8e26191a210ec9f9ffae170f919afd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved   */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 /*
38  * This file contains common functions to access and manage the page lists.
39  * Many of these routines originated from platform dependent modules
40  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function
41  * in a platform independent manner.
42  *
43  * vm/vm_dep.h provides for platform specific support.
44  */
45 
46 #include <sys/types.h>
47 #include <sys/debug.h>
48 #include <sys/cmn_err.h>
49 #include <sys/systm.h>
50 #include <sys/atomic.h>
51 #include <sys/sysmacros.h>
52 #include <vm/as.h>
53 #include <vm/page.h>
54 #include <vm/seg_kmem.h>
55 #include <vm/seg_vn.h>
56 #include <sys/memnode.h>
57 #include <vm/vm_dep.h>
58 #include <sys/lgrp.h>
59 #include <sys/mem_config.h>
60 #include <sys/callb.h>
61 #include <sys/mem_cage.h>
62 #include <sys/sdt.h>
63 
64 extern uint_t	vac_colors;
65 
66 /* vm_cpu_data for the boot cpu before kmem is initialized */
67 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
68 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
69 
70 /*
71  * Number of page colors equivalent to the requested color in page_get
72  * routines.  If set, keeps large pages intact longer and favors MPO
73  * allocation from the local mnode over acquiring the 'correct' page color
74  * from a demoted large page or from a remote mnode.
75  */
76 int	colorequiv;
77 
78 /*
79  * If set, specifies the percentage of pages within a large page region that
80  * must be free before attempting to lock those pages for
81  * page_get_contig_pages processing.
82  *
83  * Should be turned on when kpr is available, since page_trylock_contig_pages
84  * can then be more selective.
85  */
86 
87 int	ptcpthreshold;
88 
89 /*
90  * Limit the page_get_contig_pages search based on failure counts in
91  * pgcpfailcnt[].  Use slot 0 (base page size, otherwise unused) to enable or
92  * disable limiting the search.  Enabled by default.
93  */
94 int	pgcpfailcnt[MMU_PAGE_SIZES];
95 int	pgcplimitsearch = 1;
96 
97 #ifdef VM_STATS
98 struct vmm_vmstats_str  vmm_vmstats;
99 
100 #endif /* VM_STATS */
101 
102 #if defined(__sparc)
103 #define	LPGCREATE	0
104 #else
105 /* enable page_get_contig_pages */
106 #define	LPGCREATE	1
107 #endif
108 
109 int pg_contig_disable;
110 int pg_lpgcreate_nocage = LPGCREATE;
111 
112 /*
113  * page_freelist_fill pfn flag to signify no hi pfn requirement.
114  */
115 #define	PFNNULL		0
116 
117 /* Flags involved in promotion and demotion routines */
118 #define	PC_FREE		0x1	/* put page on freelist */
119 #define	PC_ALLOC	0x2	/* return page for allocation */
120 
121 /*
122  * Flag for page_demote, used with PC_FREE, to denote that we don't care
123  * what the color is; the color parameter to the function is ignored.
124  */
125 #define	PC_NO_COLOR	(-1)
126 
127 /*
128  * page counters candidates info
129  * See page_ctrs_cands comment below for more details.
130  * fields are as follows:
131  *	pcc_pages_free:		# pages which freelist coalesce can create
132  *	pcc_color_free_len:	number of elements in pcc_color_free array
133  *	pcc_color_free:		pointer to page free counts per color
134  */
135 typedef struct pcc_info {
136 	pgcnt_t	pcc_pages_free;
137 	int	pcc_color_free_len;
138 	pgcnt_t	*pcc_color_free;
139 } pcc_info_t;
140 
141 /*
142  * On big machines it can take a long time to check page_counters
143  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
144  * updated sum of all elements of the corresponding page_counters arrays.
145  * page_freelist_coalesce() searches page_counters only if an appropriate
146  * element of page_ctrs_cands array is greater than 0.
147  *
148  * An extra dimension is used for page_ctrs_cands to spread the elements
149  * over a few e$ cache lines to avoid serialization during the array
150  * updates.
151  */
152 #pragma	align 64(page_ctrs_cands)
153 
154 static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
155 
156 /*
157  * Return in val the total number of free pages which can be created
158  * for the given mnode (m) and region size (r)
159  */
160 #define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
161 	int i;								\
162 	val = 0;							\
163 	for (i = 0; i < NPC_MUTEX; i++) {				\
164 	    val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;		\
165 	}								\
166 }
167 
168 /*
169  * Return in val the total number of free pages which can be created
170  * for the given mnode (m), region size (r), and color (c)
171  */
172 #define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
173 	int i;								\
174 	val = 0;							\
175 	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
176 	for (i = 0; i < NPC_MUTEX; i++) {				\
177 	    val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)];	\
178 	}								\
179 }
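
/*
 * Illustrative sketch (not part of the build): a caller that already holds
 * the freelist locks, such as page_freelist_coalesce() below, reads these
 * summaries roughly as follows (mnode and region size code r are hypothetical):
 *
 *	pgcnt_t	total, cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, r, total);
 *	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
 *	if (cands == 0)
 *		return (NULL);	(no candidates, skip the counter scan)
 */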
180 
181 /*
182  * We can only allow a single thread to update a counter within the physical
183  * range of the largest supported page size. That is the finest granularity
184  * possible since the counter values are dependent on each other
185  * as you move across region sizes.  PP_CTR_LOCK_INDX is used to determine the
186  * ctr_mutex lock index for a particular physical range.
187  */
188 static kmutex_t	*ctr_mutex[NPC_MUTEX];
189 
190 #define	PP_CTR_LOCK_INDX(pp)						\
191 	(((pp)->p_pagenum >>					\
192 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
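
/*
 * Illustrative sketch (not part of the build): this is how the counter update
 * paths below pair the lock index with ctr_mutex, e.g. in page_ctr_add():
 *
 *	int		lckidx = PP_CTR_LOCK_INDX(pp);
 *	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
 *
 *	mutex_enter(lock);
 *	page_ctr_add_internal(mnode, mtype, pp, flags);
 *	mutex_exit(lock);
 */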
193 
194 /*
195  * Local function prototypes.
196  */
197 
198 void page_ctr_add(int, int, page_t *, int);
199 void page_ctr_add_internal(int, int, page_t *, int);
200 void page_ctr_sub(int, int, page_t *, int);
201 uint_t  page_convert_color(uchar_t, uchar_t, uint_t);
202 void page_freelist_lock(int);
203 void page_freelist_unlock(int);
204 page_t *page_promote(int, pfn_t, uchar_t, int);
205 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
206 page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
207 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
208 static int page_trylock_cons(page_t *pp, se_t se);
209 
210 #define	PNUM_SIZE(szc)							\
211 	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
212 #define	PNUM_SHIFT(szc)							\
213 	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
214 
215 /*
216  * The page_counters array below is used to keep track of free contiguous
217  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
218  * This contains an array of counters, the size of the array, a shift value
219  * used to convert a pagenum into a counter array index or vice versa, as
220  * well as a cache of the last successful index to be promoted to a larger
221  * page size.  As an optimization, we keep track of the last successful index
222  * to be promoted per page color for the given size region, and this is
223  * allocated dynamically based upon the number of colors for a given
224  * region size.
225  *
226  * Conceptually, the page counters are represented as:
227  *
228  *	page_counters[region_size][mnode]
229  *
230  *	region_size:	size code of a candidate larger page made up
231  *			of contiguous free smaller pages.
232  *
233  *	page_counters[region_size][mnode].hpm_counters[index]:
234  *		represents how many (region_size - 1) pages either
235  *		exist or can be created within the given index range.
236  *
237  * Let's look at a sparc example:
238  *	If we want to create a free 512k page, we look at region_size 2
239  *	for the mnode we want.  We calculate the index and look at a specific
240  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
241  *	this location, it means that 8 64k pages either exist or can be created
242  *	from 8K pages in order to make a single free 512k page at the given
243  *	index.  Note that when a region is full, it will contribute to the
244  *	counts in the region above it.  Thus we will not know the page
245  *	size of the free pages that can be promoted into this new free
246  *	page unless we look at all regions below the current one.
247  */
248 
249 /*
250  * Note: hpmctr_t is defined in platform vm_dep.h
251  * hw_page_map_t contains all the information needed for the page_counters
252  * logic. The fields are as follows:
253  *
254  *	hpm_counters:	dynamically allocated array to hold counter data
255  *	hpm_entries:	entries in hpm_counters
256  *	hpm_shift:	shift for pnum/array index conv
257  *	hpm_base:	PFN mapped to counter index 0
258  *	hpm_color_current_len:	# of elements in hpm_color_current "array" below
259  *	hpm_color_current:	last index in counter array for this color at
260  *				which we successfully created a large page
261  */
262 typedef struct hw_page_map {
263 	hpmctr_t	*hpm_counters;
264 	size_t		hpm_entries;
265 	int		hpm_shift;
266 	pfn_t		hpm_base;
267 	size_t		hpm_color_current_len;
268 	size_t 		*hpm_color_current;
269 } hw_page_map_t;
270 
271 /*
272  * Element zero is not used, but is allocated for convenience.
273  */
274 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
275 
276 /*
277  * The following macros are convenient ways to get access to the individual
278  * elements of the page_counters arrays.  They can be used on both
279  * the left side and right side of equations.
280  */
281 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
282 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
283 
284 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
285 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
286 
287 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
288 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
289 
290 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
291 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
292 
293 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
294 	(page_counters[(rg_szc)][(mnode)].hpm_base)
295 
296 #define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
297 	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)
298 
299 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
300 	(page_counters[(rg_szc)][(mnode)].hpm_color_current)
301 
302 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
303 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])
304 
305 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
306 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
307 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
308 
309 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
310 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
311 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
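
/*
 * Illustrative sketch (not part of the build): PNUM_TO_IDX and IDX_TO_PNUM
 * are inverses for counter-aligned pfns, which page_ctrs_alloc() and
 * page_ctrs_adjust() assert below.  A typical counter update looks like:
 *
 *	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
 *	if (++PAGE_COUNTERS(mnode, r, idx) == FULL_REGION_CNT(r))
 *		(the whole region is now free and is a promotion candidate)
 */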
312 
313 /*
314  * Protects the hpm_counters and hpm_color_current memory from changing while
315  * looking at page counters information.
316  * Grab the write lock to modify what these fields point at.
317  * Grab the read lock to prevent any pointers from changing.
318  * The write lock can not be held during memory allocation due to a possible
319  * recursion deadlock: the allocation path may try to grab the read lock
320  * while the write lock is already held.
321  */
322 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
323 
324 
325 /*
326  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
327  */
328 void
329 cpu_vm_data_init(struct cpu *cp)
330 {
331 	int	align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
332 
333 	ASSERT(L2CACHE_ALIGN <= L2CACHE_ALIGN_MAX);
334 
335 	if (cp == CPU0) {
336 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
337 	} else {
338 		void	*kmptr;
339 
340 		kmptr = kmem_zalloc(VM_CPU_DATA_PADSIZE + align, KM_SLEEP);
341 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
342 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
343 	}
344 }
345 
346 /*
347  * free cpu_vm_data
348  */
349 void
350 cpu_vm_data_destroy(struct cpu *cp)
351 {
352 	if (cp->cpu_seqid && cp->cpu_vm_data) {
353 		ASSERT(cp != CPU0);
354 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
355 		    VM_CPU_DATA_PADSIZE);
356 	}
357 	cp->cpu_vm_data = NULL;
358 }
359 
360 
361 /*
362  * page size to page size code
363  */
364 int
365 page_szc(size_t pagesize)
366 {
367 	int	i = 0;
368 
369 	while (hw_page_array[i].hp_size) {
370 		if (pagesize == hw_page_array[i].hp_size)
371 			return (i);
372 		i++;
373 	}
374 	return (-1);
375 }
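
/*
 * Illustrative sketch (not part of the build): since hw_page_array[0]
 * describes the base page size, one would expect
 *
 *	page_szc(MMU_PAGESIZE) == 0
 *	page_szc(page_get_pagesize(szc)) == szc	(for szc < mmu_page_sizes)
 *
 * and -1 for any size that hw_page_array does not describe.
 */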
376 
377 /*
378  * page size to page size code with the restriction that it be a supported
379  * user page size.  If it's not a supported user page size, -1 will be returned.
380  */
381 int
382 page_szc_user_filtered(size_t pagesize)
383 {
384 	int szc = page_szc(pagesize);
385 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
386 		return (szc);
387 	}
388 	return (-1);
389 }
390 
391 /*
392  * Return how many page sizes are available for the user to use.  This is
393  * what the hardware supports and not based upon how the OS implements the
394  * support of different page sizes.
395  */
396 uint_t
397 page_num_user_pagesizes(void)
398 {
399 	return (mmu_exported_page_sizes);
400 }
401 
402 uint_t
403 page_num_pagesizes(void)
404 {
405 	return (mmu_page_sizes);
406 }
407 
408 /*
409  * Returns the number of base pagesize pages associated with szc.
410  */
411 pgcnt_t
412 page_get_pagecnt(uint_t szc)
413 {
414 	if (szc >= mmu_page_sizes)
415 		panic("page_get_pagecnt: out of range %d", szc);
416 	return (hw_page_array[szc].hp_pgcnt);
417 }
418 
419 size_t
420 page_get_pagesize(uint_t szc)
421 {
422 	if (szc >= mmu_page_sizes)
423 		panic("page_get_pagesize: out of range %d", szc);
424 	return (hw_page_array[szc].hp_size);
425 }
426 
427 /*
428  * Return the size of a page based upon the index passed in.  An index of
429  * zero refers to the smallest page size in the system, and as index increases
430  * it refers to the next larger supported page size in the system.
431  * Note that szc and userszc may not be the same due to unsupported szc's on
432  * some systems.
433  */
434 size_t
435 page_get_user_pagesize(uint_t userszc)
436 {
437 	uint_t szc = USERSZC_2_SZC(userszc);
438 
439 	if (szc >= mmu_page_sizes)
440 		panic("page_get_user_pagesize: out of range %d", szc);
441 	return (hw_page_array[szc].hp_size);
442 }
443 
444 uint_t
445 page_get_shift(uint_t szc)
446 {
447 	if (szc >= mmu_page_sizes)
448 		panic("page_get_shift: out of range %d", szc);
449 	return (hw_page_array[szc].hp_shift);
450 }
451 
452 uint_t
453 page_get_pagecolors(uint_t szc)
454 {
455 	ASSERT(page_colors != 0);
456 	return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
457 }
458 
459 /*
460  * Called by startup().
461  * Size up the per page size free list counters based on physmax
462  * of each node and max_mem_nodes.
463  */
464 size_t
465 page_ctrs_sz(void)
466 {
467 	int	r;		/* region size */
468 	int	mnode;
469 	uint_t	ctrs_sz = 0;
470 	int 	i;
471 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
472 
473 	/*
474 	 * We need to determine how many page colors there are for each
475 	 * page size in order to allocate memory for any color specific
476 	 * arrays.
477 	 */
478 	colors_per_szc[0] = page_colors;
479 	for (i = 1; i < mmu_page_sizes; i++) {
480 		colors_per_szc[i] =
481 		    page_convert_color(0, i, page_colors - 1) + 1;
482 	}
483 
484 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
485 
486 		pgcnt_t r_pgcnt;
487 		pfn_t   r_base;
488 		pgcnt_t r_align;
489 
490 		if (mem_node_config[mnode].exists == 0)
491 			continue;
492 
493 		/*
494 		 * determine size needed for page counter arrays with
495 		 * base aligned to large page size.
496 		 */
497 		for (r = 1; r < mmu_page_sizes; r++) {
498 			/* add in space for hpm_counters */
499 			r_align = page_get_pagecnt(r);
500 			r_base = mem_node_config[mnode].physbase;
501 			r_base &= ~(r_align - 1);
502 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
503 			r_base, r_align);
504 			/*
505 			 * Round up to always allocate on pointer sized
506 			 * boundaries.
507 			 */
508 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
509 			    sizeof (hpmctr_t *));
510 
511 			/* add in space for hpm_color_current */
512 			ctrs_sz += (colors_per_szc[r] *
513 			    sizeof (size_t));
514 		}
515 	}
516 
517 	for (r = 1; r < mmu_page_sizes; r++) {
518 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
519 
520 		/* add in space for page_ctrs_cands */
521 		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
522 		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
523 		    sizeof (pgcnt_t);
524 	}
525 
526 	/* ctr_mutex */
527 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
528 
529 	/* size for page list counts */
530 	PLCNT_SZ(ctrs_sz);
531 
532 	/*
533 	 * Add some slop for roundups.  page_ctrs_alloc will round up the start
534 	 * address of the counters to an ecache_alignsize boundary for every
535 	 * memory node.
536 	 */
537 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
538 }
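
/*
 * A minimal sketch (not part of the build) of how startup code is expected
 * to pair page_ctrs_sz() with page_ctrs_alloc(): size the counters first,
 * carve them out of one early allocation, and let page_ctrs_alloc() return
 * the first unused address.
 *
 *	size_t	ctrs_sz = page_ctrs_sz();
 *	caddr_t	ctrs_base = (early boot allocation of ctrs_sz bytes);
 *	caddr_t	ctrs_end = page_ctrs_alloc(ctrs_base);
 *	ASSERT(ctrs_end <= ctrs_base + ctrs_sz);
 */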
539 
540 caddr_t
541 page_ctrs_alloc(caddr_t alloc_base)
542 {
543 	int	mnode;
544 	int	r;		/* region size */
545 	int	i;
546 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
547 
548 	/*
549 	 * We need to determine how many page colors there are for each
550 	 * page size in order to allocate memory for any color specific
551 	 * arrays.
552 	 */
553 	colors_per_szc[0] = page_colors;
554 	for (i = 1; i < mmu_page_sizes; i++) {
555 		colors_per_szc[i] =
556 		    page_convert_color(0, i, page_colors - 1) + 1;
557 	}
558 
559 	for (r = 1; r < mmu_page_sizes; r++) {
560 		page_counters[r] = (hw_page_map_t *)alloc_base;
561 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
562 	}
563 
564 	/* page_ctrs_cands */
565 	for (r = 1; r < mmu_page_sizes; r++) {
566 		for (i = 0; i < NPC_MUTEX; i++) {
567 			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
568 			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));
569 
570 		}
571 	}
572 
573 	/* page_ctrs_cands pcc_color_free array */
574 	for (r = 1; r < mmu_page_sizes; r++) {
575 		for (i = 0; i < NPC_MUTEX; i++) {
576 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
577 				page_ctrs_cands[i][r][mnode].pcc_color_free_len
578 				    = colors_per_szc[r];
579 				page_ctrs_cands[i][r][mnode].pcc_color_free =
580 				    (pgcnt_t *)alloc_base;
581 				alloc_base += colors_per_szc[r] *
582 				    sizeof (pgcnt_t);
583 			}
584 		}
585 	}
586 
587 	/* ctr_mutex */
588 	for (i = 0; i < NPC_MUTEX; i++) {
589 		ctr_mutex[i] = (kmutex_t *)alloc_base;
590 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
591 	}
592 
593 	/* initialize page list counts */
594 	PLCNT_INIT(alloc_base);
595 
596 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
597 
598 		pgcnt_t r_pgcnt;
599 		pfn_t	r_base;
600 		pgcnt_t r_align;
601 		int	r_shift;
602 
603 		if (mem_node_config[mnode].exists == 0)
604 			continue;
605 
606 		for (r = 1; r < mmu_page_sizes; r++) {
607 			/*
608 			 * the page_counters base has to be aligned to the
609 			 * page count of page size code r otherwise the counts
610 			 * will cross large page boundaries.
611 			 */
612 			r_align = page_get_pagecnt(r);
613 			r_base = mem_node_config[mnode].physbase;
614 			/* base needs to be aligned - lower to aligned value */
615 			r_base &= ~(r_align - 1);
616 			r_pgcnt = howmany(mem_node_config[mnode].physmax -
617 			r_base, r_align);
618 			r_shift = PAGE_BSZS_SHIFT(r);
619 
620 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
621 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
622 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
623 			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
624 			    colors_per_szc[r];
625 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
626 			    (size_t *)alloc_base;
627 			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
628 			for (i = 0; i < colors_per_szc[r]; i++) {
629 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
630 			}
631 			PAGE_COUNTERS_COUNTERS(mnode, r) =
632 			    (hpmctr_t *)alloc_base;
633 			/*
634 			 * Round up to make alloc_base always be aligned on
635 			 * a pointer boundary.
636 			 */
637 			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
638 			    sizeof (hpmctr_t *));
639 
640 			/*
641 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
642 			 * satisfy the identity requirement.
643 			 * We should be able to go from one to the other
644 			 * and get consistent values.
645 			 */
646 			ASSERT(PNUM_TO_IDX(mnode, r,
647 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
648 			ASSERT(IDX_TO_PNUM(mnode, r,
649 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
650 		}
651 		/*
652 		 * Round up the start address of the page_counters to a
653 		 * cache-aligned boundary for every memory node.
654 		 * page_ctrs_sz() has added some slop for these roundups.
655 		 */
656 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
657 			L2CACHE_ALIGN);
658 	}
659 
660 	/* Initialize other page counter specific data structures. */
661 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
662 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
663 	}
664 
665 	return (alloc_base);
666 }
667 
668 /*
669  * Functions to adjust region counters for each size free list.
670  * The caller is responsible for acquiring the ctr_mutex lock if necessary;
671  * thus these can be called during startup without locks.
672  */
673 /* ARGSUSED */
674 void
675 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
676 {
677 	ssize_t		r;	/* region size */
678 	ssize_t		idx;
679 	pfn_t		pfnum;
680 	int		lckidx;
681 
682 	ASSERT(mnode == PP_2_MEM_NODE(pp));
683 	ASSERT(mtype == PP_2_MTYPE(pp));
684 
685 	ASSERT(pp->p_szc < mmu_page_sizes);
686 
687 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
688 
689 	/* no counter update needed for largest page size */
690 	if (pp->p_szc >= mmu_page_sizes - 1) {
691 		return;
692 	}
693 
694 	r = pp->p_szc + 1;
695 	pfnum = pp->p_pagenum;
696 	lckidx = PP_CTR_LOCK_INDX(pp);
697 
698 	/*
699 	 * Increment the count of free pages for the current
700 	 * region.  Continue looping up in region size, incrementing the
701 	 * count if the preceding region is full.
702 	 */
703 	while (r < mmu_page_sizes) {
704 		idx = PNUM_TO_IDX(mnode, r, pfnum);
705 
706 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
707 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
708 
709 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
710 			break;
711 
712 		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
713 		page_ctrs_cands[lckidx][r][mnode].
714 		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
715 		r++;
716 	}
717 }
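
/*
 * Worked example (illustrative only): on sparc, freeing the 8K page that
 * completes a 64K region leaves that region's counter full and rolls the
 * credit upward, stopping at the first region that is still not full:
 *
 *	PAGE_COUNTERS(mnode, 1, idx1):	7 -> 8 == FULL_REGION_CNT(1)
 *	page_ctrs_cands[..][1][mnode]:	pcc_pages_free++, color count++
 *	PAGE_COUNTERS(mnode, 2, idx2):	n -> n + 1, stop if still < 8
 */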
718 
719 void
720 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
721 {
722 	int		lckidx = PP_CTR_LOCK_INDX(pp);
723 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
724 
725 	mutex_enter(lock);
726 	page_ctr_add_internal(mnode, mtype, pp, flags);
727 	mutex_exit(lock);
728 }
729 
730 void
731 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
732 {
733 	int		lckidx;
734 	kmutex_t	*lock;
735 	ssize_t		r;	/* region size */
736 	ssize_t		idx;
737 	pfn_t		pfnum;
738 
739 	ASSERT(mnode == PP_2_MEM_NODE(pp));
740 	ASSERT(mtype == PP_2_MTYPE(pp));
741 
742 	ASSERT(pp->p_szc < mmu_page_sizes);
743 
744 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
745 
746 	/* no counter update needed for largest page size */
747 	if (pp->p_szc >= mmu_page_sizes - 1) {
748 		return;
749 	}
750 
751 	r = pp->p_szc + 1;
752 	pfnum = pp->p_pagenum;
753 	lckidx = PP_CTR_LOCK_INDX(pp);
754 	lock = &ctr_mutex[lckidx][mnode];
755 
756 	/*
757 	 * Decrement the count of free pages for the current
758 	 * region.  Continue looping up in region size, decrementing the
759 	 * count if the preceding region was full.
760 	 */
761 	mutex_enter(lock);
762 	while (r < mmu_page_sizes) {
763 		idx = PNUM_TO_IDX(mnode, r, pfnum);
764 
765 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
766 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
767 
768 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
769 			break;
770 		}
771 		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
772 		ASSERT(page_ctrs_cands[lckidx][r][mnode].
773 		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
774 
775 		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
776 		page_ctrs_cands[lckidx][r][mnode].
777 		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
778 		r++;
779 	}
780 	mutex_exit(lock);
781 }
782 
783 /*
784  * Adjust page counters following a memory attach, since typically the
785  * size of the array needs to change, and the PFN to counter index
786  * mapping needs to change.
787  */
788 uint_t
789 page_ctrs_adjust(int mnode)
790 {
791 	pgcnt_t npgs;
792 	int	r;		/* region size */
793 	int	i;
794 	size_t	pcsz, old_csz;
795 	hpmctr_t *new_ctr, *old_ctr;
796 	pfn_t	oldbase, newbase;
797 	size_t	old_npgs;
798 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
799 	size_t	size_cache[MMU_PAGE_SIZES];
800 	size_t	*color_cache[MMU_PAGE_SIZES];
801 	size_t	*old_color_array;
802 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
803 
804 	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
805 	npgs = roundup(mem_node_config[mnode].physmax,
806 	    PC_BASE_ALIGN) - newbase;
807 
808 	/*
809 	 * We need to determine how many page colors there are for each
810 	 * page size in order to allocate memory for any color specific
811 	 * arrays.
812 	 */
813 	colors_per_szc[0] = page_colors;
814 	for (r = 1; r < mmu_page_sizes; r++) {
815 		colors_per_szc[r] =
816 		    page_convert_color(0, r, page_colors - 1) + 1;
817 	}
818 
819 	/*
820 	 * Preallocate all of the new hpm_counters arrays as we can't
821 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
822 	 * If we can't allocate all of the arrays, undo our work so far
823 	 * and return failure.
824 	 */
825 	for (r = 1; r < mmu_page_sizes; r++) {
826 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
827 
828 		ctr_cache[r] = kmem_zalloc(pcsz *
829 		    sizeof (hpmctr_t), KM_NOSLEEP);
830 		if (ctr_cache[r] == NULL) {
831 			while (--r >= 1) {
832 				kmem_free(ctr_cache[r],
833 				    size_cache[r] * sizeof (hpmctr_t));
834 			}
835 			return (ENOMEM);
836 		}
837 		size_cache[r] = pcsz;
838 	}
839 	/*
840 	 * Preallocate all of the new color current arrays as we can't
841 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
842 	 * If we can't allocate all of the arrays, undo our work so far
843 	 * and return failure.
844 	 */
845 	for (r = 1; r < mmu_page_sizes; r++) {
846 		color_cache[r] = kmem_zalloc(sizeof (size_t) *
847 		    colors_per_szc[r], KM_NOSLEEP);
848 		if (color_cache[r] == NULL) {
849 			while (--r >= 1) {
850 				kmem_free(color_cache[r],
851 				    colors_per_szc[r] * sizeof (size_t));
852 			}
853 			for (r = 1; r < mmu_page_sizes; r++) {
854 				kmem_free(ctr_cache[r],
855 				    size_cache[r] * sizeof (hpmctr_t));
856 			}
857 			return (ENOMEM);
858 		}
859 	}
860 
861 	/*
862 	 * Grab the write lock to prevent others from walking these arrays
863 	 * while we are modifying them.
864 	 */
865 	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
866 	page_freelist_lock(mnode);
867 	for (r = 1; r < mmu_page_sizes; r++) {
868 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
869 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
870 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
871 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
872 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
873 		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);
874 
875 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
876 		new_ctr = ctr_cache[r];
877 		ctr_cache[r] = NULL;
878 		if (old_ctr != NULL &&
879 		    (oldbase + old_npgs > newbase) &&
880 		    (newbase + npgs > oldbase)) {
881 			/*
882 			 * Map the intersection of the old and new
883 			 * counters into the new array.
884 			 */
885 			size_t offset;
886 			if (newbase > oldbase) {
887 				offset = (newbase - oldbase) >>
888 				    PAGE_COUNTERS_SHIFT(mnode, r);
889 				bcopy(old_ctr + offset, new_ctr,
890 				    MIN(pcsz, (old_csz - offset)) *
891 				    sizeof (hpmctr_t));
892 			} else {
893 				offset = (oldbase - newbase) >>
894 				    PAGE_COUNTERS_SHIFT(mnode, r);
895 				bcopy(old_ctr, new_ctr + offset,
896 				    MIN(pcsz - offset, old_csz) *
897 				    sizeof (hpmctr_t));
898 			}
899 		}
900 
901 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
902 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
903 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
904 		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
905 		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
906 		color_cache[r] = NULL;
907 		/*
908 		 * for now, just reset on these events as it's probably
909 		 * not worthwhile to try and optimize this.
910 		 */
911 		for (i = 0; i < colors_per_szc[r]; i++) {
912 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
913 		}
914 
915 		/* cache info for freeing out of the critical path */
916 		if ((caddr_t)old_ctr >= kernelheap &&
917 		    (caddr_t)old_ctr < ekernelheap) {
918 			ctr_cache[r] = old_ctr;
919 			size_cache[r] = old_csz;
920 		}
921 		if ((caddr_t)old_color_array >= kernelheap &&
922 		    (caddr_t)old_color_array < ekernelheap) {
923 			color_cache[r] = old_color_array;
924 		}
925 		/*
926 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
927 		 * satisfy the identity requirement.
928 		 * We should be able to go from one to the other
929 		 * and get consistent values.
930 		 */
931 		ASSERT(PNUM_TO_IDX(mnode, r,
932 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
933 		ASSERT(IDX_TO_PNUM(mnode, r,
934 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
935 	}
936 	page_freelist_unlock(mnode);
937 	rw_exit(&page_ctrs_rwlock[mnode]);
938 
939 	/*
940 	 * Now that we have dropped the write lock, it is safe to free all
941 	 * of the memory we have cached above.
942 	 */
943 	for (r = 1; r < mmu_page_sizes; r++) {
944 		if (ctr_cache[r] != NULL) {
945 			kmem_free(ctr_cache[r],
946 			    size_cache[r] * sizeof (hpmctr_t));
947 		}
948 		if (color_cache[r] != NULL) {
949 			kmem_free(color_cache[r],
950 			    colors_per_szc[r] * sizeof (size_t));
951 		}
952 	}
953 	return (0);
954 }
955 
956 /*
957  * color contains a valid color index or bin for cur_szc
958  */
959 uint_t
960 page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
961 {
962 	uint_t shift;
963 
964 	if (cur_szc > new_szc) {
965 		shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
966 		return (color << shift);
967 	} else if (cur_szc < new_szc) {
968 		shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
969 		return (color >> shift);
970 	}
971 	return (color);
972 }
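
/*
 * Worked example (illustrative only): with hypothetical page shifts of 13
 * for szc 0 and 16 for szc 1, page_convert_color(1, 0, c) returns (c << 3),
 * mapping one szc 1 color onto the first of its eight szc 0 colors, and
 * page_convert_color(0, 1, c) returns (c >> 3), collapsing them back.
 */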
973 
974 #ifdef DEBUG
975 
976 /*
977  * confirm pp is a large page corresponding to szc
978  */
979 void
980 chk_lpg(page_t *pp, uchar_t szc)
981 {
982 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
983 	uint_t noreloc;
984 
985 	if (npgs == 1) {
986 		ASSERT(pp->p_szc == 0);
987 		ASSERT(pp->p_next == pp);
988 		ASSERT(pp->p_prev == pp);
989 		return;
990 	}
991 
992 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
993 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
994 
995 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
996 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
997 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
998 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
999 
1000 	/*
1001 	 * Check list of pages.
1002 	 */
1003 	noreloc = PP_ISNORELOC(pp);
1004 	while (npgs--) {
1005 		if (npgs != 0) {
1006 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1007 			ASSERT(pp->p_next == (pp + 1));
1008 		}
1009 		ASSERT(pp->p_szc == szc);
1010 		ASSERT(PP_ISFREE(pp));
1011 		ASSERT(PP_ISAGED(pp));
1012 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1013 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1014 		ASSERT(pp->p_vnode  == NULL);
1015 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1016 
1017 		pp = pp->p_next;
1018 	}
1019 }
1020 #endif /* DEBUG */
1021 
1022 void
1023 page_freelist_lock(int mnode)
1024 {
1025 	int i;
1026 	for (i = 0; i < NPC_MUTEX; i++) {
1027 		mutex_enter(FPC_MUTEX(mnode, i));
1028 		mutex_enter(CPC_MUTEX(mnode, i));
1029 	}
1030 }
1031 
1032 void
1033 page_freelist_unlock(int mnode)
1034 {
1035 	int i;
1036 	for (i = 0; i < NPC_MUTEX; i++) {
1037 		mutex_exit(FPC_MUTEX(mnode, i));
1038 		mutex_exit(CPC_MUTEX(mnode, i));
1039 	}
1040 }
1041 
1042 /*
1043  * Update the page list max counts for already allocated pages that have been
1044  * transferred (kcage_assimilate_page) between different mtypes.
1045  */
1046 /* ARGSUSED */
1047 void
1048 page_list_xfer(page_t *pp, int to_mtype, int from_mtype)
1049 {
1050 	PLCNT_MAX_INCR(pp, PP_2_MEM_NODE(pp), to_mtype, pp->p_szc);
1051 	PLCNT_MAX_DECR(pp, PP_2_MEM_NODE(pp), from_mtype, pp->p_szc);
1052 }
1053 
1054 /*
1055  * add pp to the specified page list. Defaults to head of the page list
1056  * unless PG_LIST_TAIL is specified.
1057  */
1058 void
1059 page_list_add(page_t *pp, int flags)
1060 {
1061 	page_t		**ppp;
1062 	kmutex_t	*pcm;
1063 	uint_t		bin, mtype;
1064 	int		mnode;
1065 
1066 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1067 	ASSERT(PP_ISFREE(pp));
1068 	ASSERT(!hat_page_is_mapped(pp));
1069 	ASSERT(hat_page_getshare(pp) == 0);
1070 
1071 	/*
1072 	 * Large pages should be freed via page_list_add_pages().
1073 	 */
1074 	ASSERT(pp->p_szc == 0);
1075 
1076 	/*
1077 	 * Don't need to lock the freelist first here
1078 	 * because the page isn't on the freelist yet.
1079 	 * This means p_szc can't change on us.
1080 	 */
1081 
1082 	bin = PP_2_BIN(pp);
1083 	mnode = PP_2_MEM_NODE(pp);
1084 	mtype = PP_2_MTYPE(pp);
1085 
1086 	if (flags & PG_LIST_ISINIT) {
1087 		/*
1088 		 * PG_LIST_ISINIT is set during system startup (i.e. single
1089 		 * threaded), so add the page to the free list and to the
1090 		 * free region counters w/o any locking.
1091 		 */
1092 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1093 
1094 		/* inline version of page_add() */
1095 		if (*ppp != NULL) {
1096 			pp->p_next = *ppp;
1097 			pp->p_prev = (*ppp)->p_prev;
1098 			(*ppp)->p_prev = pp;
1099 			pp->p_prev->p_next = pp;
1100 		} else
1101 			*ppp = pp;
1102 
1103 		page_ctr_add_internal(mnode, mtype, pp, flags);
1104 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1105 	} else {
1106 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1107 
1108 		if (flags & PG_FREE_LIST) {
1109 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1110 			ASSERT(PP_ISAGED(pp));
1111 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1112 
1113 		} else {
1114 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1115 			ASSERT(pp->p_vnode);
1116 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1117 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1118 		}
1119 		mutex_enter(pcm);
1120 		page_add(ppp, pp);
1121 
1122 		if (flags & PG_LIST_TAIL)
1123 			*ppp = (*ppp)->p_next;
1124 		/*
1125 		 * Add counters before releasing pcm mutex to avoid a race with
1126 		 * page_freelist_coalesce and page_freelist_fill.
1127 		 */
1128 		page_ctr_add(mnode, mtype, pp, flags);
1129 		mutex_exit(pcm);
1130 	}
1131 
1132 
1133 #if defined(__sparc)
1134 	if (PP_ISNORELOC(pp)) {
1135 		kcage_freemem_add(1);
1136 	}
1137 #endif
1138 	/*
1139 	 * It is up to the caller to unlock the page!
1140 	 */
1141 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1142 }
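
/*
 * Illustrative sketch (not part of the build): a typical single-page free
 * path reaches page_list_add() with something like
 *
 *	PP_SETFREE(pp);
 *	PP_SETAGED(pp);
 *	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
 *	page_unlock(pp);
 *
 * since, as noted above, this routine leaves the page exclusively locked.
 */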
1143 
1144 
1145 #ifdef __sparc
1146 /*
1147  * This routine is only used by kcage_init during system startup.
1148  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1149  * without the overhead of taking locks and updating counters.
1150  */
1151 void
1152 page_list_noreloc_startup(page_t *pp)
1153 {
1154 	page_t		**ppp;
1155 	uint_t		bin;
1156 	int		mnode;
1157 	int		mtype;
1158 	int		flags = PG_LIST_ISCAGE;
1159 
1160 	/*
1161 	 * If this is a large page on the freelist then
1162 	 * break it up into smaller pages.
1163 	 */
1164 	if (pp->p_szc != 0)
1165 		page_boot_demote(pp);
1166 
1167 	/*
1168 	 * Get the list the page is currently on.
1169 	 */
1170 	bin = PP_2_BIN(pp);
1171 	mnode = PP_2_MEM_NODE(pp);
1172 	mtype = PP_2_MTYPE(pp);
1173 	ASSERT(mtype == MTYPE_RELOC);
1174 	ASSERT(pp->p_szc == 0);
1175 
1176 	if (PP_ISAGED(pp)) {
1177 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1178 		flags |= PG_FREE_LIST;
1179 	} else {
1180 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1181 		flags |= PG_CACHE_LIST;
1182 	}
1183 
1184 	ASSERT(*ppp != NULL);
1185 
1186 	/*
1187 	 * Delete page from current list.
1188 	 */
1189 	if (*ppp == pp)
1190 		*ppp = pp->p_next;		/* go to next page */
1191 	if (*ppp == pp) {
1192 		*ppp = NULL;			/* page list is gone */
1193 	} else {
1194 		pp->p_prev->p_next = pp->p_next;
1195 		pp->p_next->p_prev = pp->p_prev;
1196 	}
1197 
1198 	/* LINTED */
1199 	PLCNT_DECR(pp, mnode, mtype, 0, flags);
1200 
1201 	/*
1202 	 * Set no reloc for cage initted pages.
1203 	 */
1204 	PP_SETNORELOC(pp);
1205 
1206 	mtype = PP_2_MTYPE(pp);
1207 	ASSERT(mtype == MTYPE_NORELOC);
1208 
1209 	/*
1210 	 * Get new list for page.
1211 	 */
1212 	if (PP_ISAGED(pp)) {
1213 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1214 	} else {
1215 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1216 	}
1217 
1218 	/*
1219 	 * Insert page on new list.
1220 	 */
1221 	if (*ppp == NULL) {
1222 		*ppp = pp;
1223 		pp->p_next = pp->p_prev = pp;
1224 	} else {
1225 		pp->p_next = *ppp;
1226 		pp->p_prev = (*ppp)->p_prev;
1227 		(*ppp)->p_prev = pp;
1228 		pp->p_prev->p_next = pp;
1229 	}
1230 
1231 	/* LINTED */
1232 	PLCNT_INCR(pp, mnode, mtype, 0, flags);
1233 
1234 	/*
1235 	 * Update cage freemem counter
1236 	 */
1237 	atomic_add_long(&kcage_freemem, 1);
1238 }
1239 #else	/* __sparc */
1240 
1241 /* ARGSUSED */
1242 void
1243 page_list_noreloc_startup(page_t *pp)
1244 {
1245 	panic("page_list_noreloc_startup: should be here only for sparc");
1246 }
1247 #endif
1248 
1249 void
1250 page_list_add_pages(page_t *pp, int flags)
1251 {
1252 	kmutex_t *pcm;
1253 	pgcnt_t	pgcnt;
1254 	uint_t	bin, mtype, i;
1255 	int	mnode;
1256 
1257 	/* default to freelist/head */
1258 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1259 
1260 	CHK_LPG(pp, pp->p_szc);
1261 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1262 
1263 	bin = PP_2_BIN(pp);
1264 	mnode = PP_2_MEM_NODE(pp);
1265 	mtype = PP_2_MTYPE(pp);
1266 
1267 	if (flags & PG_LIST_ISINIT) {
1268 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1269 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1270 		ASSERT(!PP_ISNORELOC(pp));
1271 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1272 	} else {
1273 
1274 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1275 
1276 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1277 
1278 		mutex_enter(pcm);
1279 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1280 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1281 		mutex_exit(pcm);
1282 
1283 		pgcnt = page_get_pagecnt(pp->p_szc);
1284 #if defined(__sparc)
1285 		if (PP_ISNORELOC(pp))
1286 			kcage_freemem_add(pgcnt);
1287 #endif
1288 		for (i = 0; i < pgcnt; i++, pp++)
1289 			page_unlock(pp);
1290 	}
1291 }
1292 
1293 /*
1294  * During boot, we need to demote a large page to base
1295  * pagesize pages for seg_kmem to use in boot_alloc().
1296  */
1297 void
1298 page_boot_demote(page_t *pp)
1299 {
1300 	ASSERT(pp->p_szc != 0);
1301 	ASSERT(PP_ISFREE(pp));
1302 	ASSERT(PP_ISAGED(pp));
1303 
1304 	(void) page_demote(PP_2_MEM_NODE(pp),
1305 	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
1306 	    PC_FREE);
1307 
1308 	ASSERT(PP_ISFREE(pp));
1309 	ASSERT(PP_ISAGED(pp));
1310 	ASSERT(pp->p_szc == 0);
1311 }
1312 
1313 /*
1314  * Take a particular page off of whatever freelist the page
1315  * is claimed to be on.
1316  *
1317  * NOTE: Only used for PAGESIZE pages.
1318  */
1319 void
1320 page_list_sub(page_t *pp, int flags)
1321 {
1322 	int		bin;
1323 	uint_t		mtype;
1324 	int		mnode;
1325 	kmutex_t	*pcm;
1326 	page_t		**ppp;
1327 
1328 	ASSERT(PAGE_EXCL(pp));
1329 	ASSERT(PP_ISFREE(pp));
1330 
1331 	/*
1332 	 * The p_szc field can only be changed by page_promote()
1333 	 * and page_demote(). Only free pages can be promoted and
1334 	 * demoted and the free list MUST be locked during these
1335 	 * operations. So to prevent a race in page_list_sub()
1336 	 * between computing which bin of the freelist lock to
1337 	 * grab and actually grabbing the lock, we check again that
1338 	 * the bin we locked is still the correct one. Notice that
1339 	 * the p_szc field could have actually changed on us but
1340 	 * if the bin happens to still be the same we are safe.
1341 	 */
1342 try_again:
1343 	bin = PP_2_BIN(pp);
1344 	mnode = PP_2_MEM_NODE(pp);
1345 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1346 	mutex_enter(pcm);
1347 	if (PP_2_BIN(pp) != bin) {
1348 		mutex_exit(pcm);
1349 		goto try_again;
1350 	}
1351 	mtype = PP_2_MTYPE(pp);
1352 
1353 	if (flags & PG_FREE_LIST) {
1354 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1355 		ASSERT(PP_ISAGED(pp));
1356 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1357 	} else {
1358 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1359 		ASSERT(!PP_ISAGED(pp));
1360 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1361 	}
1362 
1363 	/*
1364 	 * Common PAGESIZE case.
1365 	 *
1366 	 * Note that we locked the freelist. This prevents
1367 	 * any page promotion/demotion operations. Therefore
1368 	 * the p_szc will not change until we drop pcm mutex.
1369 	 */
1370 	if (pp->p_szc == 0) {
1371 		page_sub(ppp, pp);
1372 		/*
1373 		 * Subtract counters before releasing pcm mutex
1374 		 * to avoid race with page_freelist_coalesce.
1375 		 */
1376 		page_ctr_sub(mnode, mtype, pp, flags);
1377 		mutex_exit(pcm);
1378 
1379 #if defined(__sparc)
1380 		if (PP_ISNORELOC(pp)) {
1381 			kcage_freemem_sub(1);
1382 		}
1383 #endif
1384 		return;
1385 	}
1386 
1387 	/*
1388 	 * Large pages on the cache list are not supported.
1389 	 */
1390 	if (flags & PG_CACHE_LIST)
1391 		panic("page_list_sub: large page on cachelist");
1392 
1393 	/*
1394 	 * Slow but rare.
1395 	 *
1396 	 * Somebody wants this particular page which is part
1397 	 * of a large page. In this case we just demote the page
1398 	 * if it's on the freelist.
1399 	 *
1400 	 * We have to drop pcm before locking the entire freelist.
1401 	 * Once we have re-locked the freelist check to make sure
1402 	 * the page hasn't already been demoted or completely
1403 	 * freed.
1404 	 */
1405 	mutex_exit(pcm);
1406 	page_freelist_lock(mnode);
1407 	if (pp->p_szc != 0) {
1408 		/*
1409 		 * Large page is on freelist.
1410 		 */
1411 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1412 		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1413 	}
1414 	ASSERT(PP_ISFREE(pp));
1415 	ASSERT(PP_ISAGED(pp));
1416 	ASSERT(pp->p_szc == 0);
1417 
1418 	/*
1419 	 * Subtract counters before releasing pcm mutex
1420 	 * to avoid race with page_freelist_coalesce.
1421 	 */
1422 	bin = PP_2_BIN(pp);
1423 	mtype = PP_2_MTYPE(pp);
1424 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1425 
1426 	page_sub(ppp, pp);
1427 	page_ctr_sub(mnode, mtype, pp, flags);
1428 	page_freelist_unlock(mnode);
1429 
1430 #if defined(__sparc)
1431 	if (PP_ISNORELOC(pp)) {
1432 		kcage_freemem_sub(1);
1433 	}
1434 #endif
1435 }
1436 
1437 void
1438 page_list_sub_pages(page_t *pp, uint_t szc)
1439 {
1440 	kmutex_t *pcm;
1441 	uint_t	bin, mtype;
1442 	int	mnode;
1443 
1444 	ASSERT(PAGE_EXCL(pp));
1445 	ASSERT(PP_ISFREE(pp));
1446 	ASSERT(PP_ISAGED(pp));
1447 
1448 	/*
1449 	 * See comment in page_list_sub().
1450 	 */
1451 try_again:
1452 	bin = PP_2_BIN(pp);
1453 	mnode = PP_2_MEM_NODE(pp);
1454 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1455 	mutex_enter(pcm);
1456 	if (PP_2_BIN(pp) != bin) {
1457 		mutex_exit(pcm);
1458 		goto	try_again;
1459 	}
1460 
1461 	/*
1462 	 * If we're called with a page larger than szc, or it got
1463 	 * promoted above szc before we locked the freelist, then
1464 	 * drop pcm and re-lock the entire freelist.  If the page is
1465 	 * still larger than szc then demote it.
1466 	 */
1467 	if (pp->p_szc > szc) {
1468 		mutex_exit(pcm);
1469 		pcm = NULL;
1470 		page_freelist_lock(mnode);
1471 		if (pp->p_szc > szc) {
1472 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1473 			(void) page_demote(mnode,
1474 			    PFN_BASE(pp->p_pagenum, pp->p_szc),
1475 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1476 		}
1477 		bin = PP_2_BIN(pp);
1478 	}
1479 	ASSERT(PP_ISFREE(pp));
1480 	ASSERT(PP_ISAGED(pp));
1481 	ASSERT(pp->p_szc <= szc);
1482 	ASSERT(pp == PP_PAGEROOT(pp));
1483 
1484 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1485 
1486 	mtype = PP_2_MTYPE(pp);
1487 	if (pp->p_szc != 0) {
1488 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1489 		CHK_LPG(pp, pp->p_szc);
1490 	} else {
1491 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1492 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1493 	}
1494 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1495 
1496 	if (pcm != NULL) {
1497 		mutex_exit(pcm);
1498 	} else {
1499 		page_freelist_unlock(mnode);
1500 	}
1501 
1502 #if defined(__sparc)
1503 	if (PP_ISNORELOC(pp)) {
1504 		pgcnt_t	pgcnt;
1505 
1506 		pgcnt = page_get_pagecnt(pp->p_szc);
1507 		kcage_freemem_sub(pgcnt);
1508 	}
1509 #endif
1510 }
1511 
1512 /*
1513  * Add the page to the front of a linked list of pages
1514  * using the p_next & p_prev pointers for the list.
1515  * The caller is responsible for protecting the list pointers.
1516  */
1517 void
1518 mach_page_add(page_t **ppp, page_t *pp)
1519 {
1520 	if (*ppp == NULL) {
1521 		pp->p_next = pp->p_prev = pp;
1522 	} else {
1523 		pp->p_next = *ppp;
1524 		pp->p_prev = (*ppp)->p_prev;
1525 		(*ppp)->p_prev = pp;
1526 		pp->p_prev->p_next = pp;
1527 	}
1528 	*ppp = pp;
1529 }
1530 
1531 /*
1532  * Remove this page from a linked list of pages
1533  * using the p_next & p_prev pointers for the list.
1534  *
1535  * The caller is responsible for protecting the list pointers.
1536  */
1537 void
1538 mach_page_sub(page_t **ppp, page_t *pp)
1539 {
1540 	ASSERT(PP_ISFREE(pp));
1541 
1542 	if (*ppp == NULL || pp == NULL)
1543 		panic("mach_page_sub");
1544 
1545 	if (*ppp == pp)
1546 		*ppp = pp->p_next;		/* go to next page */
1547 
1548 	if (*ppp == pp)
1549 		*ppp = NULL;			/* page list is gone */
1550 	else {
1551 		pp->p_prev->p_next = pp->p_next;
1552 		pp->p_next->p_prev = pp->p_prev;
1553 	}
1554 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1555 }
1556 
1557 /*
1558  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1559  */
1560 void
1561 page_promote_size(page_t *pp, uint_t cur_szc)
1562 {
1563 	pfn_t pfn;
1564 	int mnode;
1565 	int idx;
1566 	int new_szc = cur_szc + 1;
1567 	int full = FULL_REGION_CNT(new_szc);
1568 
1569 	pfn = page_pptonum(pp);
1570 	mnode = PFN_2_MEM_NODE(pfn);
1571 
1572 	page_freelist_lock(mnode);
1573 
1574 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1575 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1576 		(void) page_promote(mnode, pfn, new_szc, PC_FREE);
1577 
1578 	page_freelist_unlock(mnode);
1579 }
1580 
1581 static uint_t page_promote_err;
1582 static uint_t page_promote_noreloc_err;
1583 
1584 /*
1585  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1586  * for the given mnode starting at pfnum. Pages involved are on the freelist
1587  * before the call and may be returned to the caller if requested, otherwise
1588  * they will be placed back on the freelist.
1589  * If flags is PC_ALLOC, then the large page will be returned to the user in
1590  * a state which is consistent with a page being taken off the freelist.  If
1591  * we failed to lock the new large page, then we will return NULL to the
1592  * caller and put the large page on the freelist instead.
1593  * If flags is PC_FREE, then the large page will be placed on the freelist,
1594  * and NULL will be returned.
1595  * The caller is responsible for locking the freelist as well as any other
1596  * accounting which needs to be done for a returned page.
1597  *
1598  * RFE: For performance pass in pp instead of pfnum so
1599  * 	we can avoid excessive calls to page_numtopp_nolock().
1600  *	This would depend on an assumption that all contiguous
1601  *	pages are in the same memseg so we can just add/dec
1602  *	our pp.
1603  *
1604  * Lock ordering:
1605  *
1606  *	There is a potential but rare deadlock situation
1607  *	for page promotion and demotion operations. The problem
1608  *	is there are two paths into the freelist manager and
1609  *	they have different lock orders:
1610  *
1611  *	page_create()
1612  *		lock freelist
1613  *		page_lock(EXCL)
1614  *		unlock freelist
1615  *		return
1616  *		caller drops page_lock
1617  *
1618  *	page_free() and page_reclaim()
1619  *		caller grabs page_lock(EXCL)
1620  *
1621  *		lock freelist
1622  *		unlock freelist
1623  *		drop page_lock
1624  *
1625  *	What prevents a thread in page_create() from deadlocking
1626  *	with a thread freeing or reclaiming the same page is the
1627  *	page_trylock() in page_get_freelist(). If the trylock fails
1628  *	it skips the page.
1629  *
1630  *	The lock ordering for promotion and demotion is the same as
1631  *	for page_create(). Since the same deadlock could occur during
1632  *	page promotion and freeing or reclaiming of a page on the
1633  *	cache list, we might have to fail the operation and undo what
1634  *	we have done so far.  Again, this is rare.
1635  */
1636 page_t *
1637 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
1638 {
1639 	page_t		*pp, *pplist, *tpp, *start_pp;
1640 	pgcnt_t		new_npgs, npgs;
1641 	uint_t		bin;
1642 	pgcnt_t		tmpnpgs, pages_left;
1643 	uint_t		mtype;
1644 	uint_t		noreloc;
1645 	uint_t 		i;
1646 	int 		which_list;
1647 	ulong_t		index;
1648 	kmutex_t	*phm;
1649 
1650 	/*
1651 	 * General algorithm:
1652 	 * Find the starting page
1653 	 * Walk each page struct removing it from the freelist,
1654 	 * and linking it to all the other pages removed.
1655 	 * Once all pages are off the freelist,
1656 	 * walk the list, modifying p_szc to new_szc and doing whatever
1657 	 * else is needed to create a large free page.
1658 	 * According to the flags, either return the page or put it
1659 	 * on the freelist.
1660 	 */
1661 
1662 	start_pp = page_numtopp_nolock(pfnum);
1663 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1664 	new_npgs = page_get_pagecnt(new_szc);
1665 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1666 
1667 	/*
1668 	 * Loop through smaller pages to confirm that all pages
1669 	 * give the same result for PP_ISNORELOC().
1670 	 * We can check this reliably here as the protocol for setting
1671 	 * P_NORELOC requires pages to be taken off the free list first.
1672 	 */
1673 	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
1674 		if (pp == start_pp) {
1675 			/* First page, set requirement. */
1676 			noreloc = PP_ISNORELOC(pp);
1677 		} else if (noreloc != PP_ISNORELOC(pp)) {
1678 			page_promote_noreloc_err++;
1679 			page_promote_err++;
1680 			return (NULL);
1681 		}
1682 	}
1683 
1684 	pages_left = new_npgs;
1685 	pplist = NULL;
1686 	pp = start_pp;
1687 
1688 	/* Loop around coalescing the smaller pages into a big page. */
1689 	while (pages_left) {
1690 		/*
1691 		 * Remove from the freelist.
1692 		 */
1693 		ASSERT(PP_ISFREE(pp));
1694 		bin = PP_2_BIN(pp);
1695 		ASSERT(mnode == PP_2_MEM_NODE(pp));
1696 		mtype = PP_2_MTYPE(pp);
1697 		if (PP_ISAGED(pp)) {
1698 
1699 			/*
1700 			 * PG_FREE_LIST
1701 			 */
1702 			if (pp->p_szc) {
1703 				page_vpsub(&PAGE_FREELISTS(mnode,
1704 				    pp->p_szc, bin, mtype), pp);
1705 			} else {
1706 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1707 				    bin, mtype), pp);
1708 			}
1709 			which_list = PG_FREE_LIST;
1710 		} else {
1711 			ASSERT(pp->p_szc == 0);
1712 
1713 			/*
1714 			 * PG_CACHE_LIST
1715 			 *
1716 			 * Since this page comes from the
1717 			 * cachelist, we must destroy the
1718 			 * vnode association.
1719 			 */
1720 			if (!page_trylock(pp, SE_EXCL)) {
1721 				goto fail_promote;
1722 			}
1723 
1724 			/*
1725 			 * We need to be careful not to deadlock
1726 			 * with another thread in page_lookup().
1727 			 * The page_lookup() thread could be holding
1728 			 * the same phm that we need if the two
1729 			 * pages happen to hash to the same phm lock.
1730 			 * At this point we have locked the entire
1731 			 * freelist and page_lookup() could be trying
1732 			 * to grab a freelist lock.
1733 			 */
1734 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
1735 			phm = PAGE_HASH_MUTEX(index);
1736 			if (!mutex_tryenter(phm)) {
1737 				page_unlock(pp);
1738 				goto fail_promote;
1739 			}
1740 
1741 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
1742 			page_hashout(pp, phm);
1743 			mutex_exit(phm);
1744 			PP_SETAGED(pp);
1745 			page_unlock(pp);
1746 			which_list = PG_CACHE_LIST;
1747 		}
1748 		page_ctr_sub(mnode, mtype, pp, which_list);
1749 
1750 		/*
1751 		 * Concatenate the smaller page(s) onto
1752 		 * the large page list.
1753 		 */
1754 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
1755 		pages_left -= npgs;
1756 		tpp = pp;
1757 		while (npgs--) {
1758 			tpp->p_szc = new_szc;
1759 			tpp = tpp->p_next;
1760 		}
1761 		page_list_concat(&pplist, &pp);
1762 		pp += tmpnpgs;
1763 	}
1764 	CHK_LPG(pplist, new_szc);
1765 
1766 	/*
1767 	 * return the page to the user if requested
1768 	 * in the properly locked state.
1769 	 */
1770 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
1771 		return (pplist);
1772 	}
1773 
1774 	/*
1775 	 * Otherwise place the new large page on the freelist
1776 	 */
1777 	bin = PP_2_BIN(pplist);
1778 	mnode = PP_2_MEM_NODE(pplist);
1779 	mtype = PP_2_MTYPE(pplist);
1780 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
1781 
1782 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
1783 	return (NULL);
1784 
1785 fail_promote:
1786 	/*
1787 	 * A thread must have still been freeing or
1788 	 * reclaiming the page on the cachelist.
1789 	 * To prevent a deadlock, undo what we have
1790 	 * done so far and return failure.  This
1791 	 * situation can only happen while promoting
1792 	 * PAGESIZE pages.
1793 	 */
1794 	page_promote_err++;
1795 	while (pplist) {
1796 		pp = pplist;
1797 		mach_page_sub(&pplist, pp);
1798 		pp->p_szc = 0;
1799 		bin = PP_2_BIN(pp);
1800 		mtype = PP_2_MTYPE(pp);
1801 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
1802 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1803 	}
1804 	return (NULL);
1805 
1806 }
1807 
1808 /*
1809  * Break up a large page into smaller size pages.
1810  * Pages involved are on the freelist before the call and may
1811  * be returned to the caller if requested, otherwise they will
1812  * be placed back on the freelist.
1813  * The caller is responsible for locking the freelist as well as any other
1814  * accounting which needs to be done for a returned page.
1815  * If flags is not PC_ALLOC, the color argument is ignored; technically any
1816  * value may be passed in, but PC_NO_COLOR is the convention and should be
1817  * used for clarity's sake.
1818  */
1819 page_t *
1820 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
1821     int color, int flags)
1822 {
1823 	page_t	*pp, *pplist, *npplist;
1824 	pgcnt_t	npgs, n;
1825 	uint_t	bin;
1826 	uint_t	mtype;
1827 	page_t	*ret_pp = NULL;
1828 
1829 	ASSERT(cur_szc != 0);
1830 	ASSERT(new_szc < cur_szc);
1831 
1832 	pplist = page_numtopp_nolock(pfnum);
1833 	ASSERT(pplist != NULL);
1834 
1835 	ASSERT(pplist->p_szc == cur_szc);
1836 
1837 	bin = PP_2_BIN(pplist);
1838 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
1839 	mtype = PP_2_MTYPE(pplist);
1840 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
1841 
1842 	CHK_LPG(pplist, cur_szc);
1843 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
1844 
1845 	/*
1846 	 * Number of PAGESIZE pages for smaller new_szc
1847 	 * page.
1848 	 */
1849 	npgs = page_get_pagecnt(new_szc);
1850 
1851 	while (pplist) {
1852 		pp = pplist;
1853 
1854 		ASSERT(pp->p_szc == cur_szc);
1855 
1856 		/*
1857 		 * We either break it up into PAGESIZE pages or larger.
1858 		 */
1859 		if (npgs == 1) {	/* PAGESIZE case */
1860 			mach_page_sub(&pplist, pp);
1861 			ASSERT(pp->p_szc == cur_szc);
1862 			ASSERT(new_szc == 0);
1863 			ASSERT(mnode == PP_2_MEM_NODE(pp));
1864 			pp->p_szc = new_szc;
1865 			bin = PP_2_BIN(pp);
1866 			if ((bin == color) && (flags == PC_ALLOC) &&
1867 			    (ret_pp == NULL) &&
1868 			    page_trylock_cons(pp, SE_EXCL)) {
1869 				ret_pp = pp;
1870 			} else {
1871 				mtype = PP_2_MTYPE(pp);
1872 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
1873 				    mtype), pp);
1874 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1875 			}
1876 		} else {
1877 
1878 			/*
1879 			 * Break down into smaller lists of pages.
1880 			 */
1881 			page_list_break(&pplist, &npplist, npgs);
1882 
1883 			pp = pplist;
1884 			n = npgs;
1885 			while (n--) {
1886 				ASSERT(pp->p_szc == cur_szc);
1887 				pp->p_szc = new_szc;
1888 				pp = pp->p_next;
1889 			}
1890 
1891 			CHK_LPG(pplist, new_szc);
1892 
1893 			bin = PP_2_BIN(pplist);
1894 			ASSERT(mnode == PP_2_MEM_NODE(pp));
1895 			if ((bin == color) && (flags == PC_ALLOC) &&
1896 			    (ret_pp == NULL) &&
1897 			    page_trylock_cons(pp, SE_EXCL)) {
1898 				ret_pp = pp;
1899 			} else {
1900 				mtype = PP_2_MTYPE(pp);
1901 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
1902 				    bin, mtype), pplist);
1903 
1904 				page_ctr_add(mnode, mtype, pplist,
1905 				    PG_FREE_LIST);
1906 			}
1907 			pplist = npplist;
1908 		}
1909 	}
1910 	return (ret_pp);
1911 }
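
/*
 * A minimal usage sketch, mirroring page_demote_free_pages() at the end of
 * this file: to break a free large page 'pp' back into base pages and leave
 * them on the freelist, a caller holds the freelist lock and requests no
 * return page:
 *
 *	page_freelist_lock(mnode);
 *	(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
 *	    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
 *	page_freelist_unlock(mnode);
 */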
1912 
1913 int mpss_coalesce_disable = 0;
1914 
1915 /*
1916  * Coalesce free pages into a page of the given szc and color if possible.
1917  * Return the pointer to the page created, otherwise, return NULL.
1918  */
1919 static page_t *
1920 page_freelist_coalesce(int mnode, uchar_t szc, int color)
1921 {
1922 	int 	r;		/* region size */
1923 	int 	idx, full, i;
1924 	pfn_t	pfnum;
1925 	size_t	len;
1926 	size_t	buckets_to_check;
1927 	pgcnt_t	cands;
1928 	page_t	*ret_pp;
1929 	int	color_stride;
1930 
1931 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);
1932 
1933 	if (mpss_coalesce_disable) {
1934 		return (NULL);
1935 	}
1936 
1937 	r = szc;
1938 	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
1939 	if (cands == 0) {
1940 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
1941 		return (NULL);
1942 	}
1943 	full = FULL_REGION_CNT(r);
1944 	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
1945 	    page_colors;
1946 
1947 	/* Prevent page_counters dynamic memory from being freed */
1948 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
1949 	len  = PAGE_COUNTERS_ENTRIES(mnode, r);
1950 	buckets_to_check = len / color_stride;
1951 	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
1952 	ASSERT((idx % color_stride) == color);
1953 	idx += color_stride;
1954 	if (idx >= len)
1955 		idx = color;
1956 	for (i = 0; i < buckets_to_check; i++) {
1957 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
1958 			pfnum = IDX_TO_PNUM(mnode, r, idx);
1959 			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
1960 			    pfnum < mem_node_config[mnode].physmax);
1961 			/*
1962 			 * RFE: For performance maybe we can do something less
1963 			 *	brutal than locking the entire freelist. So far
1964 			 * 	this doesn't seem to be a performance problem?
1965 			 */
1966 			page_freelist_lock(mnode);
1967 			if (PAGE_COUNTERS(mnode, r, idx) != full) {
1968 				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
1969 				goto skip_this_one;
1970 			}
1971 			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
1972 			if (ret_pp != NULL) {
1973 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
1974 				    idx;
1975 				page_freelist_unlock(mnode);
1976 				rw_exit(&page_ctrs_rwlock[mnode]);
1977 #if defined(__sparc)
1978 				if (PP_ISNORELOC(ret_pp)) {
1979 					pgcnt_t npgs;
1980 
1981 					npgs = page_get_pagecnt(ret_pp->p_szc);
1982 					kcage_freemem_sub(npgs);
1983 				}
1984 #endif
1985 				return (ret_pp);
1986 			}
1987 skip_this_one:
1988 			page_freelist_unlock(mnode);
1989 			/*
1990 			 * No point looking for another page if we've
1991 			 * already tried all of the ones that
1992 			 * page_ctr_cands indicated.  Stash off where we left
1993 			 * off.
1994 			 * Note: this is not exact since we don't hold the
1995 			 * page_freelist_locks before we initially get the
1996 			 * value of cands for performance reasons, but should
1997 			 * be a decent approximation.
1998 			 */
1999 			if (--cands == 0) {
2000 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
2001 				    idx;
2002 				break;
2003 			}
2004 		}
2005 		idx += color_stride;
2006 		if (idx >= len)
2007 			idx = color;
2008 	}
2009 	rw_exit(&page_ctrs_rwlock[mnode]);
2010 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
2011 	return (NULL);
2012 }
2013 
2014 /*
2015  * For the given mnode, promote as many small pages to large pages as possible.
2016  */
2017 void
2018 page_freelist_coalesce_all(int mnode)
2019 {
2020 	int 	r;		/* region size */
2021 	int 	idx, full;
2022 	pfn_t	pfnum;
2023 	size_t	len;
2024 
2025 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2026 
2027 	if (mpss_coalesce_disable) {
2028 		return;
2029 	}
2030 
2031 	/*
2032 	 * Lock the entire freelist and coalesce what we can.
2033 	 *
2034 	 * Always promote to the largest page possible
2035 	 * first to reduce the number of page promotions.
2036 	 */
2037 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2038 	page_freelist_lock(mnode);
2039 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2040 		pgcnt_t cands;
2041 
2042 		PGCTRS_CANDS_GETVALUE(mnode, r, cands);
2043 		if (cands == 0) {
2044 			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
2045 			continue;
2046 		}
2047 
2048 		full = FULL_REGION_CNT(r);
2049 		len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2050 
2051 		for (idx = 0; idx < len; idx++) {
2052 			if (PAGE_COUNTERS(mnode, r, idx) == full) {
2053 				pfnum = IDX_TO_PNUM(mnode, r, idx);
2054 				ASSERT(pfnum >=
2055 				    mem_node_config[mnode].physbase &&
2056 				    pfnum <
2057 				    mem_node_config[mnode].physmax);
2058 				(void) page_promote(mnode, pfnum, r, PC_FREE);
2059 			}
2060 		}
2061 	}
2062 	page_freelist_unlock(mnode);
2063 	rw_exit(&page_ctrs_rwlock[mnode]);
2064 }
2065 
2066 /*
2067  * This is where all policies for moving pages around
2068  * to different page size free lists are implemented.
2069  * Returns a page on success, NULL on failure.
2070  *
2071  * So far these are the priorities for this algorithm in descending
2072  * order:
2073  *
2074  *	1) When servicing a request try to do so with a free page
2075  *	   from next size up. Helps defer fragmentation as long
2076  *	   as possible.
2077  *
2078  *	2) Page coalesce on demand. Only when a freelist
2079  *	   larger than PAGESIZE is empty and step 1
2080  *	   will not work since all larger size lists are
2081  *	   also empty.
2082  *
2083  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2084  */
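/*
 * A hedged usage sketch (hypothetical caller; 'c' and 'mtype' are assumed to
 * be a valid color and memory type for the platform): to backfill the szc 1
 * freelist of color 'c' on mnode 0 with no pfn ceiling, a caller that does
 * not already hold the freelist lock could use:
 *
 *	pp = page_freelist_fill(1, c, 0, mtype, PFNNULL);
 *
 * which first tries to demote a larger free page of the matching color and,
 * failing that, falls back to page_freelist_coalesce().
 */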
2085 page_t *
2086 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
2087 {
2088 	uchar_t nszc = szc + 1;
2089 	int 	bin;
2090 	page_t	*pp, *firstpp;
2091 	page_t	*ret_pp = NULL;
2092 
2093 	ASSERT(szc < mmu_page_sizes);
2094 
2095 	VM_STAT_ADD(vmm_vmstats.pff_req[szc]);
2096 	/*
2097 	 * First try to break up a larger page to fill
2098 	 * current size freelist.
2099 	 */
2100 	while (nszc < mmu_page_sizes) {
2101 		/*
2102 		 * If page found then demote it.
2103 		 */
2104 		bin = page_convert_color(szc, nszc, color);
2105 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2106 			page_freelist_lock(mnode);
2107 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2108 
2109 			/*
2110 			 * If pfnhi is not PFNNULL, look for large page below
2111 			 * pfnhi. PFNNULL signifies no pfn requirement.
2112 			 */
2113 			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
2114 				do {
2115 					pp = pp->p_vpnext;
2116 					if (pp == firstpp) {
2117 						pp = NULL;
2118 						break;
2119 					}
2120 				} while (pp->p_pagenum >= pfnhi);
2121 			}
2122 			if (pp) {
2123 				ASSERT(pp->p_szc == nszc);
2124 				VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]);
2125 				ret_pp = page_demote(mnode, pp->p_pagenum,
2126 				    pp->p_szc, szc, color, PC_ALLOC);
2127 				if (ret_pp) {
2128 					page_freelist_unlock(mnode);
2129 #if defined(__sparc)
2130 					if (PP_ISNORELOC(ret_pp)) {
2131 						pgcnt_t npgs;
2132 
2133 						npgs = page_get_pagecnt(
2134 						    ret_pp->p_szc);
2135 						kcage_freemem_sub(npgs);
2136 					}
2137 #endif
2138 					return (ret_pp);
2139 				}
2140 			}
2141 			page_freelist_unlock(mnode);
2142 		}
2143 		nszc++;
2144 	}
2145 
2146 	/*
2147 	 * Ok that didn't work. Time to coalesce.
2148 	 */
2149 	if (szc != 0) {
2150 		ret_pp = page_freelist_coalesce(mnode, szc, color);
2151 		VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]);
2152 	}
2153 
2154 	return (ret_pp);
2155 }
2156 
2157 /*
2158  * Helper routine used only by the freelist code to lock
2159  * a page. If the page is a large page then it succeeds in
2160  * locking all the constituent pages or none at all.
2161  * Returns 1 on success, 0 on failure.
2162  */
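/*
 * For example (illustrative): on a 4-constituent large page whose third
 * constituent is already locked by another thread, the first two trylocks
 * below succeed, the third fails, the two pages locked so far are unlocked
 * again and 0 is returned.
 */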
2163 static int
2164 page_trylock_cons(page_t *pp, se_t se)
2165 {
2166 	page_t	*tpp, *first_pp = pp;
2167 
2168 	/*
2169 	 * Fail if can't lock first or only page.
2170 	 */
2171 	if (!page_trylock(pp, se)) {
2172 		return (0);
2173 	}
2174 
2175 	/*
2176 	 * PAGESIZE: common case.
2177 	 */
2178 	if (pp->p_szc == 0) {
2179 		return (1);
2180 	}
2181 
2182 	/*
2183 	 * Large page case.
2184 	 */
2185 	tpp = pp->p_next;
2186 	while (tpp != pp) {
2187 		if (!page_trylock(tpp, se)) {
2188 			/*
2189 			 * On failure unlock what we
2190 			 * have locked so far.
2191 			 */
2192 			while (first_pp != tpp) {
2193 				page_unlock(first_pp);
2194 				first_pp = first_pp->p_next;
2195 			}
2196 			return (0);
2197 		}
2198 		tpp = tpp->p_next;
2199 	}
2200 	return (1);
2201 }
2202 
2203 page_t *
2204 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2205     uint_t flags)
2206 {
2207 	kmutex_t	*pcm;
2208 	int		i, fill_tried, fill_marker;
2209 	page_t		*pp, *first_pp;
2210 	uint_t		bin_marker;
2211 	int		colors, cpucolors;
2212 	uchar_t		nszc;
2213 	uint_t		nszc_color_shift;
2214 	int		nwaybins = 0, nwaycnt;
2215 
2216 	ASSERT(szc < mmu_page_sizes);
2217 
2218 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2219 
2220 	MTYPE_START(mnode, mtype, flags);
2221 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2222 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2223 		return (NULL);
2224 	}
2225 
2226 	/*
2227 	 * Set how many physical colors for this page size.
2228 	 */
2229 	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
2230 	    page_colors;
2231 
2232 	nszc = MIN(szc + 1, mmu_page_sizes - 1);
2233 	nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);
2234 
2235 	/* cpu_page_colors is non-zero if a page color may be in > 1 bin */
2236 	cpucolors = cpu_page_colors;
2237 
2238 	/*
2239 	 * adjust cpucolors to possibly check additional 'equivalent' bins
2240 	 * to try to minimize fragmentation of large pages by delaying calls
2241 	 * to page_freelist_fill.
2242 	 */
2243 	if (colorequiv > 1) {
2244 		int equivcolors = colors / colorequiv;
2245 
2246 		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
2247 			cpucolors = equivcolors;
2248 	}
2249 
2250 	ASSERT(colors <= page_colors);
2251 	ASSERT(colors);
2252 	ASSERT((colors & (colors - 1)) == 0);
2253 
2254 	ASSERT(bin < colors);
2255 
2256 	/*
2257 	 * Only hold one freelist lock at a time, that way we
2258 	 * can start anywhere and not have to worry about lock
2259 	 * ordering.
2260 	 */
2261 big_try_again:
2262 	fill_tried = 0;
2263 	nwaycnt = 0;
2264 	for (i = 0; i <= colors; i++) {
2265 try_again:
2266 		ASSERT(bin < colors);
2267 		if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
2268 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2269 			mutex_enter(pcm);
2270 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2271 			if (pp != NULL) {
2272 				/*
2273 				 * These were set before the page
2274 				 * was put on the free list,
2275 				 * they must still be set.
2276 				 */
2277 				ASSERT(PP_ISFREE(pp));
2278 				ASSERT(PP_ISAGED(pp));
2279 				ASSERT(pp->p_vnode == NULL);
2280 				ASSERT(pp->p_hash == NULL);
2281 				ASSERT(pp->p_offset == (u_offset_t)-1);
2282 				ASSERT(pp->p_szc == szc);
2283 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2284 
2285 				/*
2286 				 * Walk down the freelist bin chain.
2287 				 * 8k pages are linked on p_next
2288 				 * and p_prev fields. Large pages
2289 				 * are a contiguous group of
2290 				 * constituent pages linked together
2291 				 * on their p_next and p_prev fields.
2292 				 * The large pages are linked together
2293 				 * on the bin chain using the p_vpnext
2294 				 * and p_vpprev fields of the base
2295 				 * constituent page of each large page.
2296 				 */
2297 				first_pp = pp;
2298 				while (!page_trylock_cons(pp, SE_EXCL)) {
2299 					if (szc == 0) {
2300 						pp = pp->p_next;
2301 					} else {
2302 						pp = pp->p_vpnext;
2303 					}
2304 
2305 					ASSERT(PP_ISFREE(pp));
2306 					ASSERT(PP_ISAGED(pp));
2307 					ASSERT(pp->p_vnode == NULL);
2308 					ASSERT(pp->p_hash == NULL);
2309 					ASSERT(pp->p_offset == (u_offset_t)-1);
2310 					ASSERT(pp->p_szc == szc);
2311 					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
2312 							mnode);
2313 
2314 					if (pp == first_pp) {
2315 						pp = NULL;
2316 						break;
2317 					}
2318 				}
2319 
2320 				if (pp) {
2321 					ASSERT(mtype == PP_2_MTYPE(pp));
2322 					ASSERT(pp->p_szc == szc);
2323 					if (szc == 0) {
2324 						page_sub(&PAGE_FREELISTS(mnode,
2325 						    szc, bin, mtype), pp);
2326 					} else {
2327 						page_vpsub(&PAGE_FREELISTS(
2328 						    mnode, szc, bin, mtype),
2329 						    pp);
2330 						CHK_LPG(pp, szc);
2331 					}
2332 					page_ctr_sub(mnode, mtype, pp,
2333 					    PG_FREE_LIST);
2334 
2335 					if ((PP_ISFREE(pp) == 0) ||
2336 					    (PP_ISAGED(pp) == 0))
2337 						panic("free page is not. pp %p",
2338 						    (void *)pp);
2339 					mutex_exit(pcm);
2340 
2341 #if defined(__sparc)
2342 					ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2343 					    (flags & PG_NORELOC) == 0);
2344 
2345 					if (PP_ISNORELOC(pp)) {
2346 						pgcnt_t	npgs;
2347 
2348 						npgs = page_get_pagecnt(szc);
2349 						kcage_freemem_sub(npgs);
2350 					}
2351 #endif
2352 					VM_STAT_ADD(vmm_vmstats.
2353 					    pgmf_allocok[szc]);
2354 					return (pp);
2355 				}
2356 			}
2357 			mutex_exit(pcm);
2358 		}
2359 
2360 		/*
2361 		 * Wow! The initial bin is empty.
2362 		 * If specific color is needed, check if page color may be
2363 		 * in other bins. cpucolors is:
2364 		 *   0	if the number of colors for this cpu equals page_colors.
2365 		 *	This means that pages with a particular color are in a
2366 		 *	single bin.
2367 		 *  -1	if colors of cpus (cheetah+) are heterogeneous. Need to
2368 		 *	first determine the colors for the current cpu.
2369 		 *  >0	colors of all cpus are homogeneous and < page_colors
2370 		 */
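
		/*
		 * Worked example (values illustrative): with colors == 64
		 * and cpucolors == 16, nwaybins below becomes 4, so a
		 * PG_MATCH_COLOR request starting at bin 'b' also tries the
		 * equivalent bins (b + 16), (b + 32) and (b + 48), modulo
		 * the number of colors, before falling through.
		 */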
2371 
2372 		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
2373 			if (!nwaybins) {
2374 				/*
2375 				 * cpucolors is negative if ecache setsizes
2376 				 * are heterogeneous. Determine colors for this
2377 				 * particular cpu.
2378 				 */
2379 				if (cpucolors < 0) {
2380 					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
2381 					ASSERT(cpucolors > 0);
2382 					nwaybins = colors / cpucolors;
2383 				} else {
2384 					nwaybins = colors / cpucolors;
2385 					ASSERT(szc > 0 || nwaybins > 1);
2386 				}
2387 				if (nwaybins < 2)
2388 					cpucolors = 0;
2389 			}
2390 
2391 			if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
2392 				nwaycnt++;
2393 				bin = (bin + (colors / nwaybins)) &
2394 				    (colors - 1);
2395 				if (nwaycnt < nwaybins) {
2396 					goto try_again;
2397 				}
2398 			}
2399 			/* back to initial color if fall-thru */
2400 		}
2401 
2402 		/*
2403 		 * The color bins are all empty for a color match. Try to satisfy
2404 		 * the request by breaking up or coalescing pages from
2405 		 * a different size freelist of the correct color that
2406 		 * satisfies the ORIGINAL color requested. If that
2407 		 * fails then try pages of the same size but different
2408 		 * colors assuming we are not called with
2409 		 * PG_MATCH_COLOR.
2410 		 */
2411 		if (!fill_tried) {
2412 			fill_tried = 1;
2413 			fill_marker = bin >> nszc_color_shift;
2414 			pp = page_freelist_fill(szc, bin, mnode, mtype,
2415 			    PFNNULL);
2416 			if (pp != NULL) {
2417 				return (pp);
2418 			}
2419 		}
2420 
2421 		if (flags & PG_MATCH_COLOR)
2422 			break;
2423 
2424 		/*
2425 		 * Select next color bin to try.
2426 		 */
2427 		if (szc == 0) {
2428 			/*
2429 			 * PAGESIZE page case.
2430 			 */
2431 			if (i == 0) {
2432 				bin = (bin + BIN_STEP) & page_colors_mask;
2433 				bin_marker = bin;
2434 			} else {
2435 				bin = (bin + vac_colors) & page_colors_mask;
2436 				if (bin == bin_marker) {
2437 					bin = (bin + 1) & page_colors_mask;
2438 					bin_marker = bin;
2439 				}
2440 			}
2441 		} else {
2442 			/*
2443 			 * Large page case.
2444 			 */
2445 			bin = (bin + 1) & (colors - 1);
2446 		}
2447 		/*
2448 		 * If bin advanced to the next color bin of the
2449 		 * next larger pagesize, there is a chance the fill
2450 		 * could succeed.
2451 		 */
2452 		if (fill_marker != (bin >> nszc_color_shift))
2453 			fill_tried = 0;
2454 	}
2455 
2456 	/* if allowed, cycle through additional mtypes */
2457 	MTYPE_NEXT(mnode, mtype, flags);
2458 	if (mtype >= 0)
2459 		goto big_try_again;
2460 
2461 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2462 
2463 	return (NULL);
2464 }
2465 
2466 
2467 /*
2468  * Returns the count of free pages for 'pp' with size code 'szc'.
2469  * Note: This function does not return an exact value as the page freelist
2470  * locks are not held and thus the values in the page_counters may be
2471  * changing as we walk through the data.
2472  */
2473 static int
2474 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2475 {
2476 	pgcnt_t	pgfree;
2477 	pgcnt_t cnt;
2478 	ssize_t	r = szc;	/* region size */
2479 	ssize_t	idx;
2480 	int	i;
2481 	int	full, range;
2482 
2483 	/* Make sure pagenum passed in is aligned properly */
2484 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
2485 	ASSERT(szc > 0);
2486 
2487 	/* Prevent page_counters dynamic memory from being freed */
2488 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2489 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2490 	cnt = PAGE_COUNTERS(mnode, r, idx);
2491 	pgfree = cnt << PNUM_SHIFT(r - 1);
2492 	range = FULL_REGION_CNT(szc);
2493 
2494 	/* Check for completely full region */
2495 	if (cnt == range) {
2496 		rw_exit(&page_ctrs_rwlock[mnode]);
2497 		return (pgfree);
2498 	}
2499 
2500 	while (--r > 0) {
2501 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
2502 		full = FULL_REGION_CNT(r);
2503 		for (i = 0; i < range; i++, idx++) {
2504 			cnt = PAGE_COUNTERS(mnode, r, idx);
2505 			/*
2506 			 * If cnt here is full, that means we have already
2507 			 * accounted for these pages earlier.
2508 			 */
2509 			if (cnt != full) {
2510 				pgfree += (cnt << PNUM_SHIFT(r - 1));
2511 			}
2512 		}
2513 		range *= full;
2514 	}
2515 	rw_exit(&page_ctrs_rwlock[mnode]);
2516 	return (pgfree);
2517 }
2518 
2519 /*
2520  * Called from page_geti_contig_pages to exclusively lock constituent pages
2521  * starting from 'spp' for page size code 'szc'.
2522  *
2523  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
2524  * region needs to be greater than or equal to the threshold.
2525  */
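/*
 * For example (illustrative values): with ptcpthreshold == 2 and a szc
 * region of 512 constituent pages, the trylock pass below is only
 * attempted when page_freecnt() reports at least 512 / 2 == 256 pages
 * already free within the region.
 */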
2526 static int
2527 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
2528 {
2529 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
2530 	pgcnt_t pgfree, i;
2531 	page_t *pp;
2532 
2533 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
2534 
2536 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
2537 		goto skipptcpcheck;
2538 	/*
2539 	 * check if there are sufficient free pages available before attempting
2540 	 * to trylock. Count is approximate as page counters can change.
2541 	 */
2542 	pgfree = page_freecnt(mnode, spp, szc);
2543 
2544 	/* attempt to trylock if there are sufficient already free pages */
2545 	if (pgfree < pgcnt/ptcpthreshold) {
2546 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
2547 		return (0);
2548 	}
2549 
2550 skipptcpcheck:
2551 
2552 	for (i = 0; i < pgcnt; i++) {
2553 		pp = &spp[i];
2554 		if (!page_trylock(pp, SE_EXCL)) {
2555 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
2556 			while (--i != (pgcnt_t)-1) {
2557 				pp = &spp[i];
2558 				ASSERT(PAGE_EXCL(pp));
2559 				page_unlock(pp);
2560 			}
2561 			return (0);
2562 		}
2563 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
2564 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
2565 		    !PP_ISFREE(pp)) {
2566 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
2567 			ASSERT(i == 0);
2568 			page_unlock(pp);
2569 			return (0);
2570 		}
2571 		if (PP_ISNORELOC(pp)) {
2572 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
2573 			while (i != (pgcnt_t)-1) {
2574 				pp = &spp[i];
2575 				ASSERT(PAGE_EXCL(pp));
2576 				page_unlock(pp);
2577 				i--;
2578 			}
2579 			return (0);
2580 		}
2581 	}
2582 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
2583 	return (1);
2584 }
2585 
2586 /*
2587  * Claim large page pointed to by 'pp'. 'pp' is the starting set
2588  * of 'szc' constituent pages that had been locked exclusively previously.
2589  * Will attempt to relocate constituent pages in use.
2590  */
2591 static page_t *
2592 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
2593 {
2594 	spgcnt_t pgcnt, npgs, i;
2595 	page_t *targpp, *rpp, *hpp;
2596 	page_t *replpp = NULL;
2597 	page_t *pplist = NULL;
2598 
2599 	ASSERT(pp != NULL);
2600 
2601 	pgcnt = page_get_pagecnt(szc);
2602 	while (pgcnt) {
2603 		ASSERT(PAGE_EXCL(pp));
2604 		ASSERT(!PP_ISNORELOC(pp));
2605 		if (PP_ISFREE(pp)) {
2606 			/*
2607 			 * If this is a PG_FREE_LIST page then its
2608 			 * size code can change underneath us due to
2609 			 * page promotion or demotion. As an optimization
2610 			 * use page_list_sub_pages() instead of
2611 			 * page_list_sub().
2612 			 */
2613 			if (PP_ISAGED(pp)) {
2614 				page_list_sub_pages(pp, szc);
2615 				if (pp->p_szc == szc) {
2616 					return (pp);
2617 				}
2618 				ASSERT(pp->p_szc < szc);
2619 				npgs = page_get_pagecnt(pp->p_szc);
2620 				hpp = pp;
2621 				for (i = 0; i < npgs; i++, pp++) {
2622 					pp->p_szc = szc;
2623 				}
2624 				page_list_concat(&pplist, &hpp);
2625 				pgcnt -= npgs;
2626 				continue;
2627 			}
2628 			ASSERT(!PP_ISAGED(pp));
2629 			ASSERT(pp->p_szc == 0);
2630 			page_list_sub(pp, PG_CACHE_LIST);
2631 			page_hashout(pp, NULL);
2632 			PP_SETAGED(pp);
2633 			pp->p_szc = szc;
2634 			page_list_concat(&pplist, &pp);
2635 			pp++;
2636 			pgcnt--;
2637 			continue;
2638 		}
2639 		npgs = page_get_pagecnt(pp->p_szc);
2640 
2641 		/*
2642 		 * page_create_wait freemem accounting is done by the caller of
2643 		 * page_get_freelist, so it is not necessary to call it prior to
2644 		 * calling page_get_replacement_page.
2645 		 *
2646 		 * page_get_replacement_page can call page_get_contig_pages
2647 		 * to acquire a large page (szc > 0); the replacement must be
2648 		 * smaller than the contig page size to avoid looping or
2649 		 * szc == 0 and PGI_PGCPSZC0 is set.
2650 		 */
2651 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
2652 			replpp = page_get_replacement_page(pp, NULL, 0);
2653 			if (replpp) {
2654 				npgs = page_get_pagecnt(pp->p_szc);
2655 				ASSERT(npgs <= pgcnt);
2656 				targpp = pp;
2657 			}
2658 		}
2659 
2660 		/*
2661 		 * If replacement is NULL or do_page_relocate fails, fail
2662 		 * coalescing of pages.
2663 		 */
2664 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
2665 		    &npgs, NULL) != 0)) {
2666 			/*
2667 			 * Unlock un-processed target list
2668 			 */
2669 			while (pgcnt--) {
2670 				ASSERT(PAGE_EXCL(pp));
2671 				page_unlock(pp);
2672 				pp++;
2673 			}
2674 			/*
2675 			 * Free the processed target list.
2676 			 */
2677 			while (pplist) {
2678 				pp = pplist;
2679 				page_sub(&pplist, pp);
2680 				ASSERT(PAGE_EXCL(pp));
2681 				ASSERT(pp->p_szc == szc);
2682 				ASSERT(PP_ISFREE(pp));
2683 				ASSERT(PP_ISAGED(pp));
2684 				pp->p_szc = 0;
2685 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2686 				page_unlock(pp);
2687 			}
2688 
2689 			if (replpp != NULL)
2690 				page_free_replacement_page(replpp);
2691 
2692 			return (NULL);
2693 		}
2694 		ASSERT(pp == targpp);
2695 
2696 		/* LINTED */
2697 		ASSERT(hpp = pp); /* That's right, it's an assignment */
2698 
2699 		pp += npgs;
2700 		pgcnt -= npgs;
2701 
2702 		while (npgs--) {
2703 			ASSERT(PAGE_EXCL(targpp));
2704 			ASSERT(!PP_ISFREE(targpp));
2705 			ASSERT(!PP_ISNORELOC(targpp));
2706 			PP_SETFREE(targpp);
2707 			ASSERT(PP_ISAGED(targpp));
2708 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
2709 			    (flags & PGI_PGCPSZC0)));
2710 			targpp->p_szc = szc;
2711 			targpp = targpp->p_next;
2712 
2713 			rpp = replpp;
2714 			ASSERT(rpp != NULL);
2715 			page_sub(&replpp, rpp);
2716 			ASSERT(PAGE_EXCL(rpp));
2717 			ASSERT(!PP_ISFREE(rpp));
2718 			page_unlock(rpp);
2719 		}
2720 		ASSERT(targpp == hpp);
2721 		ASSERT(replpp == NULL);
2722 		page_list_concat(&pplist, &targpp);
2723 	}
2724 	CHK_LPG(pplist, szc);
2725 	return (pplist);
2726 }
2727 
2728 /*
2729  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
2730  * of 0 means nothing left after trim.
2731  */
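
/*
 * Worked example (pfns illustrative): for an mseg spanning pfns
 * [0x1000, 0x2000) whose first page is NORELOC but whose last page is not,
 * the cage occupies the lower part of the mseg.  If kcage_current_pfn()
 * reports 0x1400, the trimmed range becomes *lo == 0x1400,
 * *hi == MIN(pfnhi, 0x1fff) and 1 is returned.
 */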
2732 
2733 int
2734 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
2735 {
2736 	pfn_t	kcagepfn;
2737 	int	decr;
2738 	int	rc = 0;
2739 
2740 	if (PP_ISNORELOC(mseg->pages)) {
2741 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
2742 
2743 			/* lower part of this mseg inside kernel cage */
2744 			decr = kcage_current_pfn(&kcagepfn);
2745 
2746 			/* kernel cage may have transitioned past mseg */
2747 			if (kcagepfn >= mseg->pages_base &&
2748 			    kcagepfn < mseg->pages_end) {
2749 				ASSERT(decr == 0);
2750 				*lo = kcagepfn;
2751 				*hi = MIN(pfnhi,
2752 				    (mseg->pages_end - 1));
2753 				rc = 1;
2754 			}
2755 		}
2756 		/* else entire mseg in the cage */
2757 	} else {
2758 		if (PP_ISNORELOC(mseg->epages - 1)) {
2759 
2760 			/* upper part of this mseg inside kernel cage */
2761 			decr = kcage_current_pfn(&kcagepfn);
2762 
2763 			/* kernel cage may have transitioned past mseg */
2764 			if (kcagepfn >= mseg->pages_base &&
2765 			    kcagepfn < mseg->pages_end) {
2766 				ASSERT(decr);
2767 				*hi = kcagepfn;
2768 				*lo = MAX(pfnlo, mseg->pages_base);
2769 				rc = 1;
2770 			}
2771 		} else {
2772 			/* entire mseg outside of kernel cage */
2773 			*lo = MAX(pfnlo, mseg->pages_base);
2774 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
2775 			rc = 1;
2776 		}
2777 	}
2778 	return (rc);
2779 }
2780 
2781 /*
2782  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a
2783  * page with size code 'szc'. Claiming such a page requires acquiring
2784  * exclusive locks on all constituent pages (page_trylock_contig_pages),
2785  * relocating pages in use and concatenating these constituent pages into a
2786  * large page.
2787  *
2788  * The page lists do not have such a large page and page_freelist_fill has
2789  * already failed to demote larger pages and/or coalesce smaller free pages.
2790  *
2791  * 'flags' may specify PG_MATCH_COLOR, which limits the search to large
2792  * pages with the same color as 'bin'.
2793  *
2794  * 'pfnflag' specifies the subset of the pfn range to search.
2795  */
2796 
2797 
2798 static page_t *
2799 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
2800     pfn_t pfnlo, pfn_t pfnhi, int pfnflag)
2801 {
2802 	struct memseg *mseg;
2803 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
2804 	pgcnt_t szcpgmask = szcpgcnt - 1;
2805 	pfn_t	randpfn;
2806 	page_t *pp, *randpp, *endpp;
2807 	uint_t colors;
2808 	pfn_t hi, lo;
2809 	uint_t skip;
2810 
2811 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
2812 
2813 	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
2814 		return (NULL);
2815 
2816 	ASSERT(szc < mmu_page_sizes);
2817 
2818 	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
2819 	    page_colors;
2820 
2821 	ASSERT(bin < colors);
2822 
2823 	/*
2824 	 * trim the pfn range to search based on pfnflag. pfnflag is set
2825 	 * when there have been previous page_get_contig_page failures to
2826 	 * limit the search.
2827 	 *
2828 	 * The high bit in pfnflag specifies the number of 'slots' in the
2829 	 * pfn range and the remainder of pfnflag specifies which slot.
2830 	 * For example, a value of 1010b selects slot 2 of a pfn range that
2831 	 * has been divided into 8 slots.
2832 	 */
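	/*
	 * Worked example (values illustrative): pfnflag == 1010b (10)
	 * gives slots = 1 << (highbit(10) - 1) == 8 and
	 * slotid = 10 & 7 == 2, so after pfnlo/pfnhi are aligned to
	 * szcpgcnt the search is limited to slot 2 of the 8 slots:
	 * pfnlo is advanced by ((2 * slotlen) % szcpages) * szcpgcnt
	 * and pfnhi is clamped to pfnlo + (slotlen * szcpgcnt).
	 */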
2833 	if (pfnflag > 1) {
2834 		int	slots = 1 << (highbit(pfnflag) - 1);
2835 		int	slotid = pfnflag & (slots - 1);
2836 		pgcnt_t	szcpages;
2837 		int	slotlen;
2838 
2839 		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
2840 		pfnhi = pfnhi & ~(szcpgcnt - 1);
2841 
2842 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
2843 		slotlen = howmany(szcpages, slots);
2844 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
2845 		ASSERT(pfnlo < pfnhi);
2846 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
2847 			pfnhi = pfnlo + (slotlen * szcpgcnt);
2848 	}
2849 
2850 	memsegs_lock(0);
2851 
2852 	/*
2853 	 * loop through memsegs to look for contig page candidates
2854 	 */
2855 
2856 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
2857 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
2858 			/* no overlap */
2859 			continue;
2860 		}
2861 
2862 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
2863 			/* mseg too small */
2864 			continue;
2865 
2866 		/* trim off kernel cage pages from pfn range */
2867 		if (kcage_on) {
2868 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
2869 				continue;
2870 		} else {
2871 			lo = MAX(pfnlo, mseg->pages_base);
2872 			hi = MIN(pfnhi, (mseg->pages_end - 1));
2873 		}
2874 
2875 		/* round to szcpgcnt boundaries */
2876 		lo = P2ROUNDUP(lo, szcpgcnt);
2877 		hi = hi & ~(szcpgcnt - 1);
2878 
2879 		if (hi <= lo)
2880 			continue;
2881 
2882 		/*
2883 		 * set lo to point to the pfn for the desired bin. Large
2884 		 * page sizes may only have a single page color
2885 		 */
2886 		if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
2887 			uint_t	lobin;
2888 
2889 			/*
2890 			 * factor in colorequiv to check additional
2891 			 * 'equivalent' bins.
2892 			 */
2893 			if (colorequiv > 1 && colors > colorequiv)
2894 				colors = colors / colorequiv;
2895 
2896 			/* determine bin that lo currently points to */
2897 			lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt;
2898 
2899 			/*
2900 			 * set lo to point at appropriate color and set skip
2901 			 * to arrive at the next szc page of the same color.
2902 			 */
2903 			lo += ((bin - lobin) & (colors - 1)) * szcpgcnt;
2904 
2905 			skip = colors * szcpgcnt;
2906 		} else {
2907 			/* check all pages starting from lo */
2908 			skip = szcpgcnt;
2909 		}
2910 		if (hi <= lo)
2911 			/* mseg cannot satisfy color request */
2912 			continue;
2913 
2914 		/* randomly choose a point between lo and hi to begin search */
2915 
2916 		randpfn = (pfn_t)GETTICK();
2917 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
2918 		randpp = mseg->pages + (randpfn - mseg->pages_base);
2919 
2920 		ASSERT(randpp->p_pagenum == randpfn);
2921 
2922 		pp = randpp;
2923 		endpp =  mseg->pages + (hi - mseg->pages_base);
2924 
2925 		ASSERT(randpp + szcpgcnt <= endpp);
2926 
2927 		do {
2928 			ASSERT(!(pp->p_pagenum & szcpgmask));
2929 			ASSERT((flags & PG_MATCH_COLOR) == 0 ||
2930 			    colorequiv > 1 ||
2931 			    PP_2_BIN(pp) == bin);
2932 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
2933 				/* pages unlocked by page_claim on failure */
2934 				if (page_claim_contig_pages(pp, szc, flags)) {
2935 					memsegs_unlock(0);
2936 					return (pp);
2937 				}
2938 			}
2939 
2940 			pp += skip;
2941 			if (pp >= endpp) {
2942 				/* start from the beginning */
2943 				pp = mseg->pages + (lo - mseg->pages_base);
2944 				ASSERT(pp->p_pagenum == lo);
2945 				ASSERT(pp + szcpgcnt <= endpp);
2946 			}
2947 		} while (pp != randpp);
2948 	}
2949 	memsegs_unlock(0);
2950 	return (NULL);
2951 }
2952 
2953 
2954 /*
2955  * controlling routine that searches through physical memory in an attempt to
2956  * claim a large page, based on the input parameters, that is not currently
2957  * available on the page free lists.
2958  *
2959  * calls page_geti_contig_pages with an initial pfn range from the mnode
2960  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
2961  * that overlaps with the kernel cage or does not match the requested page
2962  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
2963  * page_geti_contig_pages may further limit the search range based on
2964  * previous failure counts (pgcpfailcnt[]).
2965  *
2966  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
2967  * pagesize page that satisfies mtype.
2968  */
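/*
 * Worked example of the search limiting (values illustrative): if
 * pgcpfailcnt[szc] has grown to 5 (101b), pfnflag == 5 limits
 * page_geti_contig_pages to slot 1 of 4 slots, i.e. a quarter of the pfn
 * range.  On success pgcpfailcnt[szc] is halved to 2 (10b), so the next
 * search covers half of the range ("double the search size").
 */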
2969 page_t *
2970 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
2971     uint_t flags)
2972 {
2973 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
2974 	page_t		*pp;
2975 	int		pfnflag = 0;	/* no limit on search if 0 */
2976 
2977 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
2978 
2979 	/* LINTED */
2980 	MTYPE_START(mnode, mtype, flags);
2981 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2982 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
2983 		return (NULL);
2984 	}
2985 
2986 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
2987 
2988 	/* no allocations from cage */
2989 	flags |= PGI_NOCAGE;
2990 
2991 	/* do not limit search and ignore color if hi pri */
2992 
2993 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
2994 		pfnflag = pgcpfailcnt[szc];
2995 
2996 	/* remove color match to improve chances */
2997 
2998 	if (flags & PGI_PGCPHIPRI || pfnflag)
2999 		flags &= ~PG_MATCH_COLOR;
3000 
3001 	do {
3002 		/* get pfn range based on mnode and mtype */
3003 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3004 
3005 		ASSERT(pfnhi >= pfnlo);
3006 
3007 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3008 		    pfnlo, pfnhi, pfnflag);
3009 
3010 		if (pp != NULL) {
3011 			pfnflag = pgcpfailcnt[szc];
3012 			if (pfnflag) {
3013 				/* double the search size */
3014 				pgcpfailcnt[szc] = pfnflag >> 1;
3015 			}
3016 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3017 			return (pp);
3018 		}
3019 		MTYPE_NEXT(mnode, mtype, flags);
3020 	} while (mtype >= 0);
3021 
3022 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3023 	return (NULL);
3024 }
3025 
3026 
3027 /*
3028  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3029  *
3030  * Does its own locking and accounting.
3031  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3032  * pages of the proper color even if there are pages of a different color.
3033  *
3034  * Finds a page, removes it, THEN locks it.
3035  */
3036 
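/*
 * An illustrative call sequence from a hypothetical caller (a segment driver
 * allocating one base page for (vp, off) mapped at vaddr, with its own flags
 * and lgrp): try the freelist first and fall back to the cachelist, as the
 * deferral comment inside this function describes:
 *
 *	pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, flags, lgrp);
 *	if (pp == NULL)
 *		pp = page_get_cachelist(vp, off, seg, vaddr, flags, lgrp);
 */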
3037 /*ARGSUSED*/
3038 page_t *
3039 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3040 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3041 {
3042 	struct as	*as = seg->s_as;
3043 	page_t		*pp = NULL;
3044 	ulong_t		bin;
3045 	uchar_t		szc;
3046 	int		mnode;
3047 	int		mtype;
3048 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3049 	lgrp_mnode_cookie_t	lgrp_cookie;
3050 
3051 	page_get_func = page_get_mnode_freelist;
3052 
3053 	/*
3054 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3055 	 * assume we wish to allocate near to the current thread's home.
3056 	 */
3057 	if (!LGRP_EXISTS(lgrp))
3058 		lgrp = lgrp_home_lgrp();
3059 
3060 	if (kcage_on) {
3061 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3062 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3063 		    curthread != kcage_cageout_thread) {
3064 			/*
3065 			 * Set a "reserve" of kcage_throttlefree pages for
3066 			 * PG_PANIC and cageout thread allocations.
3067 			 *
3068 			 * Everybody else has to serialize in
3069 			 * page_create_get_something() to get a cage page, so
3070 			 * that we don't deadlock cageout!
3071 			 */
3072 			return (NULL);
3073 		}
3074 	} else {
3075 		flags &= ~PG_NORELOC;
3076 		flags |= PGI_NOCAGE;
3077 	}
3078 
3079 	/* LINTED */
3080 	MTYPE_INIT(mtype, vp, vaddr, flags);
3081 
3082 	/*
3083 	 * Convert size to page size code.
3084 	 */
3085 	if ((szc = page_szc(size)) == (uchar_t)-1)
3086 		panic("page_get_freelist: illegal page size request");
3087 	ASSERT(szc < mmu_page_sizes);
3088 
3089 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3090 
3091 	/* LINTED */
3092 	AS_2_BIN(as, seg, vp, vaddr, bin);
3093 
3094 	/* bin is for base pagesize color - convert if larger pagesize. */
3095 	if (szc)
3096 		bin = page_convert_color(0, szc, bin);
3097 
3098 	/*
3099 	 * Try to get a local page first, but try remote if we can't
3100 	 * get a page of the right color.
3101 	 */
3102 pgretry:
3103 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3104 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3105 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3106 		if (pp != NULL) {
3107 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3108 			DTRACE_PROBE4(page__get,
3109 			    lgrp_t *, lgrp,
3110 			    int, mnode,
3111 			    ulong_t, bin,
3112 			    uint_t, flags);
3113 			return (pp);
3114 		}
3115 	}
3116 	ASSERT(pp == NULL);
3117 
3118 	/*
3119 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3120 	 * remote free lists.  The caller is expected to call page_get_cachelist,
3121 	 * which will check local cache lists and remote free lists.
3122 	 */
3123 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3124 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3125 		return (NULL);
3126 	}
3127 
3128 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3129 
3130 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3131 
3132 	/*
3133 	 * Try to get a non-local freelist page.
3134 	 */
3135 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3136 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3137 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3138 		if (pp != NULL) {
3139 			DTRACE_PROBE4(page__get,
3140 			    lgrp_t *, lgrp,
3141 			    int, mnode,
3142 			    ulong_t, bin,
3143 			    uint_t, flags);
3144 			VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3145 			return (pp);
3146 		}
3147 	}
3148 
3149 	ASSERT(pp == NULL);
3150 
3151 	/*
3152 	 * When the cage is off, chances are page_get_contig_pages() will fail
3153 	 * to lock a large page chunk, so by default it is not called in that
3154 	 * case.  This can be changed via /etc/system.
3155 	 *
3156 	 * page_get_contig_pages() is also called to acquire a base pagesize
3157 	 * page for page_create_get_something().
3158 	 */
3159 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3160 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3161 	    (page_get_func != page_get_contig_pages)) {
3162 
3163 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3164 		page_get_func = page_get_contig_pages;
3165 		goto pgretry;
3166 	}
3167 
3168 	if (pgcplimitsearch && page_get_func == page_get_contig_pages)
3169 		pgcpfailcnt[szc]++;
3170 
3171 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3172 	return (NULL);
3173 }
3174 
3175 /*
3176  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3177  *
3178  * Does its own locking.
3179  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3180  * pages of the proper color even if there are pages of a different color.
3181  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3182  * try to lock one of them.  If no page can be locked, try the
3183  * next bin.  Return NULL if a page cannot be found and locked.
3184  *
3185  * Finds a page, tries to lock it, then removes it.
3186  */
3187 
3188 /*ARGSUSED*/
3189 page_t *
3190 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3191     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3192 {
3193 	page_t		*pp;
3194 	struct as	*as = seg->s_as;
3195 	ulong_t		bin;
3196 	/*LINTED*/
3197 	int		mnode;
3198 	int		mtype;
3199 	lgrp_mnode_cookie_t	lgrp_cookie;
3200 
3201 	/*
3202 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3203 	 * assume we wish to allocate near to the current thread's home.
3204 	 */
3205 	if (!LGRP_EXISTS(lgrp))
3206 		lgrp = lgrp_home_lgrp();
3207 
3208 	if (!kcage_on) {
3209 		flags &= ~PG_NORELOC;
3210 		flags |= PGI_NOCAGE;
3211 	}
3212 
3213 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3214 	    kcage_freemem <= kcage_throttlefree) {
3215 		/*
3216 		 * Reserve kcage_throttlefree pages for critical kernel
3217 		 * threads.
3218 		 *
3219 		 * Everybody else has to go to page_create_get_something()
3220 		 * to get a cage page, so we don't deadlock cageout.
3221 		 */
3222 		return (NULL);
3223 	}
3224 
3225 	/* LINTED */
3226 	AS_2_BIN(as, seg, vp, vaddr, bin);
3227 
3228 	ASSERT(bin <= page_colors_mask);
3229 
3230 	/* LINTED */
3231 	MTYPE_INIT(mtype, vp, vaddr, flags);
3232 
3233 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3234 
3235 	/*
3236 	 * Try local cachelists first
3237 	 */
3238 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3239 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3240 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3241 		if (pp != NULL) {
3242 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3243 			DTRACE_PROBE4(page__get,
3244 			    lgrp_t *, lgrp,
3245 			    int, mnode,
3246 			    ulong_t, bin,
3247 			    uint_t, flags);
3248 			return (pp);
3249 		}
3250 	}
3251 
3252 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3253 
3254 	/*
3255 	 * Try freelists/cachelists that are farther away
3256 	 * This is our only chance to allocate remote pages for PAGESIZE
3257 	 * requests.
3258 	 */
3259 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3260 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3261 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3262 		    0, flags);
3263 		if (pp != NULL) {
3264 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3265 			DTRACE_PROBE4(page__get,
3266 			    lgrp_t *, lgrp,
3267 			    int, mnode,
3268 			    ulong_t, bin,
3269 			    uint_t, flags);
3270 			return (pp);
3271 		}
3272 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3273 		if (pp != NULL) {
3274 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3275 			DTRACE_PROBE4(page__get,
3276 			    lgrp_t *, lgrp,
3277 			    int, mnode,
3278 			    ulong_t, bin,
3279 			    uint_t, flags);
3280 			return (pp);
3281 		}
3282 	}
3283 
3284 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3285 	return (NULL);
3286 }
3287 
3288 page_t *
3289 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3290 {
3291 	kmutex_t	*pcm;
3292 	int		i;
3293 	page_t		*pp;
3294 	page_t		*first_pp;
3295 	uint_t		bin_marker;
3296 	int		nwaybins, nwaycnt;
3297 	int		cpucolors;
3298 
3299 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3300 
3301 	/* LINTED */
3302 	MTYPE_START(mnode, mtype, flags);
3303 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3304 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3305 		return (NULL);
3306 	}
3307 
3308 	nwaybins = 0;
3309 	cpucolors = cpu_page_colors;
3310 	/*
3311 	 * adjust cpucolors to possibly check additional 'equivalent' bins
3312 	 * to try to minimize fragmentation of large pages by delaying calls
3313 	 * to page_freelist_fill.
3314 	 */
3315 	if (colorequiv > 1) {
3316 		int equivcolors = page_colors / colorequiv;
3317 
3318 		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
3319 			cpucolors = equivcolors;
3320 	}
3321 
3322 	/*
3323 	 * Only hold one cachelist lock at a time, that way we
3324 	 * can start anywhere and not have to worry about lock
3325 	 * ordering.
3326 	 */
3327 
3328 big_try_again:
3329 	nwaycnt = 0;
3330 	for (i = 0; i <= page_colors; i++) {
3331 		if (PAGE_CACHELISTS(mnode, bin, mtype)) {
3332 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3333 			mutex_enter(pcm);
3334 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3335 			if (pp != NULL) {
3336 				first_pp = pp;
3337 				ASSERT(pp->p_vnode);
3338 				ASSERT(PP_ISAGED(pp) == 0);
3339 				ASSERT(pp->p_szc == 0);
3340 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3341 				while (!page_trylock(pp, SE_EXCL)) {
3342 					pp = pp->p_next;
3343 					ASSERT(pp->p_szc == 0);
3344 					if (pp == first_pp) {
3345 						/*
3346 						 * We have searched the
3347 						 * complete list!
3348 						 * And all of them (might
3349 						 * only be one) are locked.
3350 						 * This can happen since
3351 						 * these pages can also be
3352 						 * found via the hash list.
3353 						 * When found via the hash
3354 						 * list, they are locked
3355 						 * first, then removed.
3356 						 * We give up to let the
3357 						 * other thread run.
3358 						 */
3359 						pp = NULL;
3360 						break;
3361 					}
3362 					ASSERT(pp->p_vnode);
3363 					ASSERT(PP_ISFREE(pp));
3364 					ASSERT(PP_ISAGED(pp) == 0);
3365 					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3366 							mnode);
3367 				}
3368 
3369 				if (pp) {
3370 					page_t	**ppp;
3371 					/*
3372 					 * Found and locked a page.
3373 					 * Pull it off the list.
3374 					 */
3375 					ASSERT(mtype == PP_2_MTYPE(pp));
3376 					ppp = &PAGE_CACHELISTS(mnode, bin,
3377 					    mtype);
3378 					page_sub(ppp, pp);
3379 					/*
3380 					 * Subtract counters before releasing
3381 					 * pcm mutex to avoid a race with
3382 					 * page_freelist_coalesce and
3383 					 * page_freelist_fill.
3384 					 */
3385 					page_ctr_sub(mnode, mtype, pp,
3386 					    PG_CACHE_LIST);
3387 					mutex_exit(pcm);
3388 					ASSERT(pp->p_vnode);
3389 					ASSERT(PP_ISAGED(pp) == 0);
3390 #if defined(__sparc)
3391 					ASSERT(!kcage_on ||
3392 					    (flags & PG_NORELOC) == 0 ||
3393 					    PP_ISNORELOC(pp));
3394 					if (PP_ISNORELOC(pp)) {
3395 						kcage_freemem_sub(1);
3396 					}
3397 #endif
3398 					VM_STAT_ADD(vmm_vmstats.
3399 					    pgmc_allocok);
3400 					return (pp);
3401 				}
3402 			}
3403 			mutex_exit(pcm);
3404 		}
3405 
3406 		/*
3407 		 * Wow! The initial bin is empty or no page in the bin could
3408 		 * be locked.
3409 		 *
3410 		 * If specific color is needed, check if page color may be in
3411 		 * other bins.
3412 		 */
3413 		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
3414 			if (!nwaybins) {
3415 				if (cpucolors < 0) {
3416 					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
3417 					ASSERT(cpucolors > 0);
3418 					nwaybins = page_colors / cpucolors;
3419 					if (nwaybins < 2)
3420 						cpucolors = 0;
3421 				} else {
3422 					nwaybins = page_colors / cpucolors;
3423 					ASSERT(nwaybins > 1);
3424 				}
3425 			}
3426 
3427 			if (++nwaycnt >= nwaybins) {
3428 				break;
3429 			}
3430 			bin = (bin + (page_colors / nwaybins)) &
3431 			    page_colors_mask;
3432 			continue;
3433 		}
3434 
3435 		if (i == 0) {
3436 			bin = (bin + BIN_STEP) & page_colors_mask;
3437 			bin_marker = bin;
3438 		} else {
3439 			bin = (bin + vac_colors) & page_colors_mask;
3440 			if (bin == bin_marker) {
3441 				bin = (bin + 1) & page_colors_mask;
3442 				bin_marker = bin;
3443 			}
3444 		}
3445 	}
3446 
3447 	MTYPE_NEXT(mnode, mtype, flags);
3448 	if (mtype >= 0)
3449 		goto big_try_again;
3450 
3451 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3452 	return (NULL);
3453 }
3454 
3455 #ifdef DEBUG
3456 #define	REPL_PAGE_STATS
3457 #endif /* DEBUG */
3458 
3459 #ifdef REPL_PAGE_STATS
3460 struct repl_page_stats {
3461 	uint_t	ngets;
3462 	uint_t	ngets_noreloc;
3463 	uint_t	npgr_noreloc;
3464 	uint_t	nnopage_first;
3465 	uint_t	nnopage;
3466 	uint_t	nhashout;
3467 	uint_t	nnofree;
3468 	uint_t	nnext_pp;
3469 } repl_page_stats;
3470 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
3471 #else /* REPL_PAGE_STATS */
3472 #define	REPL_STAT_INCR(v)
3473 #endif /* REPL_PAGE_STATS */
3474 
3475 int	pgrppgcp;
3476 
3477 /*
3478  * The freemem accounting must be done by the caller.
3479  * First we try to get a replacement page of the same size as like_pp;
3480  * if that is not possible, then we just get a set of discontiguous
3481  * PAGESIZE pages.
3482  */
3483 page_t *
3484 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3485     uint_t pgrflags)
3486 {
3487 	page_t		*like_pp;
3488 	page_t		*pp, *pplist;
3489 	page_t		*pl = NULL;
3490 	ulong_t		bin;
3491 	int		mnode, page_mnode;
3492 	int		szc;
3493 	spgcnt_t	npgs, pg_cnt;
3494 	pfn_t		pfnum;
3495 	int		mtype;
3496 	int		flags = 0;
3497 	lgrp_mnode_cookie_t	lgrp_cookie;
3498 	lgrp_t		*lgrp;
3499 
3500 	REPL_STAT_INCR(ngets);
3501 	like_pp = orig_like_pp;
3502 	ASSERT(PAGE_EXCL(like_pp));
3503 
3504 	szc = like_pp->p_szc;
3505 	npgs = page_get_pagecnt(szc);
3506 	/*
3507 	 * Now we reset like_pp to the base page_t.
3508 	 * That way, we won't walk past the end of this 'szc' page.
3509 	 */
3510 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
3511 	like_pp = page_numtopp_nolock(pfnum);
3512 	ASSERT(like_pp->p_szc == szc);
3513 
3514 	if (PP_ISNORELOC(like_pp)) {
3515 		ASSERT(kcage_on);
3516 		REPL_STAT_INCR(ngets_noreloc);
3517 		flags = PGI_RELOCONLY;
3518 	} else if (pgrflags & PGR_NORELOC) {
3519 		ASSERT(kcage_on);
3520 		REPL_STAT_INCR(npgr_noreloc);
3521 		flags = PG_NORELOC;
3522 	}
3523 
3524 	/*
3525 	 * Kernel pages must always be replaced with the same size
3526 	 * pages, since we cannot properly handle demotion of kernel
3527 	 * pages.
3528 	 */
3529 	if (like_pp->p_vnode == &kvp)
3530 		pgrflags |= PGR_SAMESZC;
3531 
3532 	/* LINTED */
3533 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode);
3534 
3535 	while (npgs) {
3536 		pplist = NULL;
3537 		for (;;) {
3538 			pg_cnt = page_get_pagecnt(szc);
3539 			bin = PP_2_BIN(like_pp);
3540 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
3541 			ASSERT(pg_cnt <= npgs);
3542 
3543 			/*
3544 			 * If an lgroup was specified, try to get the
3545 			 * page from that lgroup.
3546 			 * NOTE: Must be careful with code below because
3547 			 *	 lgroup may disappear and reappear since there
3548 			 *	 is no locking for lgroup here.
3549 			 */
3550 			if (LGRP_EXISTS(lgrp_target)) {
3551 				/*
3552 				 * Keep local variable for lgroup separate
3553 				 * from lgroup argument since this code should
3554 				 * only be exercised when lgroup argument
3555 				 * exists....
3556 				 */
3557 				lgrp = lgrp_target;
3558 
3559 				/* Try the lgroup's freelists first */
3560 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3561 				    LGRP_SRCH_LOCAL);
3562 				while ((pplist == NULL) &&
3563 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3564 				    != -1) {
3565 					pplist = page_get_mnode_freelist(
3566 						mnode, bin, mtype, szc,
3567 						    flags);
3568 				}
3569 
3570 				/*
3571 				 * Now try its cachelists if this is a
3572 				 * small page. Don't need to do it for
3573 				 * larger ones since page_freelist_coalesce()
3574 				 * already failed.
3575 				 */
3576 				if (pplist != NULL || szc != 0)
3577 					break;
3578 
3579 				/* Now try its cachelists */
3580 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3581 				    LGRP_SRCH_LOCAL);
3582 
3583 				while ((pplist == NULL) &&
3584 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3585 				    != -1) {
3586 					pplist = page_get_mnode_cachelist(
3587 						bin, flags, mnode, mtype);
3588 				}
3589 				if (pplist != NULL) {
3590 					page_hashout(pplist, NULL);
3591 					PP_SETAGED(pplist);
3592 					REPL_STAT_INCR(nhashout);
3593 					break;
3594 				}
3595 				/* Done looking in this lgroup. Bail out. */
3596 				break;
3597 			}
3598 
3599 			/*
3600 			 * No lgroup was specified (or the lgroup was removed by
3601 			 * DR), so just try to get the page as close to
3602 			 * like_pp's mnode as possible.
3603 			 * First try the local freelist...
3604 			 */
3605 			mnode = PP_2_MEM_NODE(like_pp);
3606 			pplist = page_get_mnode_freelist(mnode, bin,
3607 			    mtype, szc, flags);
3608 			if (pplist != NULL)
3609 				break;
3610 
3611 			REPL_STAT_INCR(nnofree);
3612 
3613 			/*
3614 			 * ...then the local cachelist. Don't need to do it for
3615 			 * larger pages because page_freelist_coalesce() already
3616 			 * failed there anyway.
3617 			 */
3618 			if (szc == 0) {
3619 				pplist = page_get_mnode_cachelist(bin, flags,
3620 				    mnode, mtype);
3621 				if (pplist != NULL) {
3622 					page_hashout(pplist, NULL);
3623 					PP_SETAGED(pplist);
3624 					REPL_STAT_INCR(nhashout);
3625 					break;
3626 				}
3627 			}
3628 
3629 			/* Now try remote freelists */
3630 			page_mnode = mnode;
3631 			lgrp =
3632 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
3633 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3634 			    LGRP_SRCH_HIER);
3635 			while (pplist == NULL &&
3636 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
3637 			    != -1) {
3638 				/*
3639 				 * Skip local mnode.
3640 				 */
3641 				if ((mnode == page_mnode) ||
3642 				    (mem_node_config[mnode].exists == 0))
3643 					continue;
3644 
3645 				pplist = page_get_mnode_freelist(mnode,
3646 				    bin, mtype, szc, flags);
3647 			}
3648 
3649 			if (pplist != NULL)
3650 				break;
3651 
3652 
3653 			/* Now try remote cachelists */
3654 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3655 			    LGRP_SRCH_HIER);
3656 			while (pplist == NULL && szc == 0) {
3657 				mnode = lgrp_memnode_choose(&lgrp_cookie);
3658 				if (mnode == -1)
3659 					break;
3660 				/*
3661 				 * Skip local mnode.
3662 				 */
3663 				if ((mnode == page_mnode) ||
3664 				    (mem_node_config[mnode].exists == 0))
3665 					continue;
3666 
3667 				pplist = page_get_mnode_cachelist(bin,
3668 				    flags, mnode, mtype);
3669 
3670 				if (pplist != NULL) {
3671 					page_hashout(pplist, NULL);
3672 					PP_SETAGED(pplist);
3673 					REPL_STAT_INCR(nhashout);
3674 					break;
3675 				}
3676 			}
3677 
3678 			/*
3679 			 * Break out of while loop under the following cases:
3680 			 * - If we successfully got a page.
3681 			 * - If pgrflags specified only returning a specific
3682 			 *   page size and we could not find that page size.
3683 			 * - If we could not satisfy the request with PAGESIZE
3684 			 *   or larger pages.
3685 			 */
3686 			if (pplist != NULL || szc == 0)
3687 				break;
3688 
3689 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
3690 				/* try to find contig page */
3691 
3692 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
3693 				    LGRP_SRCH_HIER);
3694 
3695 				while ((pplist == NULL) &&
3696 				    (mnode =
3697 					lgrp_memnode_choose(&lgrp_cookie))
3698 				    != -1) {
3699 					pplist = page_get_contig_pages(
3700 						mnode, bin, mtype, szc,
3701 						    flags | PGI_PGCPHIPRI);
3702 				}
3703 				break;
3704 			}
3705 
3706 			/*
3707 			 * The correct thing to do here is try the next
3708 			 * page size down using szc--. Due to a bug
3709 			 * with the processing of HAT_RELOAD_SHARE
3710 			 * where the sfmmu_ttecnt arrays of all
3711 			 * hats sharing an ISM segment don't get updated,
3712 			 * using intermediate size pages for relocation
3713 			 * can lead to continuous page faults.
3714 			 */
3715 			szc = 0;
3716 		}
3717 
3718 		if (pplist != NULL) {
3719 			DTRACE_PROBE4(page__get,
3720 			    lgrp_t *, lgrp,
3721 			    int, mnode,
3722 			    ulong_t, bin,
3723 			    uint_t, flags);
3724 
3725 			while (pplist != NULL && pg_cnt--) {
3726 				ASSERT(pplist != NULL);
3727 				pp = pplist;
3728 				page_sub(&pplist, pp);
3729 				PP_CLRFREE(pp);
3730 				PP_CLRAGED(pp);
3731 				page_list_concat(&pl, &pp);
3732 				npgs--;
3733 				like_pp = like_pp + 1;
3734 				REPL_STAT_INCR(nnext_pp);
3735 			}
3736 			ASSERT(pg_cnt == 0);
3737 		} else {
3738 			break;
3739 		}
3740 	}
3741 
3742 	if (npgs) {
3743 		/*
3744 		 * We were unable to allocate the necessary number
3745 		 * of pages.  Free any pages already collected on
3746 		 * the pl list.
3747 		 */
3748 		REPL_STAT_INCR(nnopage);
3749 		page_free_replacement_page(pl);
3750 		return (NULL);
3751 	} else {
3752 		return (pl);
3753 	}
3754 }
3755 
3756 /*
3757  * demote a free large page to its constituent pages
3758  */
3759 void
3760 page_demote_free_pages(page_t *pp)
3761 {
3762 
3763 	int mnode;
3764 
3765 	ASSERT(pp != NULL);
3766 	ASSERT(PAGE_LOCKED(pp));
3767 	ASSERT(PP_ISFREE(pp));
3768 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
3769 
3770 	mnode = PP_2_MEM_NODE(pp);
3771 	page_freelist_lock(mnode);
3772 	if (pp->p_szc != 0) {
3773 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
3774 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
3775 	}
3776 	page_freelist_unlock(mnode);
3777 	ASSERT(pp->p_szc == 0);
3778 }
3779