xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 1f7ad2e1275fff503991bf4b43bc5cf1d815669f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * This file contains common functions to access and manage the page lists.
38  * Many of these routines originated in platform-dependent modules
39  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function
40  * in a platform-independent manner.
41  *
42  * vm/vm_dep.h provides for platform specific support.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/atomic.h>
50 #include <sys/sysmacros.h>
51 #include <vm/as.h>
52 #include <vm/page.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/seg_vn.h>
55 #include <sys/memnode.h>
56 #include <vm/vm_dep.h>
57 #include <sys/lgrp.h>
58 #include <sys/mem_config.h>
59 #include <sys/callb.h>
60 #include <sys/mem_cage.h>
61 #include <sys/sdt.h>
62 
63 extern uint_t	vac_colors;
64 
65 #define	MAX_PRAGMA_ALIGN	128
66 
67 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
68 
69 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
70 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
71 #else
72 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
73 #endif
74 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
75 
76 /*
77  * number of page colors equivalent to the requested color in page_get routines.
78  * If set, keeps large pages intact longer and favors MPO allocation from the
79  * local mnode over acquiring the 'correct' page color from a demoted large
80  * page or from a remote mnode.
81  */
82 uint_t	colorequiv;
83 
84 /*
85  * color equivalency mask for each page size.
86  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
87  * The high 4 bits determine the number of high order bits of the color to
88  * ignore.  The low 4 bits determine the number of low order bits of the
89  * color to ignore (only relevant for hashed index based page coloring).
90  */
91 uchar_t colorequivszc[MMU_PAGE_SIZES];
92 
93 /*
94  * If set, specifies the percentage of pages within a large page region that
95  * must be free before attempting to lock those pages for
96  * page_get_contig_pages processing.
97  *
98  * Should be turned on when kpr is available, so that
99  * page_trylock_contig_pages can be more selective.
100  */
101 
102 int	ptcpthreshold;
103 
104 /*
105  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
106  * Enabled by default via pgcplimitsearch.
107  *
108  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
109  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
110  * bound. This upper bound range guarantees:
111  *    - all large page 'slots' will be searched over time
112  *    - at least one large page candidate is considered on each pgcp call
113  *    - count doesn't wrap around to 0
114  */
115 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
116 int	pgcplimitsearch = 1;
117 
118 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
119 #define	SETPGCPFAILCNT(szc)						\
120 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
121 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
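
/*
 * Illustrative arithmetic for the bound above (hypothetical numbers, not
 * tied to any particular machine): with physinstalled = 0x300000 pages,
 * highbit(physinstalled) is 22, so PGCPFAILMAX is 1 << 21 = 0x200000 --
 * the largest power of two not exceeding physinstalled and therefore at
 * least half of it.  Once pgcpfailcnt[szc] reaches that value,
 * SETPGCPFAILCNT() folds it back to PGCPFAILMAX / 2 instead of letting
 * the count keep growing toward a wraparound.
 */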
122 
123 #ifdef VM_STATS
124 struct vmm_vmstats_str  vmm_vmstats;
125 
126 #endif /* VM_STATS */
127 
128 #if defined(__sparc)
129 #define	LPGCREATE	0
130 #else
131 /* enable page_get_contig_pages */
132 #define	LPGCREATE	1
133 #endif
134 
135 int pg_contig_disable;
136 int pg_lpgcreate_nocage = LPGCREATE;
137 
138 /*
139  * page_freelist_split pfn flag to signify no hi pfn requirement.
140  */
141 #define	PFNNULL		0
142 
143 /* Flags involved in promotion and demotion routines */
144 #define	PC_FREE		0x1	/* put page on freelist */
145 #define	PC_ALLOC	0x2	/* return page for allocation */
146 
147 /*
148  * Flag for page_demote to be used with PC_FREE to denote that we don't care
149  * what the color is as the color parameter to the function is ignored.
150  */
151 #define	PC_NO_COLOR	(-1)
152 
153 /* mtype value for page_promote to use when mtype does not matter */
154 #define	PC_MTYPE_ANY	(-1)
155 
156 /*
157  * page counters candidates info
158  * See page_ctrs_cands comment below for more details.
159  * fields are as follows:
160  *	pcc_pages_free:		# pages which freelist coalesce can create
161  *	pcc_color_free:		pointer to page free counts per color
162  */
163 typedef struct pcc_info {
164 	pgcnt_t	pcc_pages_free;
165 	pgcnt_t	*pcc_color_free;
166 } pcc_info_t;
167 
168 /*
169  * On big machines it can take a long time to check page_counters
170  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
171  * updated sum of all elements of the corresponding page_counters arrays.
172  * page_freelist_coalesce() searches page_counters only if an appropriate
173  * element of page_ctrs_cands array is greater than 0.
174  *
175  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
176  */
177 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
178 
179 /*
180  * Return in val the total number of free pages which can be created
181  * for the given mnode (m), mrange (g), and region size (r)
182  */
183 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
184 	int i;								\
185 	val = 0;							\
186 	for (i = 0; i < NPC_MUTEX; i++) {				\
187 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
188 	}								\
189 }
190 
191 /*
192  * Return in val the total number of free pages which can be created
193  * for the given mnode (m), mrange (g), region size (r), and color (c)
194  */
195 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
196 	int i;								\
197 	val = 0;							\
198 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
199 	for (i = 0; i < NPC_MUTEX; i++) {				\
200 	    val +=							\
201 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
202 	}								\
203 }
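
/*
 * Illustrative use of the two macros above (a sketch only; as noted in the
 * page_ctrs_cands comment, page_freelist_coalesce() is the real consumer).
 * A caller sums the candidate counts first and skips the expensive
 * page_counters scan when nothing can be coalesced:
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
 *	if (cands == 0)
 *		return (NULL);
 *
 * The COLOR variant is consulted instead when the caller needs a page from
 * a particular color equivalence class.
 */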
204 
205 /*
206  * We can only allow a single thread to update a counter within the physical
207  * range of the largest supported page size. That is the finest granularity
208  * possible since the counter values are dependent on each other
209  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
210  * ctr_mutex lock index for a particular physical range.
211  */
212 static kmutex_t	*ctr_mutex[NPC_MUTEX];
213 
214 #define	PP_CTR_LOCK_INDX(pp)						\
215 	(((pp)->p_pagenum >>						\
216 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
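
/*
 * Worked example (hypothetical sizes): with an 8K base page and a 4M
 * largest page, PAGE_BSZS_SHIFT(mmu_page_sizes - 1) is 9 (4M / 8K = 512 =
 * 2^9), so the lock index is
 *
 *	((pp)->p_pagenum >> 9) & (NPC_MUTEX - 1)
 *
 * and every page within the same 4M-aligned physical range maps to the
 * same ctr_mutex slot, which is exactly the granularity described above.
 */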
217 
218 #define	INVALID_COLOR 0xffffffff
219 #define	INVALID_MASK  0xffffffff
220 
221 /*
222  * Local function prototypes.
223  */
224 
225 void page_ctr_add(int, int, page_t *, int);
226 void page_ctr_add_internal(int, int, page_t *, int);
227 void page_ctr_sub(int, int, page_t *, int);
228 void page_ctr_sub_internal(int, int, page_t *, int);
229 void page_freelist_lock(int);
230 void page_freelist_unlock(int);
231 page_t *page_promote(int, pfn_t, uchar_t, int, int);
232 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
233 page_t *page_freelist_split(uchar_t,
234     uint_t, int, int, pfn_t, page_list_walker_t *);
235 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
236 static int page_trylock_cons(page_t *pp, se_t se);
237 
238 /*
239  * The page_counters array below is used to keep track of free contiguous
240  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
241  * This contains an array of counters, the size of the array, a shift value
242  * used to convert a pagenum into a counter array index or vice versa, as
243  * well as a cache of the last successful index to be promoted to a larger
244  * page size.  As an optimization, we keep track of the last successful index
245  * to be promoted per page color for the given size region, and this is
246  * allocated dynamically based upon the number of colors for a given
247  * region size.
248  *
249  * Conceptually, the page counters are represented as:
250  *
251  *	page_counters[region_size][mnode]
252  *
253  *	region_size:	size code of a candidate larger page made up
254  *			of contiguous free smaller pages.
255  *
256  *	page_counters[region_size][mnode].hpm_counters[index]:
257  *		represents how many (region_size - 1) pages either
258  *		exist or can be created within the given index range.
259  *
260  * Let's look at a sparc example:
261  *	If we want to create a free 512k page, we look at region_size 2
262  *	for the mnode we want.  We calculate the index and look at a specific
263  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
264  *	this location, it means that 8 64k pages either exist or can be created
265  *	from 8K pages in order to make a single free 512k page at the given
266  *	index.  Note that when a region is full, it will contribute to the
267  *	counts in the region above it.  Thus we will not know what page
268  *	size the free pages will be which can be promoted to this new free
269  *	page unless we look at all regions below the current region.
270  */
271 
272 /*
273  * Note: hpmctr_t is defined in platform vm_dep.h
274  * hw_page_map_t contains all the information needed for the page_counters
275  * logic. The fields are as follows:
276  *
277  *	hpm_counters:	dynamically allocated array to hold counter data
278  *	hpm_entries:	entries in hpm_counters
279  *	hpm_shift:	shift for pnum/array index conv
280  *	hpm_base:	PFN mapped to counter index 0
281  *	hpm_color_current:	last index in counter array for this color at
282  *				which we successfully created a large page
283  */
284 typedef struct hw_page_map {
285 	hpmctr_t	*hpm_counters;
286 	size_t		hpm_entries;
287 	int		hpm_shift;
288 	pfn_t		hpm_base;
289 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
290 } hw_page_map_t;
291 
292 /*
293  * Element zero is not used, but is allocated for convenience.
294  */
295 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
296 
297 /*
298  * Cached value of MNODE_RANGE_CNT(mnode).
299  * This is a function call on x86.
300  */
301 static int mnode_nranges[MAX_MEM_NODES];
302 static int mnode_maxmrange[MAX_MEM_NODES];
303 
304 /*
305  * The following macros are convenient ways to get access to the individual
306  * elements of the page_counters arrays.  They can be used on both
307  * the left side and right side of equations.
308  */
309 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
310 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
311 
312 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
313 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
314 
315 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
316 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
317 
318 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
319 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
320 
321 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
322 	(page_counters[(rg_szc)][(mnode)].hpm_base)
323 
324 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
325 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
326 
327 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
328 	(page_counters[(rg_szc)][(mnode)].				\
329 	hpm_color_current[(mrange)][(color)])
330 
331 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
332 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
333 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
334 
335 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
336 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
337 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
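
/*
 * Worked example for the two conversion macros (illustrative values only):
 * with PAGE_COUNTERS_BASE(mnode, r) == 0x80000 and
 * PAGE_COUNTERS_SHIFT(mnode, r) == 6 (64 base pages per region),
 * PNUM_TO_IDX(mnode, r, 0x80240) is (0x80240 - 0x80000) >> 6 == 9, and
 * IDX_TO_PNUM(mnode, r, 9) is 0x80000 + (9 << 6) == 0x80240.  For
 * region-aligned pfns the macros are exact inverses, which is the identity
 * the ASSERTs in page_ctrs_alloc() and page_ctrs_adjust() verify.
 */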
338 
339 /*
340  * Protects the hpm_counters and hpm_color_current memory from changing while
341  * looking at page counters information.
342  * Grab the write lock to modify what these fields point at.
343  * Grab the read lock to prevent any pointers from changing.
344  * The write lock can not be held during memory allocation due to a possible
345  * recursion deadlock with trying to grab the read lock while the
346  * write lock is already held.
347  */
348 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
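
/*
 * A minimal sketch of the protocol described above (illustrative; actual
 * readers in the freelist code follow this pattern):
 *
 *	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
 *	... walk PAGE_COUNTERS(mnode, r, idx) / hpm_color_current ...
 *	rw_exit(&page_ctrs_rwlock[mnode]);
 *
 * page_ctrs_adjust() is the writer: it preallocates all replacement arrays
 * first, then takes the lock as RW_WRITER only to swap the pointers.
 */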
349 
350 
351 /*
352  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
353  */
354 void
355 cpu_vm_data_init(struct cpu *cp)
356 {
357 	if (cp == CPU0) {
358 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
359 	} else {
360 		void	*kmptr;
361 		int	align;
362 		size_t	sz;
363 
364 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
365 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
366 		kmptr = kmem_zalloc(sz, KM_SLEEP);
367 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
368 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
369 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
370 	}
371 }
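
/*
 * Note on the sizing above (illustrative numbers): if sizeof (vm_cpu_data_t)
 * were 200 bytes and align were 64, sz would be P2ROUNDUP(200, 64) + 64 =
 * 320.  Rounding the kmem_zalloc() result up to the next align boundary
 * consumes at most align - 1 bytes, so an aligned vm_cpu_data_t always fits;
 * vc_kmptr and vc_kmsize record the raw allocation so cpu_vm_data_destroy()
 * can free it.
 */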
372 
373 /*
374  * free cpu_vm_data
375  */
376 void
377 cpu_vm_data_destroy(struct cpu *cp)
378 {
379 	if (cp->cpu_seqid && cp->cpu_vm_data) {
380 		ASSERT(cp != CPU0);
381 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
382 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
383 	}
384 	cp->cpu_vm_data = NULL;
385 }
386 
387 
388 /*
389  * page size to page size code
390  */
391 int
392 page_szc(size_t pagesize)
393 {
394 	int	i = 0;
395 
396 	while (hw_page_array[i].hp_size) {
397 		if (pagesize == hw_page_array[i].hp_size)
398 			return (i);
399 		i++;
400 	}
401 	return (-1);
402 }
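
/*
 * For example, page_szc(MMU_PAGESIZE) returns 0, since the base page size
 * is always hw_page_array[0]; a size that matches no hw_page_array entry
 * returns -1.
 */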
403 
404 /*
405  * page size to page size code with the restriction that it be a supported
406  * user page size.  If it's not a supported user page size, -1 will be returned.
407  */
408 int
409 page_szc_user_filtered(size_t pagesize)
410 {
411 	int szc = page_szc(pagesize);
412 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
413 		return (szc);
414 	}
415 	return (-1);
416 }
417 
418 /*
419  * Return how many page sizes are available for the user to use.  This is
420  * what the hardware supports and not based upon how the OS implements the
421  * support of different page sizes.
422  *
423  * If legacy is non-zero, return the number of pagesizes available to legacy
424  * applications. The number of legacy page sizes might be less than the
425  * exported user page sizes. This is to prevent legacy applications that
426  * use the largest page size returned from getpagesizes(3c) from inadvertently
427  * using the 'new' large pagesizes.
428  */
429 uint_t
430 page_num_user_pagesizes(int legacy)
431 {
432 	if (legacy)
433 		return (mmu_legacy_page_sizes);
434 	return (mmu_exported_page_sizes);
435 }
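
/*
 * Hypothetical example: on a platform exporting five page sizes where the
 * largest was added after the legacy cutoff, page_num_user_pagesizes(0)
 * would return 5 while page_num_user_pagesizes(1) returns 4, hiding the
 * newest size from legacy getpagesizes(3C) consumers as described above.
 */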
436 
437 uint_t
438 page_num_pagesizes(void)
439 {
440 	return (mmu_page_sizes);
441 }
442 
443 /*
444  * returns the number of base pagesize pages associated with szc
445  */
446 pgcnt_t
447 page_get_pagecnt(uint_t szc)
448 {
449 	if (szc >= mmu_page_sizes)
450 		panic("page_get_pagecnt: out of range %d", szc);
451 	return (hw_page_array[szc].hp_pgcnt);
452 }
453 
454 size_t
455 page_get_pagesize(uint_t szc)
456 {
457 	if (szc >= mmu_page_sizes)
458 		panic("page_get_pagesize: out of range %d", szc);
459 	return (hw_page_array[szc].hp_size);
460 }
461 
462 /*
463  * Return the size of a page based upon the index passed in.  An index of
464  * zero refers to the smallest page size in the system, and as the index
465  * increases it refers to the next larger supported page size in the system.
466  * Note that szc and userszc may not be the same due to unsupported szc's on
467  * some systems.
468  */
469 size_t
470 page_get_user_pagesize(uint_t userszc)
471 {
472 	uint_t szc = USERSZC_2_SZC(userszc);
473 
474 	if (szc >= mmu_page_sizes)
475 		panic("page_get_user_pagesize: out of range %d", szc);
476 	return (hw_page_array[szc].hp_size);
477 }
478 
479 uint_t
480 page_get_shift(uint_t szc)
481 {
482 	if (szc >= mmu_page_sizes)
483 		panic("page_get_shift: out of range %d", szc);
484 	return (PAGE_GET_SHIFT(szc));
485 }
486 
487 uint_t
488 page_get_pagecolors(uint_t szc)
489 {
490 	if (szc >= mmu_page_sizes)
491 		panic("page_get_pagecolors: out of range %d", szc);
492 	return (PAGE_GET_PAGECOLORS(szc));
493 }
494 
495 /*
496  * this assigns the desired equivalent color after a split
497  */
498 uint_t
499 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
500     uint_t ncolor, uint_t ceq_mask)
501 {
502 	ASSERT(nszc > szc);
503 	ASSERT(szc < mmu_page_sizes);
504 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
505 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
506 
507 	color &= ceq_mask;
508 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
509 	return (color | (ncolor & ~ceq_mask));
510 }
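
/*
 * In other words (an informal restatement of the code above): the bits of
 * the requested color selected by ceq_mask are honored exactly, while the
 * bits outside ceq_mask are inherited from the parent page's color after
 * conversion into the smaller size code's color space.
 */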
511 
512 /*
513  * The interleaved_mnodes flag is set when mnodes overlap in
514  * the physbase..physmax range, but have disjoint slices.
515  * In this case hpm_counters is shared by all mnodes.
516  * This flag is set dynamically by the platform.
517  */
518 int interleaved_mnodes = 0;
519 
520 /*
521  * Called by startup().
522  * Size up the per page size free list counters based on physmax
523  * of each node and max_mem_nodes.
524  *
525  * If interleaved_mnodes is set we need to find the first mnode that
526  * exists. hpm_counters for the first mnode will then be shared by
527  * all other mnodes. If interleaved_mnodes is not set, just set
528  * first=mnode each time. That means there will be no sharing.
529  */
530 size_t
531 page_ctrs_sz(void)
532 {
533 	int	r;		/* region size */
534 	int	mnode;
535 	int	firstmn;	/* first mnode that exists */
536 	int	nranges;
537 	pfn_t	physbase;
538 	pfn_t	physmax;
539 	uint_t	ctrs_sz = 0;
540 	int 	i;
541 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
542 
543 	/*
544 	 * We need to determine how many page colors there are for each
545 	 * page size in order to allocate memory for any color specific
546 	 * arrays.
547 	 */
548 	for (i = 0; i < mmu_page_sizes; i++) {
549 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
550 	}
551 
552 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
553 
554 		pgcnt_t r_pgcnt;
555 		pfn_t   r_base;
556 		pgcnt_t r_align;
557 
558 		if (mem_node_config[mnode].exists == 0)
559 			continue;
560 
561 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
562 		nranges = MNODE_RANGE_CNT(mnode);
563 		mnode_nranges[mnode] = nranges;
564 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
565 
566 		/*
567 		 * determine size needed for page counter arrays with
568 		 * base aligned to large page size.
569 		 */
570 		for (r = 1; r < mmu_page_sizes; r++) {
571 			/* add in space for hpm_color_current */
572 			ctrs_sz += sizeof (size_t) *
573 			    colors_per_szc[r] * nranges;
574 
575 			if (firstmn != mnode)
576 				continue;
577 
578 			/* add in space for hpm_counters */
579 			r_align = page_get_pagecnt(r);
580 			r_base = physbase;
581 			r_base &= ~(r_align - 1);
582 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
583 
584 			/*
585 			 * Round up to always allocate on pointer sized
586 			 * boundaries.
587 			 */
588 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
589 			    sizeof (hpmctr_t *));
590 		}
591 	}
592 
593 	for (r = 1; r < mmu_page_sizes; r++) {
594 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
595 	}
596 
597 	/* add in space for page_ctrs_cands and pcc_color_free */
598 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
599 	    mmu_page_sizes * NPC_MUTEX;
600 
601 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
602 
603 		if (mem_node_config[mnode].exists == 0)
604 			continue;
605 
606 		nranges = mnode_nranges[mnode];
607 		ctrs_sz += sizeof (pcc_info_t) * nranges *
608 		    mmu_page_sizes * NPC_MUTEX;
609 		for (r = 1; r < mmu_page_sizes; r++) {
610 			ctrs_sz += sizeof (pgcnt_t) * nranges *
611 			    colors_per_szc[r] * NPC_MUTEX;
612 		}
613 	}
614 
615 	/* ctr_mutex */
616 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
617 
618 	/* size for page list counts */
619 	PLCNT_SZ(ctrs_sz);
620 
621 	/*
622 	 * add some slop for roundups. page_ctrs_alloc will round up the start
623 	 * address of the counters to ecache_alignsize boundary for every
624 	 * memory node.
625 	 */
626 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
627 }
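
/*
 * A minimal usage sketch (platform startup code is the real caller;
 * early_boot_alloc below is a hypothetical allocator, not a real routine):
 *
 *	size_t	sz = page_ctrs_sz();
 *	caddr_t	base = early_boot_alloc(sz);
 *	caddr_t	end = page_ctrs_alloc(base);
 *	ASSERT(end <= base + sz);
 *
 * page_ctrs_sz() includes slop for the per-mnode L2CACHE_ALIGN roundups
 * that page_ctrs_alloc() performs, so the carve-out above is sufficient.
 */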
628 
629 caddr_t
630 page_ctrs_alloc(caddr_t alloc_base)
631 {
632 	int	mnode;
633 	int	mrange, nranges;
634 	int	r;		/* region size */
635 	int	i;
636 	int	firstmn;	/* first mnode that exists */
637 	pfn_t	physbase;
638 	pfn_t	physmax;
639 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
640 
641 	/*
642 	 * We need to determine how many page colors there are for each
643 	 * page size in order to allocate memory for any color specific
644 	 * arrays.
645 	 */
646 	for (i = 0; i < mmu_page_sizes; i++) {
647 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
648 	}
649 
650 	for (r = 1; r < mmu_page_sizes; r++) {
651 		page_counters[r] = (hw_page_map_t *)alloc_base;
652 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
653 	}
654 
655 	/* page_ctrs_cands and pcc_color_free array */
656 	for (i = 0; i < NPC_MUTEX; i++) {
657 		for (r = 1; r < mmu_page_sizes; r++) {
658 
659 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
660 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
661 
662 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
663 				pcc_info_t *pi;
664 
665 				if (mem_node_config[mnode].exists == 0)
666 					continue;
667 
668 				nranges = mnode_nranges[mnode];
669 
670 				pi = (pcc_info_t *)alloc_base;
671 				alloc_base += sizeof (pcc_info_t) * nranges;
672 				page_ctrs_cands[i][r][mnode] = pi;
673 
674 				for (mrange = 0; mrange < nranges; mrange++) {
675 					pi->pcc_color_free =
676 					    (pgcnt_t *)alloc_base;
677 					alloc_base += sizeof (pgcnt_t) *
678 					    colors_per_szc[r];
679 					pi++;
680 				}
681 			}
682 		}
683 	}
684 
685 	/* ctr_mutex */
686 	for (i = 0; i < NPC_MUTEX; i++) {
687 		ctr_mutex[i] = (kmutex_t *)alloc_base;
688 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
689 	}
690 
691 	/* initialize page list counts */
692 	PLCNT_INIT(alloc_base);
693 
694 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
695 
696 		pgcnt_t r_pgcnt;
697 		pfn_t	r_base;
698 		pgcnt_t r_align;
699 		int	r_shift;
700 		int	nranges = mnode_nranges[mnode];
701 
702 		if (mem_node_config[mnode].exists == 0)
703 			continue;
704 
705 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
706 
707 		for (r = 1; r < mmu_page_sizes; r++) {
708 			/*
709 			 * the page_counters base has to be aligned to the
710 			 * page count of page size code r otherwise the counts
711 			 * will cross large page boundaries.
712 			 */
713 			r_align = page_get_pagecnt(r);
714 			r_base = physbase;
715 			/* base needs to be aligned - lower to aligned value */
716 			r_base &= ~(r_align - 1);
717 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
718 			r_shift = PAGE_BSZS_SHIFT(r);
719 
720 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
721 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
722 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
723 			for (mrange = 0; mrange < nranges; mrange++) {
724 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
725 				    r, mrange) = (size_t *)alloc_base;
726 				alloc_base += sizeof (size_t) *
727 				    colors_per_szc[r];
728 			}
729 			for (i = 0; i < colors_per_szc[r]; i++) {
730 				uint_t color_mask = colors_per_szc[r] - 1;
731 				pfn_t  pfnum = r_base;
732 				size_t idx;
733 				int mrange;
734 				MEM_NODE_ITERATOR_DECL(it);
735 
736 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
737 				ASSERT(pfnum != (pfn_t)-1);
738 				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
739 				    color_mask, color_mask, &it);
740 				idx = PNUM_TO_IDX(mnode, r, pfnum);
741 				idx = (idx >= r_pgcnt) ? 0 : idx;
742 				for (mrange = 0; mrange < nranges; mrange++) {
743 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
744 					    r, i, mrange) = idx;
745 				}
746 			}
747 
748 			/* hpm_counters may be shared by all mnodes */
749 			if (firstmn == mnode) {
750 				PAGE_COUNTERS_COUNTERS(mnode, r) =
751 				    (hpmctr_t *)alloc_base;
752 				alloc_base +=
753 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
754 				    sizeof (hpmctr_t *));
755 			} else {
756 				PAGE_COUNTERS_COUNTERS(mnode, r) =
757 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
758 			}
759 
760 			/*
761 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
762 			 * satisfy the identity requirement.
763 			 * We should be able to go from one to the other
764 			 * and get consistent values.
765 			 */
766 			ASSERT(PNUM_TO_IDX(mnode, r,
767 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
768 			ASSERT(IDX_TO_PNUM(mnode, r,
769 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
770 		}
771 		/*
772 		 * Round up the start address of the page_counters to
773 		 * cache aligned boundary for every memory node.
774 		 * page_ctrs_sz() has added some slop for these roundups.
775 		 */
776 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
777 		    L2CACHE_ALIGN);
778 	}
779 
780 	/* Initialize other page counter specific data structures. */
781 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
782 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
783 	}
784 
785 	return (alloc_base);
786 }
787 
788 /*
789  * Functions to adjust region counters for each size free list.
790  * The caller is responsible for acquiring the ctr_mutex lock if necessary;
791  * these can thus be called during startup without locks.
792  */
793 /* ARGSUSED */
794 void
795 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
796 {
797 	ssize_t		r;	/* region size */
798 	ssize_t		idx;
799 	pfn_t		pfnum;
800 	int		lckidx;
801 
802 	ASSERT(mnode == PP_2_MEM_NODE(pp));
803 	ASSERT(mtype == PP_2_MTYPE(pp));
804 
805 	ASSERT(pp->p_szc < mmu_page_sizes);
806 
807 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
808 
809 	/* no counter update needed for largest page size */
810 	if (pp->p_szc >= mmu_page_sizes - 1) {
811 		return;
812 	}
813 
814 	r = pp->p_szc + 1;
815 	pfnum = pp->p_pagenum;
816 	lckidx = PP_CTR_LOCK_INDX(pp);
817 
818 	/*
819 	 * Increment the count of free pages for the current region; continue
820 	 * looping up in region size, incrementing the count, if the preceding
821 	 * region is full.  (A worked example follows this function.)
822 	 */
823 	while (r < mmu_page_sizes) {
824 		idx = PNUM_TO_IDX(mnode, r, pfnum);
825 
826 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
827 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
828 
829 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
830 			break;
831 		} else {
832 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
833 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
834 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
835 
836 			cand->pcc_pages_free++;
837 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
838 		}
839 		r++;
840 	}
841 }
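
/*
 * Worked example of the roll-up above (sparc-flavored, illustrative):
 * freeing the 64K page that completes a 512K region bumps
 * PAGE_COUNTERS(mnode, 2, idx) to FULL_REGION_CNT(2) == 8, so a coalescing
 * candidate for a 512K page is recorded in page_ctrs_cands; the loop then
 * moves to region size 3, where the now-full 512K region adds one more
 * count toward a free 4M page.
 */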
842 
843 void
844 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
845 {
846 	int		lckidx = PP_CTR_LOCK_INDX(pp);
847 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
848 
849 	mutex_enter(lock);
850 	page_ctr_add_internal(mnode, mtype, pp, flags);
851 	mutex_exit(lock);
852 }
853 
854 void
855 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
856 {
857 	int		lckidx;
858 	ssize_t		r;	/* region size */
859 	ssize_t		idx;
860 	pfn_t		pfnum;
861 
862 	ASSERT(mnode == PP_2_MEM_NODE(pp));
863 	ASSERT(mtype == PP_2_MTYPE(pp));
864 
865 	ASSERT(pp->p_szc < mmu_page_sizes);
866 
867 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
868 
869 	/* no counter update needed for largest page size */
870 	if (pp->p_szc >= mmu_page_sizes - 1) {
871 		return;
872 	}
873 
874 	r = pp->p_szc + 1;
875 	pfnum = pp->p_pagenum;
876 	lckidx = PP_CTR_LOCK_INDX(pp);
877 
878 	/*
879 	 * Decrement the count of free pages for the current region; continue
880 	 * looping up in region size, decrementing the count, if the preceding
881 	 * region was full.
882 	 */
883 	while (r < mmu_page_sizes) {
884 		idx = PNUM_TO_IDX(mnode, r, pfnum);
885 
886 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
887 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
888 
889 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
890 			break;
891 		} else {
892 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
893 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
894 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
895 
896 			ASSERT(cand->pcc_pages_free != 0);
897 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
898 
899 			cand->pcc_pages_free--;
900 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
901 		}
902 		r++;
903 	}
904 }
905 
906 void
907 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
908 {
909 	int		lckidx = PP_CTR_LOCK_INDX(pp);
910 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
911 
912 	mutex_enter(lock);
913 	page_ctr_sub_internal(mnode, mtype, pp, flags);
914 	mutex_exit(lock);
915 }
916 
917 /*
918  * Adjust page counters following a memory attach, since typically the
919  * size of the array needs to change, and the PFN to counter index
920  * mapping needs to change.
921  *
922  * It is possible this mnode did not exist at startup. In that case
923  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
924  * to change (a theoretical possibility on x86), which means pcc_color_free
925  * arrays must be extended.
926  */
927 uint_t
928 page_ctrs_adjust(int mnode)
929 {
930 	pgcnt_t npgs;
931 	int	r;		/* region size */
932 	int	i;
933 	size_t	pcsz, old_csz;
934 	hpmctr_t *new_ctr, *old_ctr;
935 	pfn_t	oldbase, newbase;
936 	pfn_t	physbase, physmax;
937 	size_t	old_npgs;
938 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
939 	size_t	size_cache[MMU_PAGE_SIZES];
940 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
941 	size_t	*old_color_array[MAX_MNODE_MRANGES];
942 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
943 	pcc_info_t **cands_cache;
944 	pcc_info_t *old_pi, *pi;
945 	pgcnt_t *pgcntp;
946 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
947 	int cands_cache_nranges;
948 	int old_maxmrange, new_maxmrange;
949 	int rc = 0;
950 
951 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
952 	    MMU_PAGE_SIZES, KM_NOSLEEP);
953 	if (cands_cache == NULL)
954 		return (ENOMEM);
955 
956 	i = -1;
957 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
958 
959 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
960 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
961 
962 	/* prepare to free non-null pointers on the way out */
963 	cands_cache_nranges = nranges;
964 	bzero(ctr_cache, sizeof (ctr_cache));
965 	bzero(color_cache, sizeof (color_cache));
966 
967 	/*
968 	 * We need to determine how many page colors there are for each
969 	 * page size in order to allocate memory for any color specific
970 	 * arrays.
971 	 */
972 	for (r = 0; r < mmu_page_sizes; r++) {
973 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
974 	}
975 
976 	/*
977 	 * Preallocate all of the new hpm_counters arrays as we can't
978 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
979 	 * If we can't allocate all of the arrays, undo our work so far
980 	 * and return failure.
981 	 */
982 	for (r = 1; r < mmu_page_sizes; r++) {
983 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
984 		size_cache[r] = pcsz;
985 		ctr_cache[r] = kmem_zalloc(pcsz *
986 		    sizeof (hpmctr_t), KM_NOSLEEP);
987 		if (ctr_cache[r] == NULL) {
988 			rc = ENOMEM;
989 			goto cleanup;
990 		}
991 	}
992 
993 	/*
994 	 * Preallocate all of the new color current arrays as we can't
995 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
996 	 * If we can't allocate all of the arrays, undo our work so far
997 	 * and return failure.
998 	 */
999 	for (r = 1; r < mmu_page_sizes; r++) {
1000 		for (mrange = 0; mrange < nranges; mrange++) {
1001 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1002 			    colors_per_szc[r], KM_NOSLEEP);
1003 			if (color_cache[r][mrange] == NULL) {
1004 				rc = ENOMEM;
1005 				goto cleanup;
1006 			}
1007 		}
1008 	}
1009 
1010 	/*
1011 	 * Preallocate all of the new pcc_info_t arrays as we can't
1012 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1013 	 * If we can't allocate all of the arrays, undo our work so far
1014 	 * and return failure.
1015 	 */
1016 	for (r = 1; r < mmu_page_sizes; r++) {
1017 		for (i = 0; i < NPC_MUTEX; i++) {
1018 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1019 			    KM_NOSLEEP);
1020 			if (pi == NULL) {
1021 				rc = ENOMEM;
1022 				goto cleanup;
1023 			}
1024 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1025 
1026 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
1027 				pgcntp = kmem_zalloc(colors_per_szc[r] *
1028 				    sizeof (pgcnt_t), KM_NOSLEEP);
1029 				if (pgcntp == NULL) {
1030 					rc = ENOMEM;
1031 					goto cleanup;
1032 				}
1033 				pi->pcc_color_free = pgcntp;
1034 			}
1035 		}
1036 	}
1037 
1038 	/*
1039 	 * Grab the write lock to prevent others from walking these arrays
1040 	 * while we are modifying them.
1041 	 */
1042 	PAGE_CTRS_WRITE_LOCK(mnode);
1043 
1044 	old_nranges = mnode_nranges[mnode];
1045 	cands_cache_nranges = old_nranges;
1046 	mnode_nranges[mnode] = nranges;
1047 	old_maxmrange = mnode_maxmrange[mnode];
1048 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1049 	new_maxmrange = mnode_maxmrange[mnode];
1050 
1051 	for (r = 1; r < mmu_page_sizes; r++) {
1052 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1053 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
1054 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
1055 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
1056 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
1057 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1058 			old_color_array[mrange] =
1059 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1060 			    r, mrange);
1061 		}
1062 
1063 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1064 		new_ctr = ctr_cache[r];
1065 		ctr_cache[r] = NULL;
1066 		if (old_ctr != NULL &&
1067 		    (oldbase + old_npgs > newbase) &&
1068 		    (newbase + npgs > oldbase)) {
1069 			/*
1070 			 * Map the intersection of the old and new
1071 			 * counters into the new array.
1072 			 */
1073 			size_t offset;
1074 			if (newbase > oldbase) {
1075 				offset = (newbase - oldbase) >>
1076 				    PAGE_COUNTERS_SHIFT(mnode, r);
1077 				bcopy(old_ctr + offset, new_ctr,
1078 				    MIN(pcsz, (old_csz - offset)) *
1079 				    sizeof (hpmctr_t));
1080 			} else {
1081 				offset = (oldbase - newbase) >>
1082 				    PAGE_COUNTERS_SHIFT(mnode, r);
1083 				bcopy(old_ctr, new_ctr + offset,
1084 				    MIN(pcsz - offset, old_csz) *
1085 				    sizeof (hpmctr_t));
1086 			}
1087 		}
1088 
1089 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1090 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1091 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1092 
1093 		/* update shared hpm_counters in other mnodes */
1094 		if (interleaved_mnodes) {
1095 			for (i = 0; i < max_mem_nodes; i++) {
1096 				if (i == mnode)
1097 					continue;
1098 				if (mem_node_config[i].exists == 0)
1099 					continue;
1100 				ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
1101 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1102 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1103 				PAGE_COUNTERS_BASE(i, r) = newbase;
1104 			}
1105 		}
1106 
1107 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1108 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1109 			    color_cache[r][mrange];
1110 			color_cache[r][mrange] = NULL;
1111 		}
1112 		/*
1113 		 * For now, just reset on these events, as it's probably
1114 		 * not worthwhile to try to optimize this.
1115 		 */
1116 		for (i = 0; i < colors_per_szc[r]; i++) {
1117 			uint_t color_mask = colors_per_szc[r] - 1;
1118 			int mlo = interleaved_mnodes ? 0 : mnode;
1119 			int mhi = interleaved_mnodes ? max_mem_nodes :
1120 			    (mnode + 1);
1121 			int m;
1122 			pfn_t  pfnum = newbase;
1123 			size_t idx;
1124 			MEM_NODE_ITERATOR_DECL(it);
1125 
1126 			for (m = mlo; m < mhi; m++) {
1127 				if (mem_node_config[m].exists == 0)
1128 					continue;
1129 				MEM_NODE_ITERATOR_INIT(pfnum, m, &it);
1130 				ASSERT(pfnum != (pfn_t)-1);
1131 				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
1132 				    color_mask, &it);
1133 				idx = PNUM_TO_IDX(m, r, pfnum);
1134 				idx = (idx < pcsz) ? idx : 0;
1135 				for (mrange = 0; mrange < nranges; mrange++) {
1136 					PAGE_COUNTERS_CURRENT_COLOR(m,
1137 					    r, i, mrange) = idx;
1138 				}
1139 			}
1140 		}
1141 
1142 		/* cache info for freeing out of the critical path */
1143 		if ((caddr_t)old_ctr >= kernelheap &&
1144 		    (caddr_t)old_ctr < ekernelheap) {
1145 			ctr_cache[r] = old_ctr;
1146 			size_cache[r] = old_csz;
1147 		}
1148 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1149 			size_t *tmp = old_color_array[mrange];
1150 			if ((caddr_t)tmp >= kernelheap &&
1151 			    (caddr_t)tmp < ekernelheap) {
1152 				color_cache[r][mrange] = tmp;
1153 			}
1154 		}
1155 		/*
1156 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1157 		 * satisfy the identity requirement.
1158 		 * We should be able to go from one to the other
1159 		 * and get consistent values.
1160 		 */
1161 		ASSERT(PNUM_TO_IDX(mnode, r,
1162 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
1163 		ASSERT(IDX_TO_PNUM(mnode, r,
1164 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1165 
1166 		/* pcc_info_t and pcc_color_free */
1167 		for (i = 0; i < NPC_MUTEX; i++) {
1168 			pcc_info_t *epi;
1169 			pcc_info_t *eold_pi;
1170 
1171 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1172 			old_pi = page_ctrs_cands[i][r][mnode];
1173 			page_ctrs_cands[i][r][mnode] = pi;
1174 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1175 
1176 			/* preserve old pcc_color_free values, if any */
1177 			if (old_pi == NULL)
1178 				continue;
1179 
1180 			/*
1181 			 * when/if x86 does DR, must account for
1182 			 * possible change in range index when
1183 			 * preserving pcc_info
1184 			 */
1185 			epi = &pi[nranges];
1186 			eold_pi = &old_pi[old_nranges];
1187 			if (new_maxmrange > old_maxmrange) {
1188 				pi += new_maxmrange - old_maxmrange;
1189 			} else if (new_maxmrange < old_maxmrange) {
1190 				old_pi += old_maxmrange - new_maxmrange;
1191 			}
1192 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1193 				pcc_info_t tmp = *pi;
1194 				*pi = *old_pi;
1195 				*old_pi = tmp;
1196 			}
1197 		}
1198 	}
1199 	PAGE_CTRS_WRITE_UNLOCK(mnode);
1200 
1201 	/*
1202 	 * Now that we have dropped the write lock, it is safe to free all
1203 	 * of the memory we have cached above.
1204 	 * We come through here to free memory when pre-alloc fails, and also to
1205 	 * free old pointers which were recorded while locked.
1206 	 */
1207 cleanup:
1208 	for (r = 1; r < mmu_page_sizes; r++) {
1209 		if (ctr_cache[r] != NULL) {
1210 			kmem_free(ctr_cache[r],
1211 			    size_cache[r] * sizeof (hpmctr_t));
1212 		}
1213 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1214 			if (color_cache[r][mrange] != NULL) {
1215 				kmem_free(color_cache[r][mrange],
1216 				    colors_per_szc[r] * sizeof (size_t));
1217 			}
1218 		}
1219 		for (i = 0; i < NPC_MUTEX; i++) {
1220 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1221 			if (pi == NULL)
1222 				continue;
1223 			nr = cands_cache_nranges;
1224 			for (mrange = 0; mrange < nr; mrange++, pi++) {
1225 				pgcntp = pi->pcc_color_free;
1226 				if (pgcntp == NULL)
1227 					continue;
1228 				if ((caddr_t)pgcntp >= kernelheap &&
1229 				    (caddr_t)pgcntp < ekernelheap) {
1230 					kmem_free(pgcntp,
1231 					    colors_per_szc[r] *
1232 					    sizeof (pgcnt_t));
1233 				}
1234 			}
1235 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1236 			if ((caddr_t)pi >= kernelheap &&
1237 			    (caddr_t)pi < ekernelheap) {
1238 				kmem_free(pi, nr * sizeof (pcc_info_t));
1239 			}
1240 		}
1241 	}
1242 
1243 	kmem_free(cands_cache,
1244 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1245 	return (rc);
1246 }
1247 
1248 
1249 #ifdef DEBUG
1250 
1251 /*
1252  * confirm pp is a large page corresponding to szc
1253  */
1254 void
1255 chk_lpg(page_t *pp, uchar_t szc)
1256 {
1257 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1258 	uint_t noreloc;
1259 
1260 	if (npgs == 1) {
1261 		ASSERT(pp->p_szc == 0);
1262 		ASSERT(pp->p_next == pp);
1263 		ASSERT(pp->p_prev == pp);
1264 		return;
1265 	}
1266 
1267 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1268 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1269 
1270 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1271 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1272 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1273 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1274 
1275 	/*
1276 	 * Check list of pages.
1277 	 */
1278 	noreloc = PP_ISNORELOC(pp);
1279 	while (npgs--) {
1280 		if (npgs != 0) {
1281 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1282 			ASSERT(pp->p_next == (pp + 1));
1283 		}
1284 		ASSERT(pp->p_szc == szc);
1285 		ASSERT(PP_ISFREE(pp));
1286 		ASSERT(PP_ISAGED(pp));
1287 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1288 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1289 		ASSERT(pp->p_vnode  == NULL);
1290 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1291 
1292 		pp = pp->p_next;
1293 	}
1294 }
1295 #endif /* DEBUG */
1296 
1297 void
1298 page_freelist_lock(int mnode)
1299 {
1300 	int i;
1301 	for (i = 0; i < NPC_MUTEX; i++) {
1302 		mutex_enter(FPC_MUTEX(mnode, i));
1303 		mutex_enter(CPC_MUTEX(mnode, i));
1304 	}
1305 }
1306 
1307 void
1308 page_freelist_unlock(int mnode)
1309 {
1310 	int i;
1311 	for (i = 0; i < NPC_MUTEX; i++) {
1312 		mutex_exit(FPC_MUTEX(mnode, i));
1313 		mutex_exit(CPC_MUTEX(mnode, i));
1314 	}
1315 }
1316 
1317 /*
1318  * add pp to the specified page list. Defaults to head of the page list
1319  * unless PG_LIST_TAIL is specified.
1320  */
1321 void
1322 page_list_add(page_t *pp, int flags)
1323 {
1324 	page_t		**ppp;
1325 	kmutex_t	*pcm;
1326 	uint_t		bin, mtype;
1327 	int		mnode;
1328 
1329 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1330 	ASSERT(PP_ISFREE(pp));
1331 	ASSERT(!hat_page_is_mapped(pp));
1332 	ASSERT(hat_page_getshare(pp) == 0);
1333 
1334 	/*
1335 	 * Large pages should be freed via page_list_add_pages().
1336 	 */
1337 	ASSERT(pp->p_szc == 0);
1338 
1339 	/*
1340 	 * Don't need to lock the freelist first here
1341 	 * because the page isn't on the freelist yet.
1342 	 * This means p_szc can't change on us.
1343 	 */
1344 
1345 	bin = PP_2_BIN(pp);
1346 	mnode = PP_2_MEM_NODE(pp);
1347 	mtype = PP_2_MTYPE(pp);
1348 
1349 	if (flags & PG_LIST_ISINIT) {
1350 		 * PG_LIST_ISINIT is set during system startup (i.e. single
1351 		 * threaded), so add the page to the free list and to the
1352 		 * free region counters w/o any locking
1353 		 * the free region counters w/o any locking
1354 		 */
1355 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1356 
1357 		/* inline version of page_add() */
1358 		if (*ppp != NULL) {
1359 			pp->p_next = *ppp;
1360 			pp->p_prev = (*ppp)->p_prev;
1361 			(*ppp)->p_prev = pp;
1362 			pp->p_prev->p_next = pp;
1363 		} else
1364 			*ppp = pp;
1365 
1366 		page_ctr_add_internal(mnode, mtype, pp, flags);
1367 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1368 	} else {
1369 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1370 
1371 		if (flags & PG_FREE_LIST) {
1372 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1373 			ASSERT(PP_ISAGED(pp));
1374 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1375 
1376 		} else {
1377 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1378 			ASSERT(pp->p_vnode);
1379 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1380 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1381 		}
1382 		mutex_enter(pcm);
1383 		page_add(ppp, pp);
1384 
1385 		if (flags & PG_LIST_TAIL)
1386 			*ppp = (*ppp)->p_next;
1387 		/*
1388 		 * Add counters before releasing pcm mutex to avoid a race with
1389 		 * page_freelist_coalesce and page_freelist_split.
1390 		 */
1391 		page_ctr_add(mnode, mtype, pp, flags);
1392 		mutex_exit(pcm);
1393 	}
1394 
1395 
1396 #if defined(__sparc)
1397 	if (PP_ISNORELOC(pp)) {
1398 		kcage_freemem_add(1);
1399 	}
1400 #endif
1401 	/*
1402 	 * It is up to the caller to unlock the page!
1403 	 */
1404 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1405 }
1406 
1407 
1408 #ifdef __sparc
1409 /*
1410  * This routine is only used by kcage_init during system startup.
1411  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1412  * without the overhead of taking locks and updating counters.
1413  */
1414 void
1415 page_list_noreloc_startup(page_t *pp)
1416 {
1417 	page_t		**ppp;
1418 	uint_t		bin;
1419 	int		mnode;
1420 	int		mtype;
1421 	int		flags = 0;
1422 
1423 	/*
1424 	 * If this is a large page on the freelist then
1425 	 * break it up into smaller pages.
1426 	 */
1427 	if (pp->p_szc != 0)
1428 		page_boot_demote(pp);
1429 
1430 	/*
1431 	 * Get the list the page is currently on.
1432 	 */
1433 	bin = PP_2_BIN(pp);
1434 	mnode = PP_2_MEM_NODE(pp);
1435 	mtype = PP_2_MTYPE(pp);
1436 	ASSERT(mtype == MTYPE_RELOC);
1437 	ASSERT(pp->p_szc == 0);
1438 
1439 	if (PP_ISAGED(pp)) {
1440 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1441 		flags |= PG_FREE_LIST;
1442 	} else {
1443 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1444 		flags |= PG_CACHE_LIST;
1445 	}
1446 
1447 	ASSERT(*ppp != NULL);
1448 
1449 	/*
1450 	 * Delete page from current list.
1451 	 */
1452 	if (*ppp == pp)
1453 		*ppp = pp->p_next;		/* go to next page */
1454 	if (*ppp == pp) {
1455 		*ppp = NULL;			/* page list is gone */
1456 	} else {
1457 		pp->p_prev->p_next = pp->p_next;
1458 		pp->p_next->p_prev = pp->p_prev;
1459 	}
1460 
1461 	/*
1462 	 * Decrement page counters
1463 	 */
1464 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1465 
1466 	/*
1467 	 * Set NORELOC for cage-initialized pages.
1468 	 */
1469 	PP_SETNORELOC(pp);
1470 
1471 	mtype = PP_2_MTYPE(pp);
1472 	ASSERT(mtype == MTYPE_NORELOC);
1473 
1474 	/*
1475 	 * Get new list for page.
1476 	 */
1477 	if (PP_ISAGED(pp)) {
1478 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1479 	} else {
1480 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1481 	}
1482 
1483 	/*
1484 	 * Insert page on new list.
1485 	 */
1486 	if (*ppp == NULL) {
1487 		*ppp = pp;
1488 		pp->p_next = pp->p_prev = pp;
1489 	} else {
1490 		pp->p_next = *ppp;
1491 		pp->p_prev = (*ppp)->p_prev;
1492 		(*ppp)->p_prev = pp;
1493 		pp->p_prev->p_next = pp;
1494 	}
1495 
1496 	/*
1497 	 * Increment page counters
1498 	 */
1499 	page_ctr_add_internal(mnode, mtype, pp, flags);
1500 
1501 	/*
1502 	 * Update cage freemem counter
1503 	 */
1504 	atomic_add_long(&kcage_freemem, 1);
1505 }
1506 #else	/* __sparc */
1507 
1508 /* ARGSUSED */
1509 void
1510 page_list_noreloc_startup(page_t *pp)
1511 {
1512 	panic("page_list_noreloc_startup: should be here only for sparc");
1513 }
1514 #endif
1515 
1516 void
1517 page_list_add_pages(page_t *pp, int flags)
1518 {
1519 	kmutex_t *pcm;
1520 	pgcnt_t	pgcnt;
1521 	uint_t	bin, mtype, i;
1522 	int	mnode;
1523 
1524 	/* default to freelist/head */
1525 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1526 
1527 	CHK_LPG(pp, pp->p_szc);
1528 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1529 
1530 	bin = PP_2_BIN(pp);
1531 	mnode = PP_2_MEM_NODE(pp);
1532 	mtype = PP_2_MTYPE(pp);
1533 
1534 	if (flags & PG_LIST_ISINIT) {
1535 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1536 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1537 		ASSERT(!PP_ISNORELOC(pp));
1538 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1539 	} else {
1540 
1541 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1542 
1543 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1544 
1545 		mutex_enter(pcm);
1546 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1547 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1548 		mutex_exit(pcm);
1549 
1550 		pgcnt = page_get_pagecnt(pp->p_szc);
1551 #if defined(__sparc)
1552 		if (PP_ISNORELOC(pp))
1553 			kcage_freemem_add(pgcnt);
1554 #endif
1555 		for (i = 0; i < pgcnt; i++, pp++)
1556 			page_unlock_nocapture(pp);
1557 	}
1558 }
1559 
1560 /*
1561  * During boot, we need to demote a large page to base
1562  * pagesize pages for seg_kmem, for use in boot_alloc()
1563  */
1564 void
1565 page_boot_demote(page_t *pp)
1566 {
1567 	ASSERT(pp->p_szc != 0);
1568 	ASSERT(PP_ISFREE(pp));
1569 	ASSERT(PP_ISAGED(pp));
1570 
1571 	(void) page_demote(PP_2_MEM_NODE(pp),
1572 	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
1573 	    PC_FREE);
1574 
1575 	ASSERT(PP_ISFREE(pp));
1576 	ASSERT(PP_ISAGED(pp));
1577 	ASSERT(pp->p_szc == 0);
1578 }
1579 
1580 /*
1581  * Take a particular page off of whatever freelist the page
1582  * is claimed to be on.
1583  *
1584  * NOTE: Only used for PAGESIZE pages.
1585  */
1586 void
1587 page_list_sub(page_t *pp, int flags)
1588 {
1589 	int		bin;
1590 	uint_t		mtype;
1591 	int		mnode;
1592 	kmutex_t	*pcm;
1593 	page_t		**ppp;
1594 
1595 	ASSERT(PAGE_EXCL(pp));
1596 	ASSERT(PP_ISFREE(pp));
1597 
1598 	/*
1599 	 * The p_szc field can only be changed by page_promote()
1600 	 * and page_demote(). Only free pages can be promoted and
1601 	 * demoted and the free list MUST be locked during these
1602 	 * operations. So to prevent a race in page_list_sub()
1603 	 * between computing which bin of the freelist lock to
1604 	 * grab and actually grabing the lock we check again that
1605 	 * grab and actually grabbing the lock, we check again that
1606 	 * the p_szc field could have actually changed on us but
1607 	 * if the bin happens to still be the same we are safe.
1608 	 */
1609 try_again:
1610 	bin = PP_2_BIN(pp);
1611 	mnode = PP_2_MEM_NODE(pp);
1612 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1613 	mutex_enter(pcm);
1614 	if (PP_2_BIN(pp) != bin) {
1615 		mutex_exit(pcm);
1616 		goto try_again;
1617 	}
1618 	mtype = PP_2_MTYPE(pp);
1619 
1620 	if (flags & PG_FREE_LIST) {
1621 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1622 		ASSERT(PP_ISAGED(pp));
1623 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1624 	} else {
1625 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1626 		ASSERT(!PP_ISAGED(pp));
1627 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1628 	}
1629 
1630 	/*
1631 	 * Common PAGESIZE case.
1632 	 *
1633 	 * Note that we locked the freelist. This prevents
1634 	 * any page promotion/demotion operations. Therefore
1635 	 * the p_szc will not change until we drop pcm mutex.
1636 	 */
1637 	if (pp->p_szc == 0) {
1638 		page_sub(ppp, pp);
1639 		/*
1640 		 * Subtract counters before releasing pcm mutex
1641 		 * to avoid race with page_freelist_coalesce.
1642 		 */
1643 		page_ctr_sub(mnode, mtype, pp, flags);
1644 		mutex_exit(pcm);
1645 
1646 #if defined(__sparc)
1647 		if (PP_ISNORELOC(pp)) {
1648 			kcage_freemem_sub(1);
1649 		}
1650 #endif
1651 		return;
1652 	}
1653 
1654 	/*
1655 	 * Large pages on the cache list are not supported.
1656 	 */
1657 	if (flags & PG_CACHE_LIST)
1658 		panic("page_list_sub: large page on cachelist");
1659 
1660 	/*
1661 	 * Slow but rare.
1662 	 *
1663 	 * Somebody wants this particular page which is part
1664 	 * of a large page. In this case we just demote the page
1665 	 * if it's on the freelist.
1666 	 *
1667 	 * We have to drop pcm before locking the entire freelist.
1668 	 * Once we have re-locked the freelist check to make sure
1669 	 * the page hasn't already been demoted or completely
1670 	 * freed.
1671 	 */
1672 	mutex_exit(pcm);
1673 	page_freelist_lock(mnode);
1674 	if (pp->p_szc != 0) {
1675 		/*
1676 		 * Large page is on freelist.
1677 		 */
1678 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1679 		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1680 	}
1681 	ASSERT(PP_ISFREE(pp));
1682 	ASSERT(PP_ISAGED(pp));
1683 	ASSERT(pp->p_szc == 0);
1684 
1685 	/*
1686 	 * Subtract counters before releasing pcm mutex
1687 	 * to avoid race with page_freelist_coalesce.
1688 	 */
1689 	bin = PP_2_BIN(pp);
1690 	mtype = PP_2_MTYPE(pp);
1691 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1692 
1693 	page_sub(ppp, pp);
1694 	page_ctr_sub(mnode, mtype, pp, flags);
1695 	page_freelist_unlock(mnode);
1696 
1697 #if defined(__sparc)
1698 	if (PP_ISNORELOC(pp)) {
1699 		kcage_freemem_sub(1);
1700 	}
1701 #endif
1702 }
1703 
1704 void
1705 page_list_sub_pages(page_t *pp, uint_t szc)
1706 {
1707 	kmutex_t *pcm;
1708 	uint_t	bin, mtype;
1709 	int	mnode;
1710 
1711 	ASSERT(PAGE_EXCL(pp));
1712 	ASSERT(PP_ISFREE(pp));
1713 	ASSERT(PP_ISAGED(pp));
1714 
1715 	/*
1716 	 * See comment in page_list_sub().
1717 	 */
1718 try_again:
1719 	bin = PP_2_BIN(pp);
1720 	mnode = PP_2_MEM_NODE(pp);
1721 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1722 	mutex_enter(pcm);
1723 	if (PP_2_BIN(pp) != bin) {
1724 		mutex_exit(pcm);
1725 		goto	try_again;
1726 	}
1727 
1728 	/*
1729 	 * If we're called with a page larger than szc or it got
1730 	 * promoted above szc before we locked the freelist then
1731 	 * drop pcm and re-lock the entire freelist. If the page is still
1732 	 * larger than szc, demote it.
1733 	 */
1734 	if (pp->p_szc > szc) {
1735 		mutex_exit(pcm);
1736 		pcm = NULL;
1737 		page_freelist_lock(mnode);
1738 		if (pp->p_szc > szc) {
1739 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1740 			(void) page_demote(mnode,
1741 			    PFN_BASE(pp->p_pagenum, pp->p_szc),
1742 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1743 		}
1744 		bin = PP_2_BIN(pp);
1745 	}
1746 	ASSERT(PP_ISFREE(pp));
1747 	ASSERT(PP_ISAGED(pp));
1748 	ASSERT(pp->p_szc <= szc);
1749 	ASSERT(pp == PP_PAGEROOT(pp));
1750 
1751 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1752 
1753 	mtype = PP_2_MTYPE(pp);
1754 	if (pp->p_szc != 0) {
1755 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1756 		CHK_LPG(pp, pp->p_szc);
1757 	} else {
1758 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1759 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1760 	}
1761 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1762 
1763 	if (pcm != NULL) {
1764 		mutex_exit(pcm);
1765 	} else {
1766 		page_freelist_unlock(mnode);
1767 	}
1768 
1769 #if defined(__sparc)
1770 	if (PP_ISNORELOC(pp)) {
1771 		pgcnt_t	pgcnt;
1772 
1773 		pgcnt = page_get_pagecnt(pp->p_szc);
1774 		kcage_freemem_sub(pgcnt);
1775 	}
1776 #endif
1777 }
1778 
1779 /*
1780  * Add the page to the front of a linked list of pages
1781  * using the p_next & p_prev pointers for the list.
1782  * The caller is responsible for protecting the list pointers.
1783  */
1784 void
1785 mach_page_add(page_t **ppp, page_t *pp)
1786 {
1787 	if (*ppp == NULL) {
1788 		pp->p_next = pp->p_prev = pp;
1789 	} else {
1790 		pp->p_next = *ppp;
1791 		pp->p_prev = (*ppp)->p_prev;
1792 		(*ppp)->p_prev = pp;
1793 		pp->p_prev->p_next = pp;
1794 	}
1795 	*ppp = pp;
1796 }
1797 
1798 /*
1799  * Remove this page from a linked list of pages
1800  * using the p_next & p_prev pointers for the list.
1801  *
1802  * The caller is responsible for protecting the list pointers.
1803  */
1804 void
1805 mach_page_sub(page_t **ppp, page_t *pp)
1806 {
1807 	ASSERT(PP_ISFREE(pp));
1808 
1809 	if (*ppp == NULL || pp == NULL)
1810 		panic("mach_page_sub");
1811 
1812 	if (*ppp == pp)
1813 		*ppp = pp->p_next;		/* go to next page */
1814 
1815 	if (*ppp == pp)
1816 		*ppp = NULL;			/* page list is gone */
1817 	else {
1818 		pp->p_prev->p_next = pp->p_next;
1819 		pp->p_next->p_prev = pp->p_prev;
1820 	}
1821 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1822 }
1823 
1824 /*
1825  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1826  */
1827 void
1828 page_promote_size(page_t *pp, uint_t cur_szc)
1829 {
1830 	pfn_t pfn;
1831 	int mnode;
1832 	int idx;
1833 	int new_szc = cur_szc + 1;
1834 	int full = FULL_REGION_CNT(new_szc);
1835 
1836 	pfn = page_pptonum(pp);
1837 	mnode = PFN_2_MEM_NODE(pfn);
1838 
1839 	page_freelist_lock(mnode);
1840 
1841 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1842 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1843 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1844 
1845 	page_freelist_unlock(mnode);
1846 }
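
/*
 * Illustrative usage (hypothetical caller state): for a free PAGESIZE
 * page pp with p_szc == 0, fsflush may call page_promote_size(pp, 0);
 * the surrounding region is promoted to size code 1 only when its
 * PAGE_COUNTERS entry shows the whole would-be larger page is free.
 */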
1847 
1848 static uint_t page_promote_err;
1849 static uint_t page_promote_noreloc_err;
1850 
1851 /*
1852  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1853  * for the given mnode starting at pfnum. Pages involved are on the freelist
1854  * before the call and may be returned to the caller if requested, otherwise
1855  * they will be placed back on the freelist.
1856  * If flags is PC_ALLOC, then the large page will be returned to the user in
1857  * a state which is consistent with a page being taken off the freelist.  If
1858  * we failed to lock the new large page, then we will return NULL to the
1859  * caller and put the large page on the freelist instead.
1860  * If flags is PC_FREE, then the large page will be placed on the freelist,
1861  * and NULL will be returned.
1862  * The caller is responsible for locking the freelist as well as any other
1863  * accounting which needs to be done for a returned page.
1864  *
1865  * RFE: For performance pass in pp instead of pfnum so
1866  * 	we can avoid excessive calls to page_numtopp_nolock().
1867  *	This would depend on an assumption that all contiguous
1868  *	pages are in the same memseg so we can just add/dec
1869  *	our pp.
1870  *
1871  * Lock ordering:
1872  *
1873  *	There is a potential but rare deadlock situation
1874  *	for page promotion and demotion operations. The problem
1875  *	is there are two paths into the freelist manager and
1876  *	they have different lock orders:
1877  *
1878  *	page_create()
1879  *		lock freelist
1880  *		page_lock(EXCL)
1881  *		unlock freelist
1882  *		return
1883  *		caller drops page_lock
1884  *
1885  *	page_free() and page_reclaim()
1886  *		caller grabs page_lock(EXCL)
1887  *
1888  *		lock freelist
1889  *		unlock freelist
1890  *		drop page_lock
1891  *
1892  *	What prevents a thread in page_create() from deadlocking
1893  *	with a thread freeing or reclaiming the same page is the
1894  *	page_trylock() in page_get_freelist(). If the trylock fails
1895  *	it skips the page.
1896  *
1897  *	The lock ordering for promotion and demotion is the same as
1898  *	for page_create(). Since the same deadlock could occur during
1899  *	page promotion and freeing or reclaiming of a page on the
1900  *	cache list we might have to fail the operation and undo what
1901  *	we have done so far. Again, this is rare.
1902  */
1903 page_t *
1904 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1905 {
1906 	page_t		*pp, *pplist, *tpp, *start_pp;
1907 	pgcnt_t		new_npgs, npgs;
1908 	uint_t		bin;
1909 	pgcnt_t		tmpnpgs, pages_left;
1910 	uint_t		noreloc;
1911 	int 		which_list;
1912 	ulong_t		index;
1913 	kmutex_t	*phm;
1914 
1915 	/*
1916 	 * General algorithm:
1917 	 * Find the starting page
1918 	 * Walk each page struct removing it from the freelist,
1919 	 * and linking it to all the other pages removed.
1920 	 * Once all pages are off the freelist,
1921 	 * walk the list, modifying p_szc to new_szc and whatever
1922 	 * other info needs to be done to create a large free page.
1923 	 * According to the flags, either return the page or put it
1924 	 * on the freelist.
1925 	 */
1926 
1927 	start_pp = page_numtopp_nolock(pfnum);
1928 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1929 	new_npgs = page_get_pagecnt(new_szc);
1930 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1931 
1932 	/* don't return page of the wrong mtype */
1933 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1934 			return (NULL);
1935 
1936 	/*
1937 	 * Loop through smaller pages to confirm that all pages
1938 	 * give the same result for PP_ISNORELOC().
1939 	 * We can check this reliably here as the protocol for setting
1940 	 * P_NORELOC requires pages to be taken off the free list first.
1941 	 */
1942 	noreloc = PP_ISNORELOC(start_pp);
1943 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1944 		if (noreloc != PP_ISNORELOC(pp)) {
1945 			page_promote_noreloc_err++;
1946 			page_promote_err++;
1947 			return (NULL);
1948 		}
1949 	}
1950 
1951 	pages_left = new_npgs;
1952 	pplist = NULL;
1953 	pp = start_pp;
1954 
1955 	/* Loop around coalescing the smaller pages into a big page. */
1956 	while (pages_left) {
1957 		/*
1958 		 * Remove from the freelist.
1959 		 */
1960 		ASSERT(PP_ISFREE(pp));
1961 		bin = PP_2_BIN(pp);
1962 		ASSERT(mnode == PP_2_MEM_NODE(pp));
1963 		mtype = PP_2_MTYPE(pp);
1964 		if (PP_ISAGED(pp)) {
1965 
1966 			/*
1967 			 * PG_FREE_LIST
1968 			 */
1969 			if (pp->p_szc) {
1970 				page_vpsub(&PAGE_FREELISTS(mnode,
1971 				    pp->p_szc, bin, mtype), pp);
1972 			} else {
1973 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1974 				    bin, mtype), pp);
1975 			}
1976 			which_list = PG_FREE_LIST;
1977 		} else {
1978 			ASSERT(pp->p_szc == 0);
1979 
1980 			/*
1981 			 * PG_CACHE_LIST
1982 			 *
1983 			 * Since this page comes from the
1984 			 * cachelist, we must destroy the
1985 			 * vnode association.
1986 			 */
1987 			if (!page_trylock(pp, SE_EXCL)) {
1988 				goto fail_promote;
1989 			}
1990 
1991 			/*
1992 			 * We need to be careful not to deadlock
1993 			 * with another thread in page_lookup().
1994 			 * The page_lookup() thread could be holding
1995 			 * the same phm that we need if the two
1996 			 * pages happen to hash to the same phm lock.
1997 			 * At this point we have locked the entire
1998 			 * freelist and page_lookup() could be trying
1999 			 * to grab a freelist lock.
2000 			 */
2001 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2002 			phm = PAGE_HASH_MUTEX(index);
2003 			if (!mutex_tryenter(phm)) {
2004 				page_unlock_nocapture(pp);
2005 				goto fail_promote;
2006 			}
2007 
2008 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2009 			page_hashout(pp, phm);
2010 			mutex_exit(phm);
2011 			PP_SETAGED(pp);
2012 			page_unlock_nocapture(pp);
2013 			which_list = PG_CACHE_LIST;
2014 		}
2015 		page_ctr_sub(mnode, mtype, pp, which_list);
2016 
2017 		/*
2018 		 * Concatenate the smaller page(s) onto
2019 		 * the large page list.
2020 		 */
2021 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2022 		pages_left -= npgs;
2023 		tpp = pp;
2024 		while (npgs--) {
2025 			tpp->p_szc = new_szc;
2026 			tpp = tpp->p_next;
2027 		}
2028 		page_list_concat(&pplist, &pp);
2029 		pp += tmpnpgs;
2030 	}
2031 	CHK_LPG(pplist, new_szc);
2032 
2033 	/*
2034 	 * return the page to the user if requested
2035 	 * in the properly locked state.
2036 	 */
2037 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2038 		return (pplist);
2039 	}
2040 
2041 	/*
2042 	 * Otherwise place the new large page on the freelist
2043 	 */
2044 	bin = PP_2_BIN(pplist);
2045 	mnode = PP_2_MEM_NODE(pplist);
2046 	mtype = PP_2_MTYPE(pplist);
2047 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2048 
2049 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2050 	return (NULL);
2051 
2052 fail_promote:
2053 	/*
2054 	 * A thread must have still been freeing or
2055 	 * reclaiming the page on the cachelist.
2056 	 * To prevent a deadlock, undo what we have
2057 	 * done so far and return failure. This
2058 	 * situation can only happen while promoting
2059 	 * PAGESIZE pages.
2060 	 */
2061 	page_promote_err++;
2062 	while (pplist) {
2063 		pp = pplist;
2064 		mach_page_sub(&pplist, pp);
2065 		pp->p_szc = 0;
2066 		bin = PP_2_BIN(pp);
2067 		mtype = PP_2_MTYPE(pp);
2068 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2069 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2070 	}
2071 	return (NULL);
2072 
2073 }
2074 
2075 /*
2076  * Break up a large page into smaller size pages.
2077  * Pages involved are on the freelist before the call and may
2078  * be returned to the caller if requested, otherwise they will
2079  * be placed back on the freelist.
2080  * The caller is responsible for locking the freelist as well as any other
2081  * accounting which needs to be done for a returned page.
2082  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2083  * technically any value may be passed in, but PC_NO_COLOR is the standard
2084  * that should be followed for clarity's sake.
2085  */
2086 page_t *
2087 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
2088     int color, int flags)
2089 {
2090 	page_t	*pp, *pplist, *npplist;
2091 	pgcnt_t	npgs, n;
2092 	uint_t	bin;
2093 	uint_t	mtype;
2094 	page_t	*ret_pp = NULL;
2095 
2096 	ASSERT(cur_szc != 0);
2097 	ASSERT(new_szc < cur_szc);
2098 
2099 	pplist = page_numtopp_nolock(pfnum);
2100 	ASSERT(pplist != NULL);
2101 
2102 	ASSERT(pplist->p_szc == cur_szc);
2103 
2104 	bin = PP_2_BIN(pplist);
2105 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2106 	mtype = PP_2_MTYPE(pplist);
2107 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2108 
2109 	CHK_LPG(pplist, cur_szc);
2110 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2111 
2112 	/*
2113 	 * Number of PAGESIZE pages for smaller new_szc
2114 	 * page.
2115 	 */
2116 	npgs = page_get_pagecnt(new_szc);
2117 
2118 	while (pplist) {
2119 		pp = pplist;
2120 
2121 		ASSERT(pp->p_szc == cur_szc);
2122 
2123 		/*
2124 		 * We either break it up into PAGESIZE pages or larger.
2125 		 */
2126 		if (npgs == 1) {	/* PAGESIZE case */
2127 			mach_page_sub(&pplist, pp);
2128 			ASSERT(pp->p_szc == cur_szc);
2129 			ASSERT(new_szc == 0);
2130 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2131 			pp->p_szc = new_szc;
2132 			bin = PP_2_BIN(pp);
2133 			if ((bin == color) && (flags == PC_ALLOC) &&
2134 			    (ret_pp == NULL) &&
2135 			    page_trylock_cons(pp, SE_EXCL)) {
2136 				ret_pp = pp;
2137 			} else {
2138 				mtype = PP_2_MTYPE(pp);
2139 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2140 				    mtype), pp);
2141 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2142 			}
2143 		} else {
2144 
2145 			/*
2146 			 * Break down into smaller lists of pages.
2147 			 */
2148 			page_list_break(&pplist, &npplist, npgs);
2149 
2150 			pp = pplist;
2151 			n = npgs;
2152 			while (n--) {
2153 				ASSERT(pp->p_szc == cur_szc);
2154 				pp->p_szc = new_szc;
2155 				pp = pp->p_next;
2156 			}
2157 
2158 			CHK_LPG(pplist, new_szc);
2159 
2160 			bin = PP_2_BIN(pplist);
2161 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2162 			if ((bin == color) && (flags == PC_ALLOC) &&
2163 			    (ret_pp == NULL) &&
2164 			    page_trylock_cons(pp, SE_EXCL)) {
2165 				ret_pp = pp;
2166 			} else {
2167 				mtype = PP_2_MTYPE(pp);
2168 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2169 				    bin, mtype), pplist);
2170 
2171 				page_ctr_add(mnode, mtype, pplist,
2172 				    PG_FREE_LIST);
2173 			}
2174 			pplist = npplist;
2175 		}
2176 	}
2177 	return (ret_pp);
2178 }
2179 
2180 int mpss_coalesce_disable = 0;
2181 
2182 /*
2183  * Coalesce free pages into a page of the given szc and color if possible.
2184  * Return the pointer to the page created, otherwise, return NULL.
2185  *
2186  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2187  */
2188 page_t *
2189 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2190     int mtype, pfn_t pfnhi)
2191 {
2192 	int 	r = szc;		/* region size */
2193 	int	mrange;
2194 	uint_t 	full, bin, color_mask, wrap = 0;
2195 	pfn_t	pfnum, lo, hi;
2196 	size_t	len, idx, idx0;
2197 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
2198 	page_t	*ret_pp;
2199 	MEM_NODE_ITERATOR_DECL(it);
2200 #if defined(__sparc)
2201 	pfn_t pfnum0, nlo, nhi;
2202 #endif
2203 
2204 	if (mpss_coalesce_disable) {
2205 		ASSERT(szc < MMU_PAGE_SIZES);
2206 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2207 		return (NULL);
2208 	}
2209 
2210 	ASSERT(szc < mmu_page_sizes);
2211 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2212 	ASSERT(ceq_mask <= color_mask);
2213 	ASSERT(color <= color_mask);
2214 	color &= ceq_mask;
2215 
2216 	/* Prevent page_counters dynamic memory from being freed */
2217 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2218 
2219 	mrange = MTYPE_2_MRANGE(mnode, mtype);
2220 	ASSERT(mrange < mnode_nranges[mnode]);
2221 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2222 
2223 	/* get pfn range for mtype */
2224 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
2225 #if defined(__sparc)
2226 	lo = PAGE_COUNTERS_BASE(mnode, r);
2227 	hi = IDX_TO_PNUM(mnode, r, len);
2228 #else
2229 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2230 	hi++;
2231 #endif
2232 
2233 	/* use lower limit if given */
2234 	if (pfnhi != PFNNULL && pfnhi < hi)
2235 		hi = pfnhi;
2236 
2237 	/* round to szcpgcnt boundaries */
2238 	lo = P2ROUNDUP(lo, szcpgcnt);
2239 	MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
2240 	ASSERT(lo != (pfn_t)-1);
2241 	hi = hi & ~(szcpgcnt - 1);
2242 
2243 	/* set lo to the closest pfn of the right color */
2244 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2245 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2246 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2247 		    &it);
2248 	}
2249 
2250 	if (hi <= lo) {
2251 		rw_exit(&page_ctrs_rwlock[mnode]);
2252 		return (NULL);
2253 	}
2254 
2255 	full = FULL_REGION_CNT(r);
2256 
2257 	/* calculate the number of page candidates and initial search index */
2258 	bin = color;
2259 	idx0 = (size_t)(-1);
2260 	do {
2261 		pgcnt_t acand;
2262 
2263 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2264 		if (acand) {
2265 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2266 			    r, bin, mrange);
2267 			idx0 = MIN(idx0, idx);
2268 			cands += acand;
2269 		}
2270 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2271 	} while (bin != color);
2272 
2273 	if (cands == 0) {
2274 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2275 		rw_exit(&page_ctrs_rwlock[mnode]);
2276 		return (NULL);
2277 	}
2278 
2279 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
2280 	if (pfnum < lo || pfnum >= hi) {
2281 		pfnum = lo;
2282 	} else {
2283 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2284 		if (pfnum == (pfn_t)-1) {
2285 			pfnum = lo;
2286 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2287 			ASSERT(pfnum != (pfn_t)-1);
2288 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2289 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2290 			/* invalid color, get the closest correct pfn */
2291 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2292 			    color_mask, &it);
2293 			if (pfnum >= hi) {
2294 				pfnum = lo;
2295 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2296 			}
2297 		}
2298 	}
2299 
2300 	/* set starting index */
2301 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2302 	ASSERT(idx0 < len);
2303 
2304 #if defined(__sparc)
2305 	pfnum0 = pfnum;		/* page corresponding to idx0 */
2306 	nhi = 0;		/* search kcage ranges */
2307 #endif
2308 
2309 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2310 
2311 #if defined(__sparc)
2312 		/*
2313 		 * Find lowest intersection of kcage ranges and mnode.
2314 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
2315 		 */
2316 		if (nhi <= pfnum) {
2317 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2318 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2319 				goto wrapit;
2320 
2321 			/* jump to the next page in the range */
2322 			if (pfnum < nlo) {
2323 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2324 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2325 				idx = PNUM_TO_IDX(mnode, r, pfnum);
2326 				if (idx >= len || pfnum >= hi)
2327 					goto wrapit;
2328 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2329 				    ceq_mask)
2330 					goto next;
2331 				if (interleaved_mnodes &&
2332 				    PFN_2_MEM_NODE(pfnum) != mnode)
2333 					goto next;
2334 			}
2335 		}
2336 #endif
2337 
2338 		if (PAGE_COUNTERS(mnode, r, idx) != full)
2339 			goto next;
2340 
2341 		/*
2342 		 * RFE: For performance maybe we can do something less
2343 		 *	brutal than locking the entire freelist. So far
2344 		 * 	this doesn't seem to be a performance problem?
2345 		 */
2346 		page_freelist_lock(mnode);
2347 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
2348 			ret_pp =
2349 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2350 			if (ret_pp != NULL) {
2351 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2352 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2353 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2354 				page_freelist_unlock(mnode);
2355 				rw_exit(&page_ctrs_rwlock[mnode]);
2356 #if defined(__sparc)
2357 				if (PP_ISNORELOC(ret_pp)) {
2358 					pgcnt_t npgs;
2359 
2360 					npgs = page_get_pagecnt(ret_pp->p_szc);
2361 					kcage_freemem_sub(npgs);
2362 				}
2363 #endif
2364 				return (ret_pp);
2365 			}
2366 		} else {
2367 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2368 		}
2369 
2370 		page_freelist_unlock(mnode);
2371 		/*
2372 		 * No point looking for another page if we've
2373 		 * already tried all of the ones that
2374 		 * page_ctr_cands indicated.  Stash off where we left
2375 		 * off.
2376 		 * Note: this is not exact since we don't hold the
2377 		 * page_freelist_locks before we initially get the
2378 		 * value of cands for performance reasons, but should
2379 		 * be a decent approximation.
2380 		 */
2381 		if (--cands == 0) {
2382 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2383 			    idx;
2384 			break;
2385 		}
2386 next:
2387 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2388 		    color_mask, &it);
2389 		idx = PNUM_TO_IDX(mnode, r, pfnum);
2390 		if (idx >= len || pfnum >= hi) {
2391 wrapit:
2392 			pfnum = lo;
2393 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2394 			idx = PNUM_TO_IDX(mnode, r, pfnum);
2395 			wrap++;
2396 #if defined(__sparc)
2397 			nhi = 0;	/* search kcage ranges */
2398 #endif
2399 		}
2400 	}
2401 
2402 	rw_exit(&page_ctrs_rwlock[mnode]);
2403 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2404 	return (NULL);
2405 }
2406 
2407 /*
2408  * For the given mnode, promote as many small pages to large pages as possible.
2409  * mnode can be -1, which means do them all
2410  */
2411 void
2412 page_freelist_coalesce_all(int mnode)
2413 {
2414 	int 	r;		/* region size */
2415 	int 	idx, full;
2416 	size_t	len;
2417 	int doall = interleaved_mnodes || mnode < 0;
2418 	int mlo = doall ? 0 : mnode;
2419 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2420 
2421 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2422 
2423 	if (mpss_coalesce_disable) {
2424 		return;
2425 	}
2426 
2427 	/*
2428 	 * Lock the entire freelist and coalesce what we can.
2429 	 *
2430 	 * Always promote to the largest page possible
2431 	 * first to reduce the number of page promotions.
2432 	 */
2433 	for (mnode = mlo; mnode < mhi; mnode++) {
2434 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2435 		page_freelist_lock(mnode);
2436 	}
2437 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2438 		for (mnode = mlo; mnode < mhi; mnode++) {
2439 			pgcnt_t cands = 0;
2440 			int mrange, nranges = mnode_nranges[mnode];
2441 
2442 			for (mrange = 0; mrange < nranges; mrange++) {
2443 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2444 				if (cands != 0)
2445 					break;
2446 			}
2447 			if (cands == 0) {
2448 				VM_STAT_ADD(vmm_vmstats.
2449 				    page_ctrs_cands_skip_all);
2450 				continue;
2451 			}
2452 
2453 			full = FULL_REGION_CNT(r);
2454 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2455 
2456 			for (idx = 0; idx < len; idx++) {
2457 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2458 					pfn_t pfnum =
2459 					    IDX_TO_PNUM(mnode, r, idx);
2460 					int tmnode = interleaved_mnodes ?
2461 					    PFN_2_MEM_NODE(pfnum) : mnode;
2462 
2463 					ASSERT(pfnum >=
2464 					    mem_node_config[tmnode].physbase &&
2465 					    pfnum <
2466 					    mem_node_config[tmnode].physmax);
2467 
2468 					(void) page_promote(tmnode,
2469 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2470 				}
2471 			}
2472 			/* shared hpm_counters covers all mnodes, so we quit */
2473 			if (interleaved_mnodes)
2474 				break;
2475 		}
2476 	}
2477 	for (mnode = mlo; mnode < mhi; mnode++) {
2478 		page_freelist_unlock(mnode);
2479 		rw_exit(&page_ctrs_rwlock[mnode]);
2480 	}
2481 }
2482 
2483 /*
2484  * This is where all policies for moving pages around
2485  * to different page size free lists are implemented.
2486  * Returns 1 on success, 0 on failure.
2487  *
2488  * So far these are the priorities for this algorithm in descending
2489  * order:
2490  *
2491  *	1) When servicing a request try to do so with a free page
2492  *	   from next size up. Helps defer fragmentation as long
2493  *	   as possible.
2494  *
2495  *	2) Page coalesce on demand. Only when a freelist
2496  *	   larger than PAGESIZE is empty and step 1
2497  *	   will not work since all larger size lists are
2498  *	   also empty.
2499  *
2500  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2501  */
2502 
2503 page_t *
2504 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2505     pfn_t pfnhi, page_list_walker_t *plw)
2506 {
2507 	uchar_t nszc = szc + 1;
2508 	uint_t 	bin, sbin, bin_prev;
2509 	page_t	*pp, *firstpp;
2510 	page_t	*ret_pp = NULL;
2511 	uint_t  color_mask;
2512 
2513 	if (nszc == mmu_page_sizes)
2514 		return (NULL);
2515 
2516 	ASSERT(nszc < mmu_page_sizes);
2517 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2518 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2519 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2520 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2521 
2522 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2523 	/*
2524 	 * First try to break up a larger page to fill current size freelist.
2525 	 */
2526 	while (plw->plw_bins[nszc] != 0) {
2527 
2528 		ASSERT(nszc < mmu_page_sizes);
2529 
2530 		/*
2531 		 * If page found then demote it.
2532 		 */
2533 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2534 			page_freelist_lock(mnode);
2535 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2536 
2537 			/*
2538 			 * If pfnhi is not PFNNULL, look for large page below
2539 			 * pfnhi. PFNNULL signifies no pfn requirement.
2540 			 */
2541 			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
2542 				do {
2543 					pp = pp->p_vpnext;
2544 					if (pp == firstpp) {
2545 						pp = NULL;
2546 						break;
2547 					}
2548 				} while (pp->p_pagenum >= pfnhi);
2549 			}
2550 			if (pp) {
2551 				uint_t ccolor = page_correct_color(szc, nszc,
2552 				    color, bin, plw->plw_ceq_mask[szc]);
2553 
2554 				ASSERT(pp->p_szc == nszc);
2555 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2556 				ret_pp = page_demote(mnode, pp->p_pagenum,
2557 				    pp->p_szc, szc, ccolor, PC_ALLOC);
2558 				if (ret_pp) {
2559 					page_freelist_unlock(mnode);
2560 #if defined(__sparc)
2561 					if (PP_ISNORELOC(ret_pp)) {
2562 						pgcnt_t npgs;
2563 
2564 						npgs = page_get_pagecnt(
2565 						    ret_pp->p_szc);
2566 						kcage_freemem_sub(npgs);
2567 					}
2568 #endif
2569 					return (ret_pp);
2570 				}
2571 			}
2572 			page_freelist_unlock(mnode);
2573 		}
2574 
2575 		/* loop through next size bins */
2576 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2577 		plw->plw_bins[nszc]--;
2578 
2579 		if (bin == sbin) {
2580 			uchar_t nnszc = nszc + 1;
2581 
2582 			/* we are done with this page size - check next */
2583 			if (plw->plw_bins[nnszc] == 0)
2584 				/* we have already checked next size bins */
2585 				break;
2586 
2587 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2588 			if (bin_prev != INVALID_COLOR) {
2589 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2590 				if (!((bin ^ bin_prev) &
2591 				    plw->plw_ceq_mask[nnszc]))
2592 					break;
2593 			}
2594 			ASSERT(nnszc < mmu_page_sizes);
2595 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2596 			nszc = nnszc;
2597 			ASSERT(nszc < mmu_page_sizes);
2598 		}
2599 	}
2600 
2601 	return (ret_pp);
2602 }
2603 
2604 /*
2605  * Helper routine used only by the freelist code to lock
2606  * a page. If the page is a large page then it succeeds in
2607  * locking all the constituent pages or none at all.
2608  * Returns 1 on success, 0 on failure.
2609  */
2610 static int
2611 page_trylock_cons(page_t *pp, se_t se)
2612 {
2613 	page_t	*tpp, *first_pp = pp;
2614 
2615 	/*
2616 	 * Fail if can't lock first or only page.
2617 	 */
2618 	if (!page_trylock(pp, se)) {
2619 		return (0);
2620 	}
2621 
2622 	/*
2623 	 * PAGESIZE: common case.
2624 	 */
2625 	if (pp->p_szc == 0) {
2626 		return (1);
2627 	}
2628 
2629 	/*
2630 	 * Large page case.
2631 	 */
2632 	tpp = pp->p_next;
2633 	while (tpp != pp) {
2634 		if (!page_trylock(tpp, se)) {
2635 			/*
2636 			 * On failure unlock what we have locked so far.
2637 			 * We want to avoid attempting to capture these
2638 			 * pages as the pcm mutex may be held which could
2639 			 * lead to a recursive mutex panic.
2640 			 */
2641 			while (first_pp != tpp) {
2642 				page_unlock_nocapture(first_pp);
2643 				first_pp = first_pp->p_next;
2644 			}
2645 			return (0);
2646 		}
2647 		tpp = tpp->p_next;
2648 	}
2649 	return (1);
2650 }
2651 
2652 /*
2653  * init context for walking page lists
2654  * Called when a page of the given szc is unavailable. Sets markers
2655  * for the beginning of the search to detect when the search has
2656  * completed a full cycle. Sets flags for splitting larger pages
2657  * and coalescing smaller pages. Page walking proceeds until a page
2658  * of the desired equivalent color is found.
2659  */
2660 void
2661 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2662     int use_ceq, page_list_walker_t *plw)
2663 {
2664 	uint_t  nszc, ceq_mask, colors;
2665 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2666 
2667 	ASSERT(szc < mmu_page_sizes);
2668 	colors = PAGE_GET_PAGECOLORS(szc);
2669 
2670 	plw->plw_colors = colors;
2671 	plw->plw_color_mask = colors - 1;
2672 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2673 	plw->plw_bin_split_prev = bin;
2674 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2675 
2676 	/*
2677 	 * if vac aliasing is possible make sure lower order color
2678 	 * bits are never ignored
2679 	 */
2680 	if (vac_colors > 1)
2681 		ceq &= 0xf0;
2682 
2683 	/*
2684 	 * calculate the number of non-equivalent colors and
2685 	 * color equivalency mask
2686 	 */
2687 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2688 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2689 	ASSERT(plw->plw_ceq_dif > 0);
2690 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
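	/*
	 * Worked example (hypothetical values): with vac_colors == 1,
	 * colors == 64 and ceq == 0x21, plw_ceq_dif = 64 >> (2 + 1) = 8
	 * and plw_ceq_mask[szc] = (8 - 1) << 1 = 0x0e, so only color
	 * bits 1 through 3 have to match for two bins to be treated as
	 * equivalent.
	 */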
2691 
2692 	if (flags & PG_MATCH_COLOR) {
2693 		if (cpu_page_colors <  0) {
2694 			/*
2695 			 * this is a heterogeneous machine with different CPUs
2696 			 * having different size e$ (not supported for ni2/rock)
2697 			 */
2698 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2699 			cpucolors = MAX(cpucolors, 1);
2700 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2701 			plw->plw_ceq_mask[szc] =
2702 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2703 		}
2704 		plw->plw_ceq_dif = 1;
2705 	}
2706 
2707 	/* we can split pages in the freelist, but not the cachelist */
2708 	if (can_split) {
2709 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2710 
2711 		/* set next szc color masks and number of free list bins */
2712 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2713 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2714 			    plw->plw_ceq_mask[szc]);
2715 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2716 		}
2717 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2718 		plw->plw_bins[nszc] = 0;
2719 
2720 	} else {
2721 		ASSERT(szc == 0);
2722 		plw->plw_do_split = 0;
2723 		plw->plw_bins[1] = 0;
2724 		plw->plw_ceq_mask[1] = INVALID_MASK;
2725 	}
2726 }
2727 
2728 /*
2729  * set mark to flag where next split should occur
2730  */
2731 #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
2732 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
2733 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
2734 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
2735 	plw->plw_split_next =						     \
2736 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
2737 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2738 		plw->plw_split_next =					     \
2739 		INC_MASKED(plw->plw_split_next,				     \
2740 		    neq_mask, plw->plw_color_mask);			     \
2741 	}								     \
2742 }
2743 
2744 uint_t
2745 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2746 {
2747 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2748 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2749 	uchar_t nszc = szc + 1;
2750 
2751 	nbin = ADD_MASKED(bin,
2752 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2753 
2754 	if (plw->plw_do_split) {
2755 		plw->plw_bin_split_prev = bin;
2756 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2757 		plw->plw_do_split = 0;
2758 	}
2759 
2760 	if (szc == 0) {
2761 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2762 			if (nbin == plw->plw_bin0 &&
2763 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2764 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2765 				    neq_mask, plw->plw_color_mask);
2766 				plw->plw_bin_split_prev = plw->plw_bin0;
2767 			}
2768 
2769 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2770 				plw->plw_bin_marker =
2771 				    nbin = INC_MASKED(nbin, neq_mask,
2772 				    plw->plw_color_mask);
2773 				plw->plw_bin_split_prev = plw->plw_bin0;
2774 				/*
2775 				 * large pages all have the same vac color
2776 				 * so by now we should be done with next
2777 				 * size page splitting process
2778 				 */
2779 				ASSERT(plw->plw_bins[1] == 0);
2780 				plw->plw_do_split = 0;
2781 				return (nbin);
2782 			}
2783 
2784 		} else {
2785 			uint_t bin_jump = (vac_colors == 1) ?
2786 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2787 
2788 			bin_jump &= ~(vac_colors - 1);
2789 
2790 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2791 			    plw->plw_color_mask);
2792 
2793 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2794 
2795 				plw->plw_bin_marker = nbin = nbin0;
2796 
2797 				if (plw->plw_bins[nszc] != 0) {
2798 					/*
2799 					 * check if next page size bin is the
2800 					 * same as the next page size bin for
2801 					 * bin0
2802 					 */
2803 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2804 					    nbin);
2805 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2806 					    plw->plw_bin0);
2807 
2808 					if ((bin0_nsz ^ nbin_nsz) &
2809 					    plw->plw_ceq_mask[nszc])
2810 						plw->plw_do_split = 1;
2811 				}
2812 				return (nbin);
2813 			}
2814 		}
2815 	}
2816 
2817 	if (plw->plw_bins[nszc] != 0) {
2818 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2819 		if (!((plw->plw_split_next ^ nbin_nsz) &
2820 		    plw->plw_ceq_mask[nszc]))
2821 			plw->plw_do_split = 1;
2822 	}
2823 
2824 	return (nbin);
2825 }
2826 
2827 page_t *
2828 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2829     uint_t flags)
2830 {
2831 	kmutex_t		*pcm;
2832 	page_t			*pp, *first_pp;
2833 	uint_t			sbin;
2834 	int			plw_initialized;
2835 	page_list_walker_t	plw;
2836 
2837 	ASSERT(szc < mmu_page_sizes);
2838 
2839 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2840 
2841 	MTYPE_START(mnode, mtype, flags);
2842 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2843 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2844 		return (NULL);
2845 	}
2846 try_again:
2847 
2848 	plw_initialized = 0;
2849 	plw.plw_ceq_dif = 1;
2850 
2851 	/*
2852 	 * Only hold one freelist lock at a time, that way we
2853 	 * can start anywhere and not have to worry about lock
2854 	 * ordering.
2855 	 */
2856 	for (plw.plw_count = 0;
2857 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2858 		sbin = bin;
2859 		do {
2860 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2861 				goto bin_empty_1;
2862 
2863 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2864 			mutex_enter(pcm);
2865 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2866 			if (pp == NULL)
2867 				goto bin_empty_0;
2868 
2869 			/*
2870 			 * These were set before the page
2871 			 * was put on the free list,
2872 			 * they must still be set.
2873 			 */
2874 			ASSERT(PP_ISFREE(pp));
2875 			ASSERT(PP_ISAGED(pp));
2876 			ASSERT(pp->p_vnode == NULL);
2877 			ASSERT(pp->p_hash == NULL);
2878 			ASSERT(pp->p_offset == (u_offset_t)-1);
2879 			ASSERT(pp->p_szc == szc);
2880 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2881 
2882 			/*
2883 			 * Walk down the hash chain.
2884 			 * 8k pages are linked on p_next
2885 			 * and p_prev fields. Large pages
2886 			 * are a contiguous group of
2887 			 * constituent pages linked together
2888 			 * on their p_next and p_prev fields.
2889 			 * The large pages are linked together
2890 			 * on the hash chain using the p_vpnext and
2891 			 * p_vpprev fields of the base constituent
2892 			 * page of each large page.
2893 			 */
2894 			first_pp = pp;
2895 			while (!page_trylock_cons(pp, SE_EXCL)) {
2896 				if (szc == 0) {
2897 					pp = pp->p_next;
2898 				} else {
2899 					pp = pp->p_vpnext;
2900 				}
2901 
2902 				ASSERT(PP_ISFREE(pp));
2903 				ASSERT(PP_ISAGED(pp));
2904 				ASSERT(pp->p_vnode == NULL);
2905 				ASSERT(pp->p_hash == NULL);
2906 				ASSERT(pp->p_offset == (u_offset_t)-1);
2907 				ASSERT(pp->p_szc == szc);
2908 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2909 
2910 				if (pp == first_pp)
2911 					goto bin_empty_0;
2912 			}
2913 
2914 			ASSERT(pp != NULL);
2915 			ASSERT(mtype == PP_2_MTYPE(pp));
2916 			ASSERT(pp->p_szc == szc);
2917 			if (szc == 0) {
2918 				page_sub(&PAGE_FREELISTS(mnode,
2919 				    szc, bin, mtype), pp);
2920 			} else {
2921 				page_vpsub(&PAGE_FREELISTS(mnode,
2922 				    szc, bin, mtype), pp);
2923 				CHK_LPG(pp, szc);
2924 			}
2925 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2926 
2927 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2928 				panic("free page is not. pp %p", (void *)pp);
2929 			mutex_exit(pcm);
2930 
2931 #if defined(__sparc)
2932 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2933 			    (flags & PG_NORELOC) == 0);
2934 
2935 			if (PP_ISNORELOC(pp))
2936 				kcage_freemem_sub(page_get_pagecnt(szc));
2937 #endif
2938 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2939 			return (pp);
2940 
2941 bin_empty_0:
2942 			mutex_exit(pcm);
2943 bin_empty_1:
2944 			if (plw_initialized == 0) {
2945 				page_list_walk_init(szc, flags, bin, 1, 1,
2946 				    &plw);
2947 				plw_initialized = 1;
2948 				ASSERT(plw.plw_colors <=
2949 				    PAGE_GET_PAGECOLORS(szc));
2950 				ASSERT(plw.plw_colors > 0);
2951 				ASSERT((plw.plw_colors &
2952 				    (plw.plw_colors - 1)) == 0);
2953 				ASSERT(bin < plw.plw_colors);
2954 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
2955 			}
2956 			/* calculate the next bin with equivalent color */
2957 			bin = ADD_MASKED(bin, plw.plw_bin_step,
2958 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
2959 		} while (sbin != bin);
2960 
2961 		/*
2962 		 * The matching color bins are all empty. Try to
2963 		 * satisfy the request by breaking up or coalescing
2964 		 * pages from a different size freelist of the correct
2965 		 * color that satisfies the ORIGINAL color requested.
2966 		 * If that fails then try pages of the same size but
2967 		 * different colors assuming we are not called with
2968 		 * PG_MATCH_COLOR.
2969 		 */
2970 		if (plw.plw_do_split &&
2971 		    (pp = page_freelist_split(szc, bin, mnode,
2972 		    mtype, PFNNULL, &plw)) != NULL)
2973 			return (pp);
2974 
2975 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
2976 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
2977 			return (pp);
2978 
2979 		if (plw.plw_ceq_dif > 1)
2980 			bin = page_list_walk_next_bin(szc, bin, &plw);
2981 	}
2982 
2983 	/* if allowed, cycle through additional mtypes */
2984 	MTYPE_NEXT(mnode, mtype, flags);
2985 	if (mtype >= 0)
2986 		goto try_again;
2987 
2988 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2989 
2990 	return (NULL);
2991 }
2992 
2993 /*
2994  * Returns the count of free pages for 'pp' with size code 'szc'.
2995  * Note: This function does not return an exact value as the page freelist
2996  * locks are not held and thus the values in the page_counters may be
2997  * changing as we walk through the data.
2998  */
2999 static int
3000 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3001 {
3002 	pgcnt_t	pgfree;
3003 	pgcnt_t cnt;
3004 	ssize_t	r = szc;	/* region size */
3005 	ssize_t	idx;
3006 	int	i;
3007 	int	full, range;
3008 
3009 	/* Make sure pagenum passed in is aligned properly */
3010 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3011 	ASSERT(szc > 0);
3012 
3013 	/* Prevent page_counters dynamic memory from being freed */
3014 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3015 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3016 	cnt = PAGE_COUNTERS(mnode, r, idx);
3017 	pgfree = cnt << PNUM_SHIFT(r - 1);
3018 	range = FULL_REGION_CNT(szc);
3019 
3020 	/* Check for completely full region */
3021 	if (cnt == range) {
3022 		rw_exit(&page_ctrs_rwlock[mnode]);
3023 		return (pgfree);
3024 	}
3025 
3026 	while (--r > 0) {
3027 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3028 		full = FULL_REGION_CNT(r);
3029 		for (i = 0; i < range; i++, idx++) {
3030 			cnt = PAGE_COUNTERS(mnode, r, idx);
3031 			/*
3032 			 * If cnt here is full, that means we have already
3033 			 * accounted for these pages earlier.
3034 			 */
3035 			if (cnt != full) {
3036 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3037 			}
3038 		}
3039 		range *= full;
3040 	}
3041 	rw_exit(&page_ctrs_rwlock[mnode]);
3042 	return (pgfree);
3043 }
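
/*
 * Illustrative reading of page_freecnt() above: the top-level counter
 * contributes its count of fully free (szc - 1) regions converted to
 * PAGESIZE pages; each smaller region size r then adds, for every size-r
 * region that is not itself completely free (those were already counted
 * one level up), the pages of its fully free (r - 1) sub-regions.
 */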
3044 
3045 /*
3046  * Called from page_geti_contig_pages to exclusively lock constituent pages
3047  * starting from 'spp' for page size code 'szc'.
3048  *
3049  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3050  * region needs to be greater than or equal to the threshold.
3051  */
3052 static int
3053 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3054 {
3055 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3056 	pgcnt_t pgfree, i;
3057 	page_t *pp;
3058 
3059 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3060 
3061 
3062 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3063 		goto skipptcpcheck;
3064 	/*
3065 	 * check if there are sufficient free pages available before attempting
3066 	 * to trylock. Count is approximate as page counters can change.
3067 	 */
3068 	pgfree = page_freecnt(mnode, spp, szc);
3069 
3070 	/* attempt to trylock if there are sufficient already free pages */
3071 	if (pgfree < pgcnt/ptcpthreshold) {
3072 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3073 		return (0);
3074 	}
3075 
3076 skipptcpcheck:
3077 
3078 	for (i = 0; i < pgcnt; i++) {
3079 		pp = &spp[i];
3080 		if (!page_trylock(pp, SE_EXCL)) {
3081 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3082 			while (--i != (pgcnt_t)-1) {
3083 				pp = &spp[i];
3084 				ASSERT(PAGE_EXCL(pp));
3085 				page_unlock_nocapture(pp);
3086 			}
3087 			return (0);
3088 		}
3089 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3090 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3091 		    !PP_ISFREE(pp)) {
3092 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3093 			ASSERT(i == 0);
3094 			page_unlock_nocapture(pp);
3095 			return (0);
3096 		}
3097 		if (PP_ISNORELOC(pp)) {
3098 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3099 			while (i != (pgcnt_t)-1) {
3100 				pp = &spp[i];
3101 				ASSERT(PAGE_EXCL(pp));
3102 				page_unlock_nocapture(pp);
3103 				i--;
3104 			}
3105 			return (0);
3106 		}
3107 	}
3108 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3109 	return (1);
3110 }
3111 
3112 /*
3113  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3114  * of 'szc' constituent pages that had been locked exclusively previously.
3115  * Will attempt to relocate constituent pages in use.
3116  */
3117 static page_t *
3118 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3119 {
3120 	spgcnt_t pgcnt, npgs, i;
3121 	page_t *targpp, *rpp, *hpp;
3122 	page_t *replpp = NULL;
3123 	page_t *pplist = NULL;
3124 
3125 	ASSERT(pp != NULL);
3126 
3127 	pgcnt = page_get_pagecnt(szc);
3128 	while (pgcnt) {
3129 		ASSERT(PAGE_EXCL(pp));
3130 		ASSERT(!PP_ISNORELOC(pp));
3131 		if (PP_ISFREE(pp)) {
3132 			/*
3133 			 * If this is a PG_FREE_LIST page then its
3134 			 * size code can change underneath us due to
3135 			 * page promotion or demotion. As an optimization
3136 			 * use page_list_sub_pages() instead of
3137 			 * page_list_sub().
3138 			 */
3139 			if (PP_ISAGED(pp)) {
3140 				page_list_sub_pages(pp, szc);
3141 				if (pp->p_szc == szc) {
3142 					return (pp);
3143 				}
3144 				ASSERT(pp->p_szc < szc);
3145 				npgs = page_get_pagecnt(pp->p_szc);
3146 				hpp = pp;
3147 				for (i = 0; i < npgs; i++, pp++) {
3148 					pp->p_szc = szc;
3149 				}
3150 				page_list_concat(&pplist, &hpp);
3151 				pgcnt -= npgs;
3152 				continue;
3153 			}
3154 			ASSERT(!PP_ISAGED(pp));
3155 			ASSERT(pp->p_szc == 0);
3156 			page_list_sub(pp, PG_CACHE_LIST);
3157 			page_hashout(pp, NULL);
3158 			PP_SETAGED(pp);
3159 			pp->p_szc = szc;
3160 			page_list_concat(&pplist, &pp);
3161 			pp++;
3162 			pgcnt--;
3163 			continue;
3164 		}
3165 		npgs = page_get_pagecnt(pp->p_szc);
3166 
3167 		/*
3168 		 * page_create_wait() freemem accounting is done by the caller
3169 		 * of page_get_freelist(), so it is not necessary to call it
3170 		 * prior to calling page_get_replacement_page().
3171 		 *
3172 		 * page_get_replacement_page can call page_get_contig_pages
3173 		 * to acquire a large page (szc > 0); the replacement must be
3174 		 * smaller than the contig page size to avoid looping, or
3175 		 * szc == 0 and PGI_PGCPSZC0 is set.
3176 		 */
3177 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3178 			replpp = page_get_replacement_page(pp, NULL, 0);
3179 			if (replpp) {
3180 				npgs = page_get_pagecnt(pp->p_szc);
3181 				ASSERT(npgs <= pgcnt);
3182 				targpp = pp;
3183 			}
3184 		}
3185 
3186 		/*
3187 		 * If replacement is NULL or do_page_relocate fails, fail
3188 		 * coalescing of pages.
3189 		 */
3190 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3191 		    &npgs, NULL) != 0)) {
3192 			/*
3193 			 * Unlock un-processed target list
3194 			 */
3195 			while (pgcnt--) {
3196 				ASSERT(PAGE_EXCL(pp));
3197 				page_unlock_nocapture(pp);
3198 				pp++;
3199 			}
3200 			/*
3201 			 * Free the processed target list.
3202 			 */
3203 			while (pplist) {
3204 				pp = pplist;
3205 				page_sub(&pplist, pp);
3206 				ASSERT(PAGE_EXCL(pp));
3207 				ASSERT(pp->p_szc == szc);
3208 				ASSERT(PP_ISFREE(pp));
3209 				ASSERT(PP_ISAGED(pp));
3210 				pp->p_szc = 0;
3211 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3212 				page_unlock_nocapture(pp);
3213 			}
3214 
3215 			if (replpp != NULL)
3216 				page_free_replacement_page(replpp);
3217 
3218 			return (NULL);
3219 		}
3220 		ASSERT(pp == targpp);
3221 
3222 		/* LINTED */
3223 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3224 
3225 		pp += npgs;
3226 		pgcnt -= npgs;
3227 
3228 		while (npgs--) {
3229 			ASSERT(PAGE_EXCL(targpp));
3230 			ASSERT(!PP_ISFREE(targpp));
3231 			ASSERT(!PP_ISNORELOC(targpp));
3232 			PP_SETFREE(targpp);
3233 			ASSERT(PP_ISAGED(targpp));
3234 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3235 			    (flags & PGI_PGCPSZC0)));
3236 			targpp->p_szc = szc;
3237 			targpp = targpp->p_next;
3238 
3239 			rpp = replpp;
3240 			ASSERT(rpp != NULL);
3241 			page_sub(&replpp, rpp);
3242 			ASSERT(PAGE_EXCL(rpp));
3243 			ASSERT(!PP_ISFREE(rpp));
3244 			page_unlock_nocapture(rpp);
3245 		}
3246 		ASSERT(targpp == hpp);
3247 		ASSERT(replpp == NULL);
3248 		page_list_concat(&pplist, &targpp);
3249 	}
3250 	CHK_LPG(pplist, szc);
3251 	return (pplist);
3252 }
3253 
3254 /*
3255  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3256  * of 0 means nothing is left after the trim.
3257  */
3258 int
3259 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3260 {
3261 	pfn_t	kcagepfn;
3262 	int	decr;
3263 	int	rc = 0;
3264 
3265 	if (PP_ISNORELOC(mseg->pages)) {
3266 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3267 
3268 			/* lower part of this mseg inside kernel cage */
3269 			decr = kcage_current_pfn(&kcagepfn);
3270 
3271 			/* kernel cage may have transitioned past mseg */
3272 			if (kcagepfn >= mseg->pages_base &&
3273 			    kcagepfn < mseg->pages_end) {
3274 				ASSERT(decr == 0);
3275 				*lo = kcagepfn;
3276 				*hi = MIN(pfnhi,
3277 				    (mseg->pages_end - 1));
3278 				rc = 1;
3279 			}
3280 		}
3281 		/* else entire mseg in the cage */
3282 	} else {
3283 		if (PP_ISNORELOC(mseg->epages - 1)) {
3284 
3285 			/* upper part of this mseg inside kernel cage */
3286 			decr = kcage_current_pfn(&kcagepfn);
3287 
3288 			/* kernel cage may have transitioned past mseg */
3289 			if (kcagepfn >= mseg->pages_base &&
3290 			    kcagepfn < mseg->pages_end) {
3291 				ASSERT(decr);
3292 				*hi = kcagepfn;
3293 				*lo = MAX(pfnlo, mseg->pages_base);
3294 				rc = 1;
3295 			}
3296 		} else {
3297 			/* entire mseg outside of kernel cage */
3298 			*lo = MAX(pfnlo, mseg->pages_base);
3299 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3300 			rc = 1;
3301 		}
3302 	}
3303 	return (rc);
3304 }
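
/*
 * Illustrative example (hypothetical pfns): if the low pages of a memseg
 * are PP_ISNORELOC but its high pages are not, and the cage currently
 * ends at kcagepfn inside that memseg, the caller is handed the range
 * [kcagepfn, MIN(pfnhi, pages_end - 1)], i.e. only the portion of the
 * memseg above the kernel cage, clipped to the requested pfnhi.
 */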
3305 
3306 /*
3307  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3308  * page with size code 'szc'. Claiming such a page requires acquiring
3309  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3310  * relocating pages in use and concatenating these constituent pages into a
3311  * large page.
3312  *
3313  * The page lists do not have such a large page and page_freelist_split has
3314  * already failed to demote larger pages and/or coalesce smaller free pages.
3315  *
3316  * 'flags' may specify PG_MATCH_COLOR which would limit the search of large
3317  * pages with the same color as 'bin'.
3318  *
3319  * 'pfnflag' specifies the subset of the pfn range to search.
3320  */
3321 
3322 static page_t *
3323 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3324     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3325 {
3326 	struct memseg *mseg;
3327 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3328 	pgcnt_t szcpgmask = szcpgcnt - 1;
3329 	pfn_t	randpfn;
3330 	page_t *pp, *randpp, *endpp;
3331 	uint_t colors, ceq_mask;
3332 	/* LINTED : set but not used in function */
3333 	uint_t color_mask;
3334 	pfn_t hi, lo;
3335 	uint_t skip;
3336 	MEM_NODE_ITERATOR_DECL(it);
3337 
3338 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3339 
3340 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3341 
3342 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3343 		return (NULL);
3344 
3345 	ASSERT(szc < mmu_page_sizes);
3346 
3347 	colors = PAGE_GET_PAGECOLORS(szc);
3348 	color_mask = colors - 1;
3349 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3350 		uchar_t ceq = colorequivszc[szc];
3351 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3352 
3353 		ASSERT(ceq_dif > 0);
3354 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3355 	} else {
3356 		ceq_mask = 0;
3357 	}
3358 
3359 	ASSERT(bin < colors);
3360 
3361 	/* clear "non-significant" color bits */
3362 	bin &= ceq_mask;
3363 
3364 	/*
3365 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3366 	 * when there have been previous page_get_contig_pages failures to
3367 	 * limit the search.
3368 	 *
3369 	 * The high bit in pfnflag specifies the number of 'slots' in the
3370 	 * pfn range and the remainder of pfnflag specifies which slot.
3371 	 * For example, a value of 1010b selects slot 2 (counting from 0) of
3372 	 * a pfn range that has been divided into 8 slots.
3373 	 */
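	/*
	 * Worked example (hypothetical value): pfnflag == 1010b gives
	 * highbit(pfnflag) == 4, so slots = 1 << 3 = 8 and
	 * slotid = 1010b & 0x7 = 2; the window searched below starts
	 * slotid * slotlen szc-sized pages into the range and spans at
	 * most slotlen = howmany(szcpages, slots) szc-sized pages.
	 */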
3374 	if (pfnflag > 1) {
3375 		int	slots = 1 << (highbit(pfnflag) - 1);
3376 		int	slotid = pfnflag & (slots - 1);
3377 		pgcnt_t	szcpages;
3378 		int	slotlen;
3379 
3380 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3381 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3382 		slotlen = howmany(szcpages, slots);
3383 		/* skip if 'slotid' slot is empty */
3384 		if (slotid * slotlen >= szcpages)
3385 			return (NULL);
3386 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3387 		ASSERT(pfnlo < pfnhi);
3388 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3389 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3390 	}
3391 
3392 	memsegs_lock(0);
3393 
3394 	/*
3395 	 * loop through memsegs to look for contig page candidates
3396 	 */
3397 
3398 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3399 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3400 			/* no overlap */
3401 			continue;
3402 		}
3403 
3404 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3405 			/* mseg too small */
3406 			continue;
3407 
3408 		/* trim off kernel cage pages from pfn range */
3409 		if (kcage_on) {
3410 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
3411 				continue;
3412 		} else {
3413 			lo = MAX(pfnlo, mseg->pages_base);
3414 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3415 		}
3416 
3417 		/* round to szcpgcnt boundaries */
3418 		lo = P2ROUNDUP(lo, szcpgcnt);
3419 
3420 		MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
3421 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3422 
3423 		if (hi <= lo)
3424 			continue;
3425 
3426 		/*
3427 		 * set lo to point to the pfn for the desired bin. Large
3428 		 * page sizes may only have a single page color
3429 		 */
3430 		skip = szcpgcnt;
3431 		if (ceq_mask > 0 || interleaved_mnodes) {
3432 			/* set lo to point at appropriate color */
3433 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3434 			    (interleaved_mnodes &&
3435 			    PFN_2_MEM_NODE(lo) != mnode)) {
3436 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3437 				    color_mask, &it);
3438 			}
3439 			if (hi <= lo)
3440 				/* mseg cannot satisfy color request */
3441 				continue;
3442 		}
3443 
3444 		/* randomly choose a point between lo and hi to begin search */
3445 
3446 		randpfn = (pfn_t)GETTICK();
3447 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3448 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
3449 		if (ceq_mask || interleaved_mnodes) {
3450 			if (randpfn != (pfn_t)-1)
3451 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3452 				    ceq_mask, color_mask, &it);
3453 			if (randpfn >= hi) {
3454 				randpfn = lo;
3455 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
3456 			}
3457 		}
3458 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3459 
3460 		ASSERT(randpp->p_pagenum == randpfn);
3461 
3462 		pp = randpp;
3463 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3464 
3465 		ASSERT(randpp + szcpgcnt <= endpp);
3466 
3467 		do {
3468 			ASSERT(!(pp->p_pagenum & szcpgmask));
3469 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3470 
3471 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3472 				/* pages unlocked by page_claim on failure */
3473 				if (page_claim_contig_pages(pp, szc, flags)) {
3474 					memsegs_unlock(0);
3475 					return (pp);
3476 				}
3477 			}
3478 
3479 			if (ceq_mask == 0 && !interleaved_mnodes) {
3480 				pp += skip;
3481 			} else {
3482 				pfn_t pfn = pp->p_pagenum;
3483 
3484 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3485 				    ceq_mask, color_mask, &it);
3486 				if (pfn == (pfn_t)-1) {
3487 					pp = endpp;
3488 				} else {
3489 					pp = mseg->pages +
3490 					    (pfn - mseg->pages_base);
3491 				}
3492 			}
3493 			if (pp >= endpp) {
3494 				/* start from the beginning */
3495 				MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
3496 				pp = mseg->pages + (lo - mseg->pages_base);
3497 				ASSERT(pp->p_pagenum == lo);
3498 				ASSERT(pp + szcpgcnt <= endpp);
3499 			}
3500 		} while (pp != randpp);
3501 	}
3502 	memsegs_unlock(0);
3503 	return (NULL);
3504 }
3505 
3506 
3507 /*
3508  * controlling routine that searches through physical memory in an attempt to
3509  * claim a large page based on the input parameters when one cannot be found
3510  * on the page free lists.
3511  *
3512  * calls page_geti_contig_pages with an initial pfn range from the mnode
3513  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3514  * that overlaps with the kernel cage or does not match the requested page
3515  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3516  * page_geti_contig_pages may further limit the search range based on
3517  * previous failure counts (pgcpfailcnt[]).
3518  *
3519  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3520  * pagesize page that satisfies mtype.
3521  */
3522 page_t *
3523 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3524     uint_t flags)
3525 {
3526 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3527 	page_t		*pp;
3528 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3529 
3530 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3531 
3532 	/* no allocations from cage */
3533 	flags |= PGI_NOCAGE;
3534 
3535 	/* LINTED */
3536 	MTYPE_START(mnode, mtype, flags);
3537 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3538 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3539 		return (NULL);
3540 	}
3541 
3542 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3543 
3544 	/* do not limit search and ignore color if hi pri */
3545 
3546 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3547 		pfnflag = pgcpfailcnt[szc];
3548 
3549 	/* remove color match to improve chances */
3550 
3551 	if (flags & PGI_PGCPHIPRI || pfnflag)
3552 		flags &= ~PG_MATCH_COLOR;
3553 
3554 	do {
3555 		/* get pfn range based on mnode and mtype */
3556 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3557 
3558 		ASSERT(pfnhi >= pfnlo);
3559 
3560 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3561 		    pfnlo, pfnhi, pfnflag);
3562 
3563 		if (pp != NULL) {
3564 			pfnflag = pgcpfailcnt[szc];
3565 			if (pfnflag) {
3566 				/* double the search size */
3567 				pgcpfailcnt[szc] = pfnflag >> 1;
3568 			}
3569 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3570 			return (pp);
3571 		}
3572 		MTYPE_NEXT(mnode, mtype, flags);
3573 	} while (mtype >= 0);
3574 
3575 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3576 	return (NULL);
3577 }
3578 
3579 
3580 /*
3581  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3582  *
3583  * Does its own locking and accounting.
3584  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3585  * pages of the proper color even if there are pages of a different color.
3586  *
3587  * Finds a page, removes it, THEN locks it.
3588  */
3589 
3590 /*ARGSUSED*/
3591 page_t *
3592 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3593 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3594 {
3595 	struct as	*as = seg->s_as;
3596 	page_t		*pp = NULL;
3597 	ulong_t		bin;
3598 	uchar_t		szc;
3599 	int		mnode;
3600 	int		mtype;
3601 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3602 	lgrp_mnode_cookie_t	lgrp_cookie;
3603 
3604 	page_get_func = page_get_mnode_freelist;
3605 
3606 	/*
3607 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3608 	 * assume we wish to allocate near to the current thread's home.
3609 	 */
3610 	if (!LGRP_EXISTS(lgrp))
3611 		lgrp = lgrp_home_lgrp();
3612 
3613 	if (kcage_on) {
3614 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3615 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3616 		    curthread != kcage_cageout_thread) {
3617 			/*
3618 			 * Set a "reserve" of kcage_throttlefree pages for
3619 			 * PG_PANIC and cageout thread allocations.
3620 			 *
3621 			 * Everybody else has to serialize in
3622 			 * page_create_get_something() to get a cage page, so
3623 			 * that we don't deadlock cageout!
3624 			 */
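			/*
			 * Worked example (hypothetical values): with
			 * kcage_freemem == 100, kcage_throttlefree == 128
			 * and btop(size) == 1, a PG_NORELOC request that
			 * does not also set PG_PANIC is refused here,
			 * since 100 < 128 + 1.
			 */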
3625 			return (NULL);
3626 		}
3627 	} else {
3628 		flags &= ~PG_NORELOC;
3629 		flags |= PGI_NOCAGE;
3630 	}
3631 
3632 	/* LINTED */
3633 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3634 
3635 	/*
3636 	 * Convert size to page size code.
3637 	 */
3638 	if ((szc = page_szc(size)) == (uchar_t)-1)
3639 		panic("page_get_freelist: illegal page size request");
3640 	ASSERT(szc < mmu_page_sizes);
3641 
3642 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3643 
3644 	/* LINTED */
3645 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3646 
3647 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3648 
3649 	/*
3650 	 * Try to get a local page first, but try remote if we can't
3651 	 * get a page of the right color.
3652 	 */
3653 pgretry:
3654 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3655 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3656 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3657 		if (pp != NULL) {
3658 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3659 			DTRACE_PROBE4(page__get,
3660 			    lgrp_t *, lgrp,
3661 			    int, mnode,
3662 			    ulong_t, bin,
3663 			    uint_t, flags);
3664 			return (pp);
3665 		}
3666 	}
3667 	ASSERT(pp == NULL);
3668 
3669 	/*
3670 	 * For PAGESIZE requests without PGI_PGCPSZC0 set, defer to the
3671 	 * cachelist before going remote.  The caller is expected to call
3672 	 * page_get_cachelist, which checks local cachelists and remote free lists.
3673 	 */
3674 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3675 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3676 		return (NULL);
3677 	}
3678 
3679 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3680 
3681 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3682 
3683 	if (!(flags & PG_LOCAL)) {
3684 		/*
3685 		 * Try to get a non-local freelist page.
3686 		 */
3687 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3688 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3689 			pp = page_get_func(mnode, bin, mtype, szc, flags);
3690 			if (pp != NULL) {
3691 				DTRACE_PROBE4(page__get,
3692 				    lgrp_t *, lgrp,
3693 				    int, mnode,
3694 				    ulong_t, bin,
3695 				    uint_t, flags);
3696 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3697 				return (pp);
3698 			}
3699 		}
3700 		ASSERT(pp == NULL);
3701 	}
3702 
3703 	/*
3704 	 * When the cage is off, chances are page_get_contig_pages() will fail
3705 	 * to lock a large page chunk, so in that case it is not called by
3706 	 * default.  This can be changed via /etc/system.
3707 	 *
3708 	 * page_get_contig_pages() also called to acquire a base pagesize page
3709 	 * for page_create_get_something().
3710 	 */
3711 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3712 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3713 	    (page_get_func != page_get_contig_pages)) {
3714 
3715 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3716 		page_get_func = page_get_contig_pages;
3717 		goto pgretry;
3718 	}
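	/*
	 * The retry above reruns both the local and remote passes with
	 * page_get_func set to page_get_contig_pages, which scans physical
	 * memory for a contiguous run of pages it can claim for the request.
	 */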
3719 
3720 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3721 	    page_get_func == page_get_contig_pages)
3722 		SETPGCPFAILCNT(szc);
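	/*
	 * Record the failure so the next limited (non-PGI_PGCPHIPRI)
	 * contig-pages search covers a smaller pfn window; successful
	 * allocations shrink pgcpfailcnt again in page_get_contig_pages.
	 */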
3723 
3724 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3725 	return (NULL);
3726 }
3727 
3728 /*
3729  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3730  *
3731  * Does its own locking.
3732  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3733  * pages of the proper color even if there are pages of a different color.
3734  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3735  * try to lock one of them.  If no page can be locked, try the
3736  * next bin.  Return NULL if a page cannot be found and locked.
3737  *
3738  * Finds a page, tries to lock it, then removes it.
3739  */
3740 
3741 /*ARGSUSED*/
3742 page_t *
3743 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3744     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3745 {
3746 	page_t		*pp;
3747 	struct as	*as = seg->s_as;
3748 	ulong_t		bin;
3749 	/*LINTED*/
3750 	int		mnode;
3751 	int		mtype;
3752 	lgrp_mnode_cookie_t	lgrp_cookie;
3753 
3754 	/*
3755 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3756 	 * assume we wish to allocate near to the current thread's home.
3757 	 */
3758 	if (!LGRP_EXISTS(lgrp))
3759 		lgrp = lgrp_home_lgrp();
3760 
3761 	if (!kcage_on) {
3762 		flags &= ~PG_NORELOC;
3763 		flags |= PGI_NOCAGE;
3764 	}
3765 
3766 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3767 	    kcage_freemem <= kcage_throttlefree) {
3768 		/*
3769 		 * Reserve kcage_throttlefree pages for critical kernel
3770 		 * threads.
3771 		 *
3772 		 * Everybody else has to go to page_create_get_something()
3773 		 * to get a cage page, so we don't deadlock cageout.
3774 		 */
3775 		return (NULL);
3776 	}
3777 
3778 	/* LINTED */
3779 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3780 
3781 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
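	/*
	 * Cachelists only ever hold szc 0 (PAGESIZE) pages, so the bin is
	 * taken from the szc 0 color space.
	 */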
3782 
3783 	/* LINTED */
3784 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3785 
3786 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3787 
3788 	/*
3789 	 * Try local cachelists first
3790 	 */
3791 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3792 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3793 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3794 		if (pp != NULL) {
3795 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3796 			DTRACE_PROBE4(page__get,
3797 			    lgrp_t *, lgrp,
3798 			    int, mnode,
3799 			    ulong_t, bin,
3800 			    uint_t, flags);
3801 			return (pp);
3802 		}
3803 	}
3804 
3805 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3806 
3807 	/*
3808 	 * Try freelists/cachelists that are farther away
3809 	 * This is our only chance to allocate remote pages for PAGESIZE
3810 	 * requests.
3811 	 */
3812 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3813 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3814 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3815 		    0, flags);
3816 		if (pp != NULL) {
3817 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3818 			DTRACE_PROBE4(page__get,
3819 			    lgrp_t *, lgrp,
3820 			    int, mnode,
3821 			    ulong_t, bin,
3822 			    uint_t, flags);
3823 			return (pp);
3824 		}
3825 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3826 		if (pp != NULL) {
3827 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3828 			DTRACE_PROBE4(page__get,
3829 			    lgrp_t *, lgrp,
3830 			    int, mnode,
3831 			    ulong_t, bin,
3832 			    uint_t, flags);
3833 			return (pp);
3834 		}
3835 	}
3836 
3837 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3838 	return (NULL);
3839 }
3840 
3841 page_t *
3842 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3843 {
3844 	kmutex_t		*pcm;
3845 	page_t			*pp, *first_pp;
3846 	uint_t			sbin;
3847 	int			plw_initialized;
3848 	page_list_walker_t	plw;
3849 
3850 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3851 
3852 	/* LINTED */
3853 	MTYPE_START(mnode, mtype, flags);
3854 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3855 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3856 		return (NULL);
3857 	}
3858 
3859 try_again:
3860 
3861 	plw_initialized = 0;
3862 	plw.plw_ceq_dif = 1;
3863 
3864 	/*
3865 	 * Only hold one cachelist lock at a time, that way we
3866 	 * can start anywhere and not have to worry about lock
3867 	 * ordering.
3868 	 */
3869 
3870 	for (plw.plw_count = 0;
3871 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3872 		sbin = bin;
3873 		do {
3874 
3875 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
3876 				goto bin_empty_1;
3877 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3878 			mutex_enter(pcm);
3879 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3880 			if (pp == NULL)
3881 				goto bin_empty_0;
3882 
3883 			first_pp = pp;
3884 			ASSERT(pp->p_vnode);
3885 			ASSERT(PP_ISAGED(pp) == 0);
3886 			ASSERT(pp->p_szc == 0);
3887 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3888 			while (!page_trylock(pp, SE_EXCL)) {
3889 				pp = pp->p_next;
3890 				ASSERT(pp->p_szc == 0);
3891 				if (pp == first_pp) {
3892 					/*
3893 					 * We have searched the complete list!
3894 					 * And all of them (might only be one)
3895 					 * are locked. This can happen since
3896 					 * these pages can also be found via
3897 					 * the hash list. When found via the
3898 					 * hash list, they are locked first,
3899 					 * then removed. We give up to let the
3900 					 * other thread run.
3901 					 */
3902 					pp = NULL;
3903 					break;
3904 				}
3905 				ASSERT(pp->p_vnode);
3906 				ASSERT(PP_ISFREE(pp));
3907 				ASSERT(PP_ISAGED(pp) == 0);
3908 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3909 				    mnode);
3910 			}
3911 
3912 			if (pp) {
3913 				page_t	**ppp;
3914 				/*
3915 				 * Found and locked a page.
3916 				 * Pull it off the list.
3917 				 */
3918 				ASSERT(mtype == PP_2_MTYPE(pp));
3919 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3920 				page_sub(ppp, pp);
3921 				/*
3922 				 * Subtract counters before releasing pcm mutex
3923 				 * to avoid a race with page_freelist_coalesce
3924 				 * and page_freelist_split.
3925 				 */
3926 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3927 				mutex_exit(pcm);
3928 				ASSERT(pp->p_vnode);
3929 				ASSERT(PP_ISAGED(pp) == 0);
3930 #if defined(__sparc)
3931 				ASSERT(!kcage_on ||
3932 				    (flags & PG_NORELOC) == 0 ||
3933 				    PP_ISNORELOC(pp));
3934 				if (PP_ISNORELOC(pp)) {
3935 					kcage_freemem_sub(1);
3936 				}
3937 #endif
3938 				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
3939 				return (pp);
3940 			}
3941 bin_empty_0:
3942 			mutex_exit(pcm);
3943 bin_empty_1:
3944 			if (plw_initialized == 0) {
3945 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
3946 				plw_initialized = 1;
3947 			}
3948 			/* calculate the next bin with equivalent color */
3949 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3950 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
3951 		} while (sbin != bin);
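		/*
		 * All bins whose colors are equivalent to the requested one
		 * have been visited (we wrapped back to sbin); if more
		 * equivalence classes remain, advance to the next one below.
		 */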
3952 
3953 		if (plw.plw_ceq_dif > 1)
3954 			bin = page_list_walk_next_bin(0, bin, &plw);
3955 	}
3956 
3957 	MTYPE_NEXT(mnode, mtype, flags);
3958 	if (mtype >= 0)
3959 		goto try_again;
3960 
3961 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3962 	return (NULL);
3963 }
3964 
3965 #ifdef DEBUG
3966 #define	REPL_PAGE_STATS
3967 #endif /* DEBUG */
3968 
3969 #ifdef REPL_PAGE_STATS
3970 struct repl_page_stats {
3971 	uint_t	ngets;
3972 	uint_t	ngets_noreloc;
3973 	uint_t	npgr_noreloc;
3974 	uint_t	nnopage_first;
3975 	uint_t	nnopage;
3976 	uint_t	nhashout;
3977 	uint_t	nnofree;
3978 	uint_t	nnext_pp;
3979 } repl_page_stats;
3980 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
3981 #else /* REPL_PAGE_STATS */
3982 #define	REPL_STAT_INCR(v)
3983 #endif /* REPL_PAGE_STATS */
3984 
3985 int	pgrppgcp;
3986 
3987 /*
3988  * The freemem accounting must be done by the caller.
3989  * First we try to get a replacement page of the same size as like_pp,
3990  * if that is not possible, then we just get a set of discontiguous
3991  * PAGESIZE pages.
3992  */
3993 page_t *
3994 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3995     uint_t pgrflags)
3996 {
3997 	page_t		*like_pp;
3998 	page_t		*pp, *pplist;
3999 	page_t		*pl = NULL;
4000 	ulong_t		bin;
4001 	int		mnode, page_mnode;
4002 	int		szc;
4003 	spgcnt_t	npgs, pg_cnt;
4004 	pfn_t		pfnum;
4005 	int		mtype;
4006 	int		flags = 0;
4007 	lgrp_mnode_cookie_t	lgrp_cookie;
4008 	lgrp_t		*lgrp;
4009 
4010 	REPL_STAT_INCR(ngets);
4011 	like_pp = orig_like_pp;
4012 	ASSERT(PAGE_EXCL(like_pp));
4013 
4014 	szc = like_pp->p_szc;
4015 	npgs = page_get_pagecnt(szc);
4016 	/*
4017 	 * Now we reset like_pp to the base page_t.
4018 	 * That way, we won't walk past the end of this 'szc' page.
4019 	 */
4020 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4021 	like_pp = page_numtopp_nolock(pfnum);
4022 	ASSERT(like_pp->p_szc == szc);
4023 
4024 	if (PP_ISNORELOC(like_pp)) {
4025 		ASSERT(kcage_on);
4026 		REPL_STAT_INCR(ngets_noreloc);
4027 		flags = PGI_RELOCONLY;
4028 	} else if (pgrflags & PGR_NORELOC) {
4029 		ASSERT(kcage_on);
4030 		REPL_STAT_INCR(npgr_noreloc);
4031 		flags = PG_NORELOC;
4032 	}
4033 
4034 	/*
4035 	 * Kernel pages must always be replaced with the same size
4036 	 * pages, since we cannot properly handle demotion of kernel
4037 	 * pages.
4038 	 */
4039 	if (PP_ISKAS(like_pp))
4040 		pgrflags |= PGR_SAMESZC;
4041 
4042 	/* LINTED */
4043 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
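	/*
	 * MTYPE_PGR_INIT (platform macro) selects a memory type compatible
	 * with like_pp so replacement pages come from an acceptable physical
	 * range.
	 */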
4044 
4045 	while (npgs) {
4046 		pplist = NULL;
4047 		for (;;) {
4048 			pg_cnt = page_get_pagecnt(szc);
4049 			bin = PP_2_BIN(like_pp);
4050 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4051 			ASSERT(pg_cnt <= npgs);
4052 
4053 			/*
4054 			 * If an lgroup was specified, try to get the
4055 			 * page from that lgroup.
4056 			 * NOTE: Must be careful with code below because
4057 			 *	 lgroup may disappear and reappear since there
4058 			 *	 is no locking for lgroup here.
4059 			 */
4060 			if (LGRP_EXISTS(lgrp_target)) {
4061 				/*
4062 				 * Keep local variable for lgroup separate
4063 				 * from lgroup argument since this code should
4064 				 * only be exercised when lgroup argument
4065 				 * exists....
4066 				 */
4067 				lgrp = lgrp_target;
4068 
4069 				/* Try the lgroup's freelists first */
4070 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4071 				    LGRP_SRCH_LOCAL);
4072 				while ((pplist == NULL) &&
4073 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4074 				    != -1) {
4075 					pplist =
4076 					    page_get_mnode_freelist(mnode, bin,
4077 					    mtype, szc, flags);
4078 				}
4079 
4080 				/*
4081 				 * Now try its cachelists if this is a
4082 				 * small page. Don't need to do it for
4083 				 * larger ones since page_freelist_coalesce()
4084 				 * already failed.
4085 				 */
4086 				if (pplist != NULL || szc != 0)
4087 					break;
4088 
4089 				/* Now try its cachelists */
4090 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4091 				    LGRP_SRCH_LOCAL);
4092 
4093 				while ((pplist == NULL) &&
4094 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4095 				    != -1) {
4096 					pplist =
4097 					    page_get_mnode_cachelist(bin, flags,
4098 					    mnode, mtype);
4099 				}
4100 				if (pplist != NULL) {
4101 					page_hashout(pplist, NULL);
4102 					PP_SETAGED(pplist);
4103 					REPL_STAT_INCR(nhashout);
4104 					break;
4105 				}
4106 				/* Done looking in this lgroup. Bail out. */
4107 				break;
4108 			}
4109 
4110 			/*
4111 			 * No lgroup was specified (or the lgroup was removed by
4112 			 * DR), so just try to get the page as close to
4113 			 * like_pp's mnode as possible.
4114 			 * First try the local freelist...
4115 			 */
4116 			mnode = PP_2_MEM_NODE(like_pp);
4117 			pplist = page_get_mnode_freelist(mnode, bin,
4118 			    mtype, szc, flags);
4119 			if (pplist != NULL)
4120 				break;
4121 
4122 			REPL_STAT_INCR(nnofree);
4123 
4124 			/*
4125 			 * ...then the local cachelist. Don't need to do it for
4126 			 * larger pages because page_freelist_coalesce() already
4127 			 * failed there anyway.
4128 			 */
4129 			if (szc == 0) {
4130 				pplist = page_get_mnode_cachelist(bin, flags,
4131 				    mnode, mtype);
4132 				if (pplist != NULL) {
4133 					page_hashout(pplist, NULL);
4134 					PP_SETAGED(pplist);
4135 					REPL_STAT_INCR(nhashout);
4136 					break;
4137 				}
4138 			}
4139 
4140 			/* Now try remote freelists */
4141 			page_mnode = mnode;
4142 			lgrp =
4143 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4144 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4145 			    LGRP_SRCH_HIER);
4146 			while (pplist == NULL &&
4147 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4148 			    != -1) {
4149 				/*
4150 				 * Skip local mnode.
4151 				 */
4152 				if ((mnode == page_mnode) ||
4153 				    (mem_node_config[mnode].exists == 0))
4154 					continue;
4155 
4156 				pplist = page_get_mnode_freelist(mnode,
4157 				    bin, mtype, szc, flags);
4158 			}
4159 
4160 			if (pplist != NULL)
4161 				break;
4162 
4163 
4164 			/* Now try remote cachelists */
4165 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4166 			    LGRP_SRCH_HIER);
4167 			while (pplist == NULL && szc == 0) {
4168 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4169 				if (mnode == -1)
4170 					break;
4171 				/*
4172 				 * Skip local mnode.
4173 				 */
4174 				if ((mnode == page_mnode) ||
4175 				    (mem_node_config[mnode].exists == 0))
4176 					continue;
4177 
4178 				pplist = page_get_mnode_cachelist(bin,
4179 				    flags, mnode, mtype);
4180 
4181 				if (pplist != NULL) {
4182 					page_hashout(pplist, NULL);
4183 					PP_SETAGED(pplist);
4184 					REPL_STAT_INCR(nhashout);
4185 					break;
4186 				}
4187 			}
4188 
4189 			/*
4190 			 * Break out of this retry loop under the following cases:
4191 			 * - If we successfully got a page.
4192 			 * - If pgrflags specified only returning a specific
4193 			 *   page size and we could not find that page size.
4194 			 * - If we could not satisfy the request with PAGESIZE
4195 			 *   or larger pages.
4196 			 */
4197 			if (pplist != NULL || szc == 0)
4198 				break;
4199 
4200 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4201 				/* try to find contig page */
4202 
4203 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4204 				    LGRP_SRCH_HIER);
4205 
4206 				while ((pplist == NULL) &&
4207 				    (mnode =
4208 				    lgrp_memnode_choose(&lgrp_cookie))
4209 				    != -1) {
4210 					pplist = page_get_contig_pages(
4211 					    mnode, bin, mtype, szc,
4212 					    flags | PGI_PGCPHIPRI);
4213 				}
4214 				break;
4215 			}
4216 
4217 			/*
4218 			 * The correct thing to do here is try the next
4219 			 * page size down using szc--. Due to a bug
4220 			 * with the processing of HAT_RELOAD_SHARE
4221 			 * where the sfmmu_ttecnt arrays of all
4222 			 * hats sharing an ISM segment don't get updated,
4223 			 * using intermediate size pages for relocation
4224 			 * can lead to continuous page faults.
4225 			 */
4226 			szc = 0;
4227 		}
4228 
4229 		if (pplist != NULL) {
4230 			DTRACE_PROBE4(page__get,
4231 			    lgrp_t *, lgrp,
4232 			    int, mnode,
4233 			    ulong_t, bin,
4234 			    uint_t, flags);
4235 
4236 			while (pplist != NULL && pg_cnt--) {
4237 				ASSERT(pplist != NULL);
4238 				pp = pplist;
4239 				page_sub(&pplist, pp);
4240 				PP_CLRFREE(pp);
4241 				PP_CLRAGED(pp);
4242 				page_list_concat(&pl, &pp);
4243 				npgs--;
4244 				like_pp = like_pp + 1;
4245 				REPL_STAT_INCR(nnext_pp);
4246 			}
4247 			ASSERT(pg_cnt == 0);
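			/*
			 * The constituent pages just claimed were moved from
			 * pplist onto the result list pl; like_pp advanced in
			 * step so the next pass computes its bin from the next
			 * constituent of the original large page.
			 */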
4248 		} else {
4249 			break;
4250 		}
4251 	}
4252 
4253 	if (npgs) {
4254 		/*
4255 		 * We were unable to allocate the necessary number
4256 		 * of pages.
4257 		 * We need to free up any pl.
4258 		 */
4259 		REPL_STAT_INCR(nnopage);
4260 		page_free_replacement_page(pl);
4261 		return (NULL);
4262 	} else {
4263 		return (pl);
4264 	}
4265 }
4266 
4267 /*
4268  * Demote a free large page to its constituent pages.
4269  */
4270 void
4271 page_demote_free_pages(page_t *pp)
4272 {
4273 
4274 	int mnode;
4275 
4276 	ASSERT(pp != NULL);
4277 	ASSERT(PAGE_LOCKED(pp));
4278 	ASSERT(PP_ISFREE(pp));
4279 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4280 
4281 	mnode = PP_2_MEM_NODE(pp);
4282 	page_freelist_lock(mnode);
4283 	if (pp->p_szc != 0) {
4284 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4285 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4286 	}
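	/*
	 * page_demote, called with the mnode freelist lock held, splits the
	 * large page back into szc 0 constituents on the freelists, which is
	 * why p_szc is asserted to be 0 below once the lock is dropped.
	 */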
4287 	page_freelist_unlock(mnode);
4288 	ASSERT(pp->p_szc == 0);
4289 }
4290 
4291 /*
4292  * Factor in colorequiv to check additional 'equivalent' bins.
4293  * colorequiv may be set in /etc/system
4294  */
4295 void
4296 page_set_colorequiv_arr(void)
4297 {
4298 	if (colorequiv > 1) {
4299 		int i;
4300 		uint_t sv_a = lowbit(colorequiv) - 1;
4301 
4302 		if (sv_a > 15)
4303 			sv_a = 15;
4304 
4305 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4306 			uint_t colors;
4307 			uint_t a = sv_a;
4308 
4309 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4310 				continue;
4311 			}
4312 			while ((colors >> a) == 0)
4313 				a--;
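			/*
			 * Clamp the shift so at least one color bin remains
			 * at this page size, then only install the encoded
			 * value if it raises what the platform already set.
			 */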
4314 			if ((a << 4) > colorequivszc[i]) {
4315 				colorequivszc[i] = (a << 4);
4316 			}
4317 		}
4318 	}
4319 }
4320