/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	All Rights Reserved   */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t	vac_colors;

/*
 * number of page colors equivalent to the requested color in page_get
 * routines.  If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
int	colorequiv;

/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * use slot 0 (base page size unused) to enable or disable limiting search.
 * Enabled by default.
 */
int	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_fill pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free_len:	number of elements in pcc_color_free array
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;		/* # pages coalesce can create */
	int	pcc_color_free_len;	/* # elements in pcc_color_free[] */
	pgcnt_t	*pcc_color_free;	/* per color counts of such pages */
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * An extra dimension is used for page_ctrs_cands to spread the elements
 * over a few e$ cache lines to avoid serialization during the array
 * updates.
 */
#pragma	align 64(page_ctrs_cands)

static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m) and region size (r).
 *
 * The total is the sum of pcc_pages_free over all NPC_MUTEX slots of
 * page_ctrs_cands.  val must be an lvalue; the macro expands to a braced
 * block that declares its own loop index "i", so neither m, r nor val may
 * be an expression that refers to a caller variable named "i".
 * The macro itself takes no locks.
 */
#define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;		\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), region size (r), and color (c).
 *
 * The total is the sum of pcc_color_free[c] over all NPC_MUTEX slots of
 * page_ctrs_cands.  The color is range checked (ASSERT) against the
 * pcc_color_free_len of slot 0 only.  val must be an lvalue; the macro
 * expands to a braced block that declares its own loop index "i", so no
 * argument may refer to a caller variable named "i".
 * The macro itself takes no locks.
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)];	\
	}								\
}

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>					\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

/*
 * Local functions prototypes.
 */

void page_ctr_add(page_t *, int);
void page_ctr_add_internal(int, page_t *, int);
void page_ctr_sub(page_t *, int);
uint_t  page_convert_color(uchar_t, uchar_t, uint_t);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
#define	PNUM_SHIFT(szc)							\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want.  We calculate the index and look at a specific
 *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be created
 *	from 8K pages in order to make a single free 512k page at the given
 *	index.  Note that when a region is full, it will contribute to the
 *	counts in the region above it.  Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current_len:	# of elements in hpm_color_current "array" below
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	/* dynamically allocated array of region counters */
	hpmctr_t	*hpm_counters;
	/* number of entries in hpm_counters */
	size_t		hpm_entries;
	/* shift converting a pfn offset from hpm_base to a counter index */
	int		hpm_shift;
	/* pfn that maps to counter index 0 */
	pfn_t		hpm_base;
	/* number of entries in hpm_color_current */
	size_t		hpm_color_current_len;
	/* per color: last counter index at which a large page was created */
	size_t 		*hpm_color_current;
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current)

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])

/*
 * Convert between a page frame number and an index into the hpm_counters
 * array of the given mnode and region size.
 *
 * PNUM_TO_IDX subtracts the counter base and shifts right by hpm_shift;
 * IDX_TO_PNUM shifts left by hpm_shift and adds the base.  The low
 * hpm_shift bits of (pnum - base) are discarded by PNUM_TO_IDX, so
 * IDX_TO_PNUM(PNUM_TO_IDX(pnum)) yields the first pfn covered by that
 * counter rather than pnum itself.
 */
#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];

/*
 * Map a page size in bytes to its page size code.
 *
 * hw_page_array is scanned from index 0 until an entry with an hp_size of
 * zero is reached.  Returns the index of the entry whose hp_size equals
 * pagesize, or -1 if there is no such entry.
 */
int
page_szc(size_t pagesize)
{
	int	szc;

	for (szc = 0; hw_page_array[szc].hp_size != 0; szc++) {
		if (hw_page_array[szc].hp_size == pagesize)
			return (szc);
	}
	return (-1);
}

/*
 * Map a page size in bytes to its page size code, but only if that size
 * code is also a supported user page size.  Returns -1 when the size is
 * unknown to page_szc() or when SZC_2_USERSZC() rejects the size code.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int	szc;

	szc = page_szc(pagesize);
	if (szc == -1 || SZC_2_USERSZC(szc) == -1)
		return (-1);

	return (szc);
}

/*
 * Return the number of page sizes available to the user, i.e. the value
 * of mmu_exported_page_sizes.  This reflects what the hardware supports
 * rather than how the OS implements support for the different page sizes.
 */
uint_t
page_num_user_pagesizes(void)
{
	uint_t	nsizes = mmu_exported_page_sizes;

	return (nsizes);
}

/*
 * Return the number of page sizes known to the VM system, i.e. the value
 * of mmu_page_sizes.
 */
uint_t
page_num_pagesizes(void)
{
	uint_t	nsizes = mmu_page_sizes;

	return (nsizes);
}

/*
 * Return the number of base pagesize pages that make up a page of size
 * code szc.  Panics if szc is not below mmu_page_sizes.
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	pgcnt_t	npages;

	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);

	npages = hw_page_array[szc].hp_pgcnt;
	return (npages);
}

/*
 * Return the size in bytes of a page of size code szc.
 * Panics if szc is not below mmu_page_sizes.
 */
size_t
page_get_pagesize(uint_t szc)
{
	size_t	nbytes;

	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);

	nbytes = hw_page_array[szc].hp_size;
	return (nbytes);
}

/*
 * Return the size in bytes of the user page size with index userszc.
 * Index zero is the smallest page size in the system and each larger index
 * is the next larger supported page size.  userszc is first translated with
 * USERSZC_2_SZC() because user size codes and szc values may differ on
 * systems with unsupported szc's.  Panics (reporting the translated szc)
 * if the result is not below mmu_page_sizes.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t	szc;
	size_t	nbytes;

	szc = USERSZC_2_SZC(userszc);
	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);

	nbytes = hw_page_array[szc].hp_size;
	return (nbytes);
}

/*
 * Return the hp_shift value recorded in hw_page_array for size code szc.
 * Panics if szc is not below mmu_page_sizes.
 */
uint_t
page_get_shift(uint_t szc)
{
	uint_t	shift;

	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);

	shift = hw_page_array[szc].hp_shift;
	return (shift);
}

/*
 * Return the number of page colors for size code szc: page_colors shifted
 * right by PAGE_BSZS_SHIFT(szc), but never less than 1.
 */
uint_t
page_get_pagecolors(uint_t szc)
{
	ASSERT(page_colors != 0);

	if ((page_colors >> PAGE_BSZS_SHIFT(szc)) > 1)
		return (page_colors >> PAGE_BSZS_SHIFT(szc));
	return (1);
}

/*
 * Called by startup().
 * Return the number of bytes needed for the per page size free list
 * counters, based on the physical range of each existing memory node and
 * on max_mem_nodes.
 *
 * The total is accumulated in a size_t, matching the return type, so the
 * size_t/pgcnt_t wide products added below are never narrowed to 32 bits.
 *
 * The amounts added here must stay in step with what page_ctrs_alloc()
 * carves out of the memory sized by this routine.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	size_t	ctrs_sz = 0;
	int 	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * add in space for hpm_counters
			 *
			 * NOTE(review): if physmax is the last valid pfn
			 * (inclusive) this count is one region short when
			 * (physmax - r_base) is a multiple of r_align;
			 * confirm against the mem_node_config definition.
			 */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base, r_align);
			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));

			/* add in space for hpm_color_current */
			ctrs_sz += (colors_per_szc[r] *
			    sizeof (size_t));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		/* one hw_page_map_t per mnode for this region size */
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));

		/* add in space for page_ctrs_cands */
		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
		    sizeof (pgcnt_t);
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}

/*
 * Carve the page counter data structures out of the memory that starts at
 * alloc_base and return the first address past what was consumed.
 *
 * The memory is handed out in this order: the per region size
 * hw_page_map_t arrays (page_counters), the page_ctrs_cands pcc_info_t
 * arrays, their pcc_color_free arrays, the ctr_mutex arrays, then
 * PLCNT_INIT() is given alloc_base, and finally, for every existing memory
 * node, the hpm_color_current and hpm_counters arrays of each region size.
 * The page_ctrs_rwlock of every possible memory node is initialized last.
 *
 * NOTE(review): nothing in this routine zeroes the carved memory or calls
 * mutex_init() on the ctr_mutex entries; presumably the caller supplies
 * zeroed memory sized by page_ctrs_sz() -- confirm.
 */
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	r;		/* region size */
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	/* page_counters: max_mem_nodes hw_page_map_t's per region size */
	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));

		}
	}

	/* page_ctrs_cands pcc_color_free array */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				page_ctrs_cands[i][r][mnode].pcc_color_free_len
				    = colors_per_szc[r];
				page_ctrs_cands[i][r][mnode].pcc_color_free =
				    (pgcnt_t *)alloc_base;
				alloc_base += colors_per_szc[r] *
				    sizeof (pgcnt_t);
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;

		/* no counters are set up for memory nodes that do not exist */
		if (mem_node_config[mnode].exists == 0)
			continue;

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			/* same entry count computation as in page_ctrs_sz() */
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			r_base, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
			    colors_per_szc[r];
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
			    (size_t *)alloc_base;
			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
			/* start each color's search hint at index == color */
			for (i = 0; i < colors_per_szc[r]; i++) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
			}
			PAGE_COUNTERS_COUNTERS(mnode, r) =
			    (hpmctr_t *)alloc_base;
			/*
			 * Round up to make alloc_base always be aligned on
			 * a pointer boundary.
			 */
			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
			    sizeof (hpmctr_t *));

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
			L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible to acquire the ctr_mutex lock if necessary and
 * thus can be called during startup without locks.
 */
/*
 * Account for pp having been put on a page list of memory node mnode.
 *
 * The page list counts are updated through PLCNT_INCR().  Then, unless pp
 * already has the largest page size, the region counter covering pp at the
 * next larger size is incremented.  Each time a counter reaches
 * FULL_REGION_CNT the region is recorded as a coalesce candidate in
 * page_ctrs_cands (total and per color) and the increment is carried to
 * the next larger region size.
 *
 * No locks are taken here; see page_ctr_add() for the locked variant.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	/* selects which page_ctrs_cands slot receives the candidate counts */
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		/* stop as soon as a region is still only partially free */
		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
			break;

		/* region r just became full: a size r page can be created */
		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		r++;
	}
}

/*
 * Locked wrapper around page_ctr_add_internal(): the counters for pp are
 * updated while holding the ctr_mutex selected by pp's lock index and
 * memory node.
 */
void
page_ctr_add(page_t *pp, int flags)
{
	int		mnode;
	kmutex_t	*ctrlock;

	mnode = PP_2_MEM_NODE(pp);
	ctrlock = &ctr_mutex[PP_CTR_LOCK_INDX(pp)][mnode];

	mutex_enter(ctrlock);
	page_ctr_add_internal(mnode, pp, flags);
	mutex_exit(ctrlock);
}

/*
 * Account for pp having been taken off a page list; the inverse of
 * page_ctr_add().
 *
 * The page list counts are updated through PLCNT_DECR() before the
 * ctr_mutex is taken.  Then, unless pp has the largest page size, the
 * region counter covering pp at the next larger size is decremented under
 * the ctr_mutex selected by pp's lock index and memory node.  Each time a
 * counter drops from FULL_REGION_CNT the region stops being a coalesce
 * candidate in page_ctrs_cands (total and per color) and the decrement is
 * carried to the next larger region size.
 */
void
page_ctr_sub(page_t *pp, int flags)
{
	int		lckidx;
	int		mnode = PP_2_MEM_NODE(pp);
	kmutex_t	*lock;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);
	lock = &ctr_mutex[lckidx][mnode];

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	mutex_enter(lock);
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		/* stop unless this region was full before the decrement */
		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		}
		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
		ASSERT(page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

		/* region r is no longer full: drop it as a candidate */
		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		r++;
	}
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * New hpm_counters and hpm_color_current arrays are allocated for every
 * region size of mnode (KM_NOSLEEP), the part of the old counters that
 * overlaps the new physical range is copied across, and the old arrays are
 * freed if they lie within [kernelheap, ekernelheap).
 *
 * Returns 0 on success or ENOMEM if any allocation fails, in which case
 * everything allocated so far is freed and the counters are left untouched.
 *
 * NOTE(review): page_ctrs_cands is not modified by this routine.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES];
	size_t	*old_color_array;
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];

	/* new counter range: physbase rounded down, physmax rounded up */
	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(mem_node_config[mnode].physmax,
	    PC_BASE_ALIGN) - newbase;

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (r = 1; r < mmu_page_sizes; r++) {
		colors_per_szc[r] =
		    page_convert_color(0, r, page_colors - 1) + 1;
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);

		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			/* free the arrays allocated for smaller r */
			while (--r >= 1) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
		size_cache[r] = pcsz;
	}
	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		color_cache[r] = kmem_zalloc(sizeof (size_t) *
		    colors_per_szc[r], KM_NOSLEEP);
		if (color_cache[r] == NULL) {
			/* free the color arrays allocated for smaller r */
			while (--r >= 1) {
				kmem_free(color_cache[r],
				    colors_per_szc[r] * sizeof (size_t));
			}
			/* and every counter array from the first loop */
			for (r = 1; r < mmu_page_sizes; r++) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
	page_freelist_lock(mnode);
	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
		oldbase = PAGE_COUNTERS_BASE(mnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		/*
		 * Take ownership of the preallocated array; the cache slot
		 * is reused below to remember the old array for freeing.
		 */
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				/* new range starts inside the old array */
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				/* old range starts inside the new array */
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;
		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
		color_cache[r] = NULL;
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
		}

		/*
		 * cache info for freeing out of the critical path; only
		 * arrays located within the kernel heap are remembered.
		 */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		if ((caddr_t)old_color_array >= kernelheap &&
		    (caddr_t)old_color_array < ekernelheap) {
			color_cache[r] = old_color_array;
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
	}
	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		if (color_cache[r] != NULL) {
			kmem_free(color_cache[r],
			    colors_per_szc[r] * sizeof (size_t));
		}
	}
	return (0);
}

/*
 * Convert color, a valid color index (bin) for page size code cur_szc,
 * to the corresponding color for new_szc.
 *
 * The color is shifted by the difference between the two sizes' page
 * shifts: right when moving to a larger size code, left when moving to a
 * smaller one.  It is returned unchanged when the size codes are equal.
 */
uint_t
page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
{
	if (cur_szc == new_szc)
		return (color);

	if (cur_szc < new_szc) {
		return (color >>
		    (page_get_shift(new_szc) - page_get_shift(cur_szc)));
	}

	return (color <<
	    (page_get_shift(cur_szc) - page_get_shift(new_szc)));
}

#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 *
 * DEBUG only consistency check.  The number of constituent pages is taken
 * from pp->p_szc.  For a single page only the size code and the
 * self-linked p_next/p_prev are checked.  Otherwise the root page must be
 * aligned to the page count, the constituent page_t's must be contiguous
 * both in memory and in pfn and linked in order through p_next/p_prev,
 * and every constituent page must have size code szc, be free and aged,
 * have no vnode, and agree with the root page on PP_ISNORELOC.
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	/* p_prev of the root is the last constituent page */
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		/* npgs is now the number of pages remaining after this one */
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode  == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

/*
 * Acquire every free list and cache list mutex of memory node mnode.
 * For each of the NPC_MUTEX slots, in ascending order, the FPC mutex is
 * taken before the CPC mutex.
 */
void
page_freelist_lock(int mnode)
{
	int	slot = 0;

	while (slot < NPC_MUTEX) {
		mutex_enter(FPC_MUTEX(mnode, slot));
		mutex_enter(CPC_MUTEX(mnode, slot));
		slot++;
	}
}

/*
 * Release every free list and cache list mutex of memory node mnode.
 * The mutexes are dropped slot by slot in ascending order, FPC before CPC,
 * i.e. in the same order in which page_freelist_lock() acquires them.
 */
void
page_freelist_unlock(int mnode)
{
	int	slot = 0;

	while (slot < NPC_MUTEX) {
		mutex_exit(FPC_MUTEX(mnode, slot));
		mutex_exit(CPC_MUTEX(mnode, slot));
		slot++;
	}
}

/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 *
 * pp must be a free, unmapped, base size (p_szc == 0) page and, except
 * when PG_LIST_ISINIT is set, must be held exclusively by the caller.
 * PG_FREE_LIST in flags selects the free list; otherwise the page goes on
 * the cache list.  The list is chosen by the page's bin, memory node and
 * mtype.  The page is still locked on return.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (ie. single
		 * threaded), add a page to the free list and add to the
		 * free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/*
		 * inline version of page_add()
		 *
		 * A non-empty list keeps its head and pp is linked in just
		 * before it.  NOTE(review): for an empty list p_next and
		 * p_prev of pp are left as they are; presumably the caller
		 * has them pointing at pp already -- confirm.
		 */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, pp, flags);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		/* advancing the head past pp leaves pp at the list tail */
		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_fill.
		 */
		page_ctr_add(pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	/* a NORELOC page counts towards the cage's free memory */
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}


#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 *
 * The page is unlinked from the MTYPE_RELOC free or cache list it is on,
 * marked NORELOC, and linked onto the matching MTYPE_NORELOC list.  A large
 * page is first demoted to base pages with page_boot_demote().  The page
 * list counts are moved with PLCNT_DECR/PLCNT_INCR and kcage_freemem is
 * incremented; the region counters maintained by page_ctr_add() and
 * page_ctr_sub() are not touched.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = PG_LIST_ISCAGE;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get list page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	/* an aged page is on the free list, any other on the cache list */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 * If pp is the head, the head is first advanced; if the head is
	 * still pp after that, pp was the only page on the list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/* LINTED */
	PLCNT_DECR(pp, mnode, 0, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	/* the mtype must be recomputed now that the page is NORELOC */
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 * A non-empty list keeps its head and pp is linked in just
	 * before it.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/* LINTED */
	PLCNT_INCR(pp, mnode, 0, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else	/* __sparc */

/*
 * Non-sparc stub: the cage startup path is only expected on sparc, so
 * reaching this routine on any other platform panics.
 */
/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

/*
 * Add the large page 'pp' to the freelist selected by its size code,
 * bin, memory node and mtype.  PG_CACHE_LIST and PG_LIST_TAIL are not
 * accepted here.
 *
 * With PG_LIST_ISINIT the page is asserted to have size code
 * mmu_page_sizes - 1 and not to be NORELOC; it is added without taking
 * the bin mutex and only PLCNT_INCR() accounting is done.
 *
 * Otherwise the page is added under the bin mutex, page_ctr_add() is
 * called, the cage free count is adjusted on sparc for NORELOC pages,
 * and finally every constituent page is unlocked.
 */
void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* only freelist/head insertion is handled by this routine */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pc_list_add_pages[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	/*
	 * Initialization-time path: no locking, no page unlocking.
	 */
	if ((flags & PG_LIST_ISINIT) != 0) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, pp->p_szc, flags);
		return;
	}

	/*
	 * Regular path: a genuine large page goes onto its list
	 * under the protection of the bin mutex.
	 */
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

	mutex_enter(pcm);
	page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	page_ctr_add(pp, PG_FREE_LIST);
	mutex_exit(pcm);

	pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
	if (PP_ISNORELOC(pp))
		kcage_freemem_add(pgcnt);
#endif

	/* Release the lock on each constituent page in turn. */
	i = 0;
	while (i < pgcnt) {
		page_unlock(pp);
		pp++;
		i++;
	}
}

/*
 * Boot-time helper: seg_kmem needs base pagesize pages for boot_alloc(),
 * so split the free, aged large page that 'pp' belongs to all the way
 * down to size code 0.  The pieces are left on the freelist (PC_FREE).
 */
void
page_boot_demote(page_t *pp)
{
	int	mnode;
	pfn_t	basepfn;
	uchar_t	szc;

	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/* The demotion starts at the first pfn of the large page. */
	szc = pp->p_szc;
	mnode = PP_2_MEM_NODE(pp);
	basepfn = PFN_BASE(pp->p_pagenum, szc);

	(void) page_demote(mnode, basepfn, szc, 0, PC_NO_COLOR, PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * 'flags' selects the list: PG_FREE_LIST for the freelist, otherwise the
 * cachelist.  The caller must hold pp exclusively locked and pp must be
 * free.
 *
 * NOTE: Only used for PAGESIZE pages.  If pp turns out to be part of a
 * large page on the freelist, that large page is demoted to PAGESIZE
 * pages first and pp is then removed.  A large page on the cachelist
 * causes a panic.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		/* A NORELOC page leaving the lists shrinks cage free memory. */
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing the freelist locks
	 * to avoid race with page_freelist_coalesce.
	 *
	 * bin and mtype are recomputed here because the values obtained
	 * above were for the page before it was demoted.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

/*
 * Take the free, aged page 'pp' off the freelist as a page of size code
 * at most 'szc'.
 *
 * The caller must hold pp exclusively locked.  If pp currently belongs
 * to a page larger than szc, that page is demoted to szc under the
 * whole-freelist lock before pp is removed.  On sparc the cage free
 * count is reduced when the removed page is NORELOC.
 */
void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto	try_again;
	}

	VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages1[pp->p_szc]);

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 *
	 * pcm is set to NULL on this path so that the unlock code
	 * below knows the whole freelist is held instead of one bin.
	 */
	if (pp->p_szc > szc) {
		VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages2[pp->p_szc]);
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages3[pp->p_szc]);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		/* The bin is recomputed after the page size may have changed. */
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	/*
	 * Large pages are removed with page_vpsub(), PAGESIZE pages
	 * with page_sub().
	 */
	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(pp, PG_FREE_LIST);

	/* Release whichever lock is held: the bin mutex or the freelist. */
	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Insert 'pp' at the head of the circular, doubly linked page list
 * anchored at '*ppp'.  The links used are p_next and p_prev.
 * Serializing access to the list is the caller's job.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	page_t *head = *ppp;

	if (head != NULL) {
		/* splice pp in between the old tail and the old head */
		pp->p_next = head;
		pp->p_prev = head->p_prev;
		head->p_prev = pp;
		pp->p_prev->p_next = pp;
	} else {
		/* empty list: pp becomes a ring of one */
		pp->p_next = pp->p_prev = pp;
	}

	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * On return pp is a list of one (its p_next and p_prev point to itself)
 * and *ppp is NULL if pp was the only element.  Panics if the list is
 * empty or pp is NULL.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	/*
	 * Validate the arguments before anything looks at pp: the
	 * PP_ISFREE() assertion below examines pp, so it must not run
	 * ahead of the NULL check.
	 */
	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	ASSERT(PP_ISFREE(pp));

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	/* If the head is still pp, pp was the only page on the list. */
	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Used by fsflush to coalesce the free list into larger pages a little
 * at a time: if the region of size code cur_szc + 1 that contains 'pp'
 * has a full page counter, promote it and leave the result on the
 * freelist.  The whole freelist of the memory node is locked meanwhile.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn = page_pptonum(pp);
	int mnode = PFN_2_MEM_NODE(pfn);
	int new_szc = cur_szc + 1;
	int full;
	int idx;

	full = FULL_REGION_CNT(new_szc);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full) {
		/* every constituent is free; build the larger page */
		(void) page_promote(mnode, pfn, new_szc, PC_FREE);
	}

	page_freelist_unlock(mnode);
}

/*
 * Failure counters for page_promote(): page_promote_err is incremented on
 * every failed promotion, page_promote_noreloc_err additionally when the
 * failure was caused by constituent pages that disagree on PP_ISNORELOC().
 */
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.  If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * NULL is also returned, with nothing promoted, when the constituent pages
 * do not all agree on PP_ISNORELOC() or when a cachelist page could not be
 * claimed (see fail_promote below).
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 * 	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		mtype;
	uint_t		noreloc;
	uint_t 		i;
	int 		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and what
	 * ever other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
		if (pp == start_pp) {
			/* First page, set requirement. */
			noreloc = PP_ISNORELOC(pp);
		} else if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock(pp);
				goto fail_promote;
			}

			/*
			 * Off the cachelist, out of the page hash, and
			 * marked aged so that it now looks like a
			 * freelist page.
			 */
			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 *
		 * npgs is consumed by the szc-rewrite loop, so the count
		 * is also kept in tmpnpgs for advancing pp afterwards.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 *
	 * Every page collected so far is put back on the PAGESIZE
	 * freelist with its p_szc reset to 0.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 *
 * mnode, pfnum and cur_szc identify the large page; new_szc (which must
 * be smaller than cur_szc) is the size code of the resulting pieces.
 *
 * Returns NULL unless flags is PC_ALLOC, in which case the first piece
 * whose bin equals 'color' and which could be locked with
 * page_trylock_cons(SE_EXCL) is returned instead of being put on the
 * freelist.  At most one piece is returned; all others go to the
 * freelist.
 */
page_t *
page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
    int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	/* Take the whole large page off its freelist first. */
	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	/* Peel pieces off the front of pplist until it is empty. */
	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(pp, PG_FREE_LIST);
			}
		} else {

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			CHK_LPG(pplist, new_szc);

			/*
			 * NOTE(review): pp, not pplist, is used for the
			 * lock attempt and as the return value below.
			 * This presumably relies on the list left in
			 * pplist by page_list_break() being circular with
			 * npgs entries, so that the walk above ends back
			 * at pplist - confirm.
			 */
			bin = PP_2_BIN(pplist);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(pplist, PG_FREE_LIST);
			}
			/* Continue with what page_list_break() split off. */
			pplist = npplist;
		}
	}
	return (ret_pp);
}

/*
 * When set to a non-zero value, page_freelist_coalesce() and
 * page_freelist_coalesce_all() return immediately without coalescing.
 */
int mpss_coalesce_disable = 0;

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * A returned page is the one handed back by page_promote(..., PC_ALLOC).
 * On sparc the cage free count is reduced when that page is NORELOC.
 *
 * The search walks the page counters of region size szc in steps of
 * color_stride, starting one stride past the index remembered in
 * PAGE_COUNTERS_CURRENT_COLOR() and wrapping around to 'color'.
 */
static page_t *
page_freelist_coalesce(int mnode, uchar_t szc, int color)
{
	int 	r;		/* region size */
	int 	idx, full, i;
	pfn_t	pfnum;
	size_t	len;
	size_t	buckets_to_check;
	pgcnt_t	cands;
	page_t	*ret_pp;
	int	color_stride;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);

	if (mpss_coalesce_disable) {
		return (NULL);
	}

	/* Nothing to do when there are no candidate regions for this color. */
	r = szc;
	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
		return (NULL);
	}
	full = FULL_REGION_CNT(r);
	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	len  = PAGE_COUNTERS_ENTRIES(mnode, r);
	buckets_to_check = len / color_stride;
	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
	ASSERT((idx % color_stride) == color);
	idx += color_stride;
	if (idx >= len)
		idx = color;
	for (i = 0; i < buckets_to_check; i++) {
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			pfnum = IDX_TO_PNUM(mnode, r, idx);
			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
			    pfnum < mem_node_config[mnode].physmax);
			/*
			 * RFE: For performance maybe we can do something less
			 *	brutal than locking the entire freelist. So far
			 * 	this doesn't seem to be a performance problem?
			 */
			page_freelist_lock(mnode);
			/*
			 * The counter was read without the freelist lock;
			 * check it again now that the lock is held.
			 */
			if (PAGE_COUNTERS(mnode, r, idx) != full) {
				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
				goto skip_this_one;
			}
			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
			if (ret_pp != NULL) {
				/* Remember where the next search continues. */
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
skip_this_one:
			page_freelist_unlock(mnode);
			/*
			 * No point looking for another page if we've
			 * already tried all of the ones that
			 * page_ctr_cands indicated.  Stash off where we left
			 * off.
			 * Note: this is not exact since we don't hold the
			 * page_freelist_locks before we initially get the
			 * value of cands for performance reasons, but should
			 * be a decent approximation.
			 */
			if (--cands == 0) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				break;
			}
		}
		/* Advance to the next counter of this color, wrapping. */
		idx += color_stride;
		if (idx >= len)
			idx = color;
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
	return (NULL);
}

/*
 * Promote as many small free pages of the given mnode to large pages
 * as the page counters allow.  Does nothing when mpss_coalesce_disable
 * is set.  All promoted pages are left on the freelist (PC_FREE).
 */
void
page_freelist_coalesce_all(int mnode)
{
	int 	r;		/* region size */
	int 	idx, full;
	pfn_t	pfnum;
	size_t	len;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable != 0)
		return;

	/*
	 * The whole freelist stays locked for the duration of the sweep.
	 *
	 * Region sizes are visited from the largest one downwards so
	 * that the number of page promotions is kept small.
	 */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	page_freelist_lock(mnode);

	for (r = mmu_page_sizes - 1; r > 0; r--) {
		pgcnt_t cands;

		PGCTRS_CANDS_GETVALUE(mnode, r, cands);
		if (cands != 0) {
			full = FULL_REGION_CNT(r);
			len  = PAGE_COUNTERS_ENTRIES(mnode, r);

			for (idx = 0; idx < len; idx++) {
				/* only completely free regions qualify */
				if (PAGE_COUNTERS(mnode, r, idx) != full)
					continue;

				pfnum = IDX_TO_PNUM(mnode, r, idx);
				ASSERT(pfnum >=
				    mem_node_config[mnode].physbase &&
				    pfnum <
				    mem_node_config[mnode].physmax);
				(void) page_promote(mnode, pfnum, r, PC_FREE);
			}
		} else {
			/* no candidates recorded for this region size */
			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
		}
	}

	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);
}

/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 * Returns a page of size code 'szc' obtained by demoting a larger free
 * page or by coalescing smaller ones, or NULL on failure.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is not PFNNULL, only large pages whose starting pfn is below
 * pfnhi are considered for demotion in step 1. The limit is not passed on
 * to the coalesce step.
 */
page_t *
page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
{
	uchar_t nszc = szc + 1;
	int 	bin;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;

	ASSERT(szc < mmu_page_sizes);

	/*
	 * First try to break up a larger page to fill
	 * current size freelist.
	 */
	while (nszc < mmu_page_sizes) {
		/*
		 * If page found then demote it.
		 */
		bin = page_convert_color(szc, nszc, color);
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 *
			 * The list was seen non-empty before the freelist
			 * was locked, so it may have been emptied in the
			 * meantime; pp must be checked before it is
			 * dereferenced.
			 */
			if (pp != NULL && pfnhi != PFNNULL &&
			    pp->p_pagenum >= pfnhi) {
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						/* went all the way around */
						pp = NULL;
						break;
					}
				} while (pp->p_pagenum >= pfnhi);
			}
			if (pp) {
				ASSERT(pp->p_szc == nszc);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pp->p_szc, szc, color, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}
		nszc++;
	}

	/*
	 * Ok that didn't work. Time to coalesce.
	 * PAGESIZE pages cannot be built from anything smaller.
	 */
	if (szc != 0) {
		ret_pp = page_freelist_coalesce(mnode, szc, color);
	}

	return (ret_pp);
}

/*
 * Freelist-internal helper that tries to lock a page in mode 'se'.
 * For a large page the attempt is all-or-nothing: either every
 * constituent page ends up locked, or none does.
 * Returns 1 on success, 0 on failure.
 */
static int
page_trylock_cons(page_t *pp, se_t se)
{
	page_t	*cur, *undo;

	/* The first (or only) page decides right away. */
	if (!page_trylock(pp, se))
		return (0);

	/* PAGESIZE page: nothing else to lock. */
	if (pp->p_szc == 0)
		return (1);

	/*
	 * Large page: follow p_next around the ring of constituent
	 * pages until we are back at pp.
	 */
	for (cur = pp->p_next; cur != pp; cur = cur->p_next) {
		if (page_trylock(cur, se))
			continue;

		/*
		 * Could not get this one; give back every lock
		 * acquired so far, from pp up to but excluding cur.
		 */
		for (undo = pp; undo != cur; undo = undo->p_next)
			page_unlock(undo);
		return (0);
	}

	return (1);
}

/*
 * Try to take a free page of size code 'szc' from the freelists of
 * 'mnode', starting with color 'bin'.
 *
 * On success the page has been locked with page_trylock_cons(SE_EXCL),
 * removed from its freelist and accounted for, and is returned.  NULL is
 * returned when nothing suitable was found, or when MTYPE_START() leaves
 * mtype negative.
 *
 * When the requested bin cannot satisfy the request, the routine tries,
 * in this order: equivalent bins (only with PG_MATCH_COLOR and a non-zero
 * cpucolors), page_freelist_fill(), and - unless PG_MATCH_COLOR is set -
 * the remaining color bins.  After that, on sparc the whole search may be
 * repeated with MTYPE_NORELOC; on other platforms it is repeated for the
 * next mtype when PGI_MT_RANGE is set.
 */
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	kmutex_t	*pcm;
	int		i, fill_tried, fill_marker;
	page_t		*pp, *first_pp;
	uint_t		bin_marker;
	int		colors, cpucolors;
	uchar_t		nszc;
	uint_t		nszc_color_shift;
	int		nwaybins = 0, nwaycnt;

	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
		return (NULL);
	}

	/*
	 * Set how many physical colors for this page size.
	 */
	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	nszc = MIN(szc + 1, mmu_page_sizes - 1);
	nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);

	/* cpu_page_colors is non-zero if a page color may be in > 1 bin */
	cpucolors = cpu_page_colors;

	/*
	 * adjust cpucolors to possibly check additional 'equivalent' bins
	 * to try to minimize fragmentation of large pages by delaying calls
	 * to page_freelist_fill.
	 */
	if (colorequiv > 1) {
		int equivcolors = colors / colorequiv;

		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
			cpucolors = equivcolors;
	}

	ASSERT(colors <= page_colors);
	ASSERT(colors);
	ASSERT((colors & (colors - 1)) == 0);

	ASSERT(bin < colors);

	/*
	 * Only hold one freelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
big_try_again:
	fill_tried = 0;
	nwaycnt = 0;
	for (i = 0; i <= colors; i++) {
try_again:
		ASSERT(bin < colors);
		/*
		 * The list head is first looked at without the bin mutex
		 * and then read again once the mutex is held.
		 */
		if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			if (pp != NULL) {
				/*
				 * These were set before the page
				 * was put on the free list,
				 * they must still be set.
				 */
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/*
				 * Walk down the hash chain.
				 * 8k pages are linked on p_next
				 * and p_prev fields. Large pages
				 * are a contiguous group of
				 * constituent pages linked together
				 * on their p_next and p_prev fields.
				 * The large pages are linked together
				 * on the hash chain using p_vpnext
				 * p_vpprev of the base constituent
				 * page of each large page.
				 */
				first_pp = pp;
				while (!page_trylock_cons(pp, SE_EXCL)) {
					if (szc == 0) {
						pp = pp->p_next;
					} else {
						pp = pp->p_vpnext;
					}

					ASSERT(PP_ISFREE(pp));
					ASSERT(PP_ISAGED(pp));
					ASSERT(pp->p_vnode == NULL);
					ASSERT(pp->p_hash == NULL);
					ASSERT(pp->p_offset == (u_offset_t)-1);
					ASSERT(pp->p_szc == szc);
					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
							mnode);

					/* Back at the start: nothing lockable. */
					if (pp == first_pp) {
						pp = NULL;
						break;
					}
				}

				if (pp) {
					ASSERT(mtype == PP_2_MTYPE(pp));
					ASSERT(pp->p_szc == szc);
					if (szc == 0) {
						page_sub(&PAGE_FREELISTS(mnode,
						    szc, bin, mtype), pp);
					} else {
						page_vpsub(&PAGE_FREELISTS(
						    mnode, szc, bin, mtype),
						    pp);
						CHK_LPG(pp, szc);
					}
					page_ctr_sub(pp, PG_FREE_LIST);

					if ((PP_ISFREE(pp) == 0) ||
					    (PP_ISAGED(pp) == 0))
						panic("free page is not. pp %p",
						    (void *)pp);
					mutex_exit(pcm);

#if defined(__sparc)
					ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
					    (flags & PG_NORELOC) == 0);

					if (PP_ISNORELOC(pp)) {
						pgcnt_t	npgs;

						npgs = page_get_pagecnt(szc);
						kcage_freemem_sub(npgs);
					}
#endif
					VM_STAT_ADD(vmm_vmstats.
					    pgmf_allocok[szc]);
					return (pp);
				}
			}
			mutex_exit(pcm);
		}

		/*
		 * Wow! The initial bin is empty.
		 * If specific color is needed, check if page color may be
		 * in other bins. cpucolors is:
		 *   0	if the colors for this cpu is equal to page_colors.
		 *	This means that pages with a particular color are in a
		 *	single bin.
		 *  -1	if colors of cpus (cheetah+) are heterogenous. Need to
		 *	first determine the colors for the current cpu.
		 *  >0	colors of all cpus are homogenous and < page_colors
		 */

		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
			/* nwaybins is worked out once, on first use. */
			if (!nwaybins) {
				/*
				 * cpucolors is negative if ecache setsizes
				 * are heterogenous. determine colors for this
				 * particular cpu.
				 */
				if (cpucolors < 0) {
					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
					ASSERT(cpucolors > 0);
					nwaybins = colors / cpucolors;
				} else {
					nwaybins = colors / cpucolors;
					ASSERT(szc > 0 || nwaybins > 1);
				}
				if (nwaybins < 2)
					cpucolors = 0;
			}

			if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
				nwaycnt++;
				bin = (bin + (colors / nwaybins)) &
				    (colors - 1);
				if (nwaycnt < nwaybins) {
					goto try_again;
				}
			}
			/* back to initial color if fall-thru */
		}

		/*
		 * color bins are all empty if color match. Try and satisfy
		 * the request by breaking up or coalescing pages from
		 * a different size freelist of the correct color that
		 * satisfies the ORIGINAL color requested. If that
		 * fails then try pages of the same size but different
		 * colors assuming we are not called with
		 * PG_MATCH_COLOR.
		 */
		if (!fill_tried) {
			fill_tried = 1;
			fill_marker = bin >> nszc_color_shift;
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    PFNNULL);
			if (pp != NULL) {
				return (pp);
			}
		}

		if (flags & PG_MATCH_COLOR)
			break;

		/*
		 * Select next color bin to try.
		 */
		if (szc == 0) {
			/*
			 * PAGESIZE page case.
			 */
			if (i == 0) {
				bin = (bin + BIN_STEP) & page_colors_mask;
				bin_marker = bin;
			} else {
				bin = (bin + vac_colors) & page_colors_mask;
				if (bin == bin_marker) {
					bin = (bin + 1) & page_colors_mask;
					bin_marker = bin;
				}
			}
		} else {
			/*
			 * Large page case.
			 */
			bin = (bin + 1) & (colors - 1);
		}
		/*
		 * If bin advanced to the next color bin of the
		 * next larger pagesize, there is a chance the fill
		 * could succeed.
		 */
		if (fill_marker != (bin >> nszc_color_shift))
			fill_tried = 0;
	}

#if defined(__sparc)
	if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
		(kcage_freemem >= kcage_lotsfree)) {
		/*
		 * The Cage is ON and with plenty of free mem, and
		 * we're willing to check for a NORELOC page if we
		 * couldn't find a RELOC page, so spin again.
		 */
		flags |= PG_NORELOC;
		mtype = MTYPE_NORELOC;
		goto big_try_again;
	}
#else
	if (flags & PGI_MT_RANGE) {
		/* cycle through range of mtypes */
		MTYPE_NEXT(mnode, mtype, flags);
		if (mtype >= 0)
			goto big_try_again;
	}
#endif
	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

	return (NULL);
}


/*
 * Estimate how many pages are free in the region of size code 'szc'
 * that starts at 'pp', using the page counters of 'mnode'.
 * The result is approximate: the page freelist locks are not held,
 * so the page_counters may change while they are being read.
 */
static int
page_freecnt(int mnode, page_t *pp, uchar_t szc)
{
	pgcnt_t	pgfree;
	pgcnt_t cnt;
	ssize_t	r = szc;	/* region size */
	ssize_t	idx;
	int	i;
	int	full, range;

	/* The starting pfn has to sit on a 'szc' boundary. */
	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
	ASSERT(szc > 0);

	/* Hold off anyone freeing the page_counters dynamic memory. */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
	cnt = PAGE_COUNTERS(mnode, r, idx);
	pgfree = cnt << PNUM_SHIFT(r - 1);
	range = FULL_REGION_CNT(szc);

	/*
	 * A completely full region needs no further work.  Otherwise
	 * descend through the smaller region sizes and add up what
	 * their counters report.
	 */
	if (cnt != range) {
		for (r--; r > 0; r--) {
			idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
			full = FULL_REGION_CNT(r);
			for (i = 0; i < range; i++, idx++) {
				cnt = PAGE_COUNTERS(mnode, r, idx);
				/*
				 * A full counter at this level has
				 * already been accounted for one
				 * level up.
				 */
				if (cnt == full)
					continue;
				pgfree += (cnt << PNUM_SHIFT(r - 1));
			}
			range *= full;
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	return (pgfree);
}

/*
 * Called from page_geti_contig_pages to take an exclusive lock on each
 * of the constituent pages of the 'szc' sized region starting at 'spp'.
 * Returns 1 with all pages locked, or 0 with none of them locked.
 *
 * If 'ptcpthreshold' is set and PGI_PGCPHIPRI is not given, the attempt
 * is abandoned up front unless the region already has at least
 * pgcnt / ptcpthreshold free pages.
 */
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
	pgcnt_t	pgcnt = PNUM_SIZE(szc);
	pgcnt_t pgfree, i;
	page_t *pp;

	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);

	if (ptcpthreshold != 0 && !(flags & PGI_PGCPHIPRI)) {
		/*
		 * Only go on to trylock when enough pages are free
		 * already.  The count is approximate because the page
		 * counters can change.
		 */
		pgfree = page_freecnt(mnode, spp, szc);
		if (pgfree < pgcnt/ptcpthreshold) {
			VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
			return (0);
		}
	}

	for (i = 0; i < pgcnt; i++) {
		pp = &spp[i];
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
			/* back out: unlock pages i - 1 down to 0 */
			while (i-- != 0) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock(pp);
			}
			return (0);
		}
		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);

		/*
		 * Give up on an in-use page that is part of a page
		 * larger than szc, or of a large page of exactly szc.
		 */
		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
		    !PP_ISFREE(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
			ASSERT(i == 0);
			page_unlock(pp);
			return (0);
		}

		if (PP_ISNORELOC(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
			/* back out: unlock pages i down to 0 */
			do {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock(pp);
			} while (i-- != 0);
			return (0);
		}
	}

	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
	return (1);
}

/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 *
 * Constituent pages are processed in order:
 *  - free pages are pulled off the free list or cache list;
 *  - in-use pages are relocated onto a replacement obtained from
 *    page_get_replacement_page().
 * Every claimed page gets p_szc set to 'szc' and is linked onto a local list.
 *
 * Returns:
 *  - 'pp' itself when removing it from the free list shows that it already
 *    is a free page of size 'szc';
 *  - the list of claimed constituent pages on success;
 *  - NULL when a replacement page could not be obtained or relocation
 *    failed.  In that case the not yet processed pages are unlocked, the
 *    already claimed pages are put back on the free list as size 0 pages
 *    and unlocked, and any unused replacement is freed.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t pgcnt, npgs, i;
	page_t *targpp, *rpp, *hpp;
	page_t *replpp = NULL;
	page_t *pplist = NULL;

	ASSERT(pp != NULL);

	/* pgcnt counts the constituent pages that still have to be claimed */
	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				if (pp->p_szc == szc) {
					/* already a free page of this size */
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				/*
				 * Claim the whole (smaller) free page that
				 * pp heads: mark each of its constituents
				 * with the target size and advance pp past it.
				 */
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			/*
			 * Free but not aged: a cache list page.  Take it off
			 * the cache list, hash it out and mark it aged before
			 * claiming it.
			 */
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, NULL);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		/* pp is in use: its contents have to be relocated */
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 *
		 * targpp is only assigned when a replacement was obtained;
		 * the replpp == NULL test short-circuits before it is used.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		/*
		 * hpp is only needed by the ASSERT(targpp == hpp) below, so
		 * it is assigned inside an ASSERT and costs nothing when
		 * ASSERTs are compiled out.
		 */
		/* LINTED */
		ASSERT(hpp = pp); /* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		/*
		 * Turn each relocated target page into a free, aged
		 * constituent of the new large page and release the
		 * replacement page that now stands in for it.
		 */
		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock(rpp);
		}
		/* the p_next walk is expected to end back at the head */
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}

/*
 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
 * of 0 means nothing left after trim.
 *
 * 'mseg' is the memseg being examined.  Whether its first and last page are
 * PP_ISNORELOC decides which part of it, if any, lies inside the cage:
 *
 *	first	last
 *	NORELOC	NORELOC	entire mseg in the cage, return 0
 *	NORELOC	-	lower part caged, usable range starts at the
 *			current cage pfn
 *	-	NORELOC	upper part caged, usable range ends at the
 *			current cage pfn
 *	-	-	entire mseg outside of the cage
 *
 * When 1 is returned, *lo and *hi are always clamped to the caller's
 * pfnlo-pfnhi, so the trimmed range never extends beyond the range that was
 * asked for.  The result may still be empty (*lo > *hi); callers are
 * expected to check for that.  *lo and *hi are left untouched when 0 is
 * returned.
 */

int
trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
{
	pfn_t	kcagepfn;
	int	decr;
	int	rc = 0;

	if (PP_ISNORELOC(mseg->pages)) {
		if (PP_ISNORELOC(mseg->epages - 1) == 0) {

			/* lower part of this mseg inside kernel cage */
			decr = kcage_current_pfn(&kcagepfn);

			/* kernel cage may have transitioned past mseg */
			if (kcagepfn >= mseg->pages_base &&
			    kcagepfn < mseg->pages_end) {
				ASSERT(decr == 0);
				/* never start below the requested range */
				*lo = MAX(kcagepfn, pfnlo);
				*hi = MIN(pfnhi,
				    (mseg->pages_end - 1));
				rc = 1;
			}
		}
		/* else entire mseg in the cage */
	} else {
		if (PP_ISNORELOC(mseg->epages - 1)) {

			/* upper part of this mseg inside kernel cage */
			decr = kcage_current_pfn(&kcagepfn);

			/* kernel cage may have transitioned past mseg */
			if (kcagepfn >= mseg->pages_base &&
			    kcagepfn < mseg->pages_end) {
				ASSERT(decr);
				/* never end above the requested range */
				*hi = MIN(kcagepfn, pfnhi);
				*lo = MAX(pfnlo, mseg->pages_base);
				rc = 1;
			}
		} else {
			/* entire mseg outside of kernel cage */
			*lo = MAX(pfnlo, mseg->pages_base);
			*hi = MIN(pfnhi, (mseg->pages_end - 1));
			rc = 1;
		}
	}
	return (rc);
}

/*
 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a
 * page with size code 'szc'. Claiming such a page requires acquiring
 * exclusive locks on all constituent pages (page_trylock_contig_pages),
 * relocating pages in use and concatenating these constituent pages into a
 * large page.
 *
 * The page lists do not have such a large page and page_freelist_fill has
 * already failed to demote larger pages and/or coalesce smaller free pages.
 *
 * 'flags' may specify PG_MATCH_COLOR which would limit the search of large
 * pages with the same color as 'bin'.
 *
 * 'pfnflag' specifies the subset of the pfn range to search.
 *
 * Returns the first page of the claimed large page, or NULL when no
 * candidate in the range could be locked and claimed.  The memsegs lock is
 * held for the duration of the search.
 */


static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, int pfnflag)
{
	struct memseg *mseg;
	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
	pgcnt_t szcpgmask = szcpgcnt - 1;
	pfn_t	randpfn;
	page_t *pp, *randpp, *endpp;
	uint_t colors;
	pfn_t hi, lo;
	uint_t skip;

	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));

	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
		return (NULL);

	ASSERT(szc < mmu_page_sizes);

	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	ASSERT(bin < colors);

	/*
	 * trim the pfn range to search based on pfnflag. pfnflag is set
	 * when there have been previous page_get_contig_page failures to
	 * limit the search.
	 *
	 * The high bit in pfnflag specifies the number of 'slots' in the
	 * pfn range and the remainder of pfnflag specifies which slot.
	 * For example, a value of 1010b would mean slot 2 (counting from 0)
	 * of the pfn range that has been divided into 8 slots.
	 */
	if (pfnflag > 1) {
		int	slots = 1 << (highbit(pfnflag) - 1);
		int	slotid = pfnflag & (slots - 1);
		pgcnt_t	szcpages;
		int	slotlen;

		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
		pfnhi = pfnhi & ~(szcpgcnt - 1);

		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;

		/*
		 * After alignment the range may be too small to hold even
		 * one 'szc' page (pfnlo == pfnhi).  The memseg loop below
		 * could not find a candidate in that case either, and
		 * szcpages is used as a divisor, so give up now.
		 */
		if (szcpages == 0)
			return (NULL);

		slotlen = howmany(szcpages, slots);
		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
		ASSERT(pfnlo < pfnhi);
		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
			pfnhi = pfnlo + (slotlen * szcpgcnt);
	}

	memsegs_lock(0);

	/*
	 * loop through memsegs to look for contig page candidates
	 */

	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
			/* no overlap */
			continue;
		}

		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
			/* mseg too small */
			continue;

		/* trim off kernel cage pages from pfn range */
		if (kcage_on) {
			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
				continue;
		} else {
			lo = MAX(pfnlo, mseg->pages_base);
			hi = MIN(pfnhi, (mseg->pages_end - 1));
		}

		/* round to szcpgcnt boundaries */
		lo = P2ROUNDUP(lo, szcpgcnt);
		hi = hi & ~(szcpgcnt - 1);

		if (hi <= lo)
			continue;

		/*
		 * set lo to point to the pfn for the desired bin. Large
		 * page sizes may only have a single page color
		 */
		if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
			uint_t	lobin;
			uint_t	eqcolors = colors;

			/*
			 * factor in colorequiv to check additional
			 * 'equivalent' bins.  This is done on a per-memseg
			 * copy: 'colors' itself must keep its value so that
			 * every memseg is searched with the same color
			 * granularity.
			 */
			if (colorequiv > 1 && eqcolors > colorequiv)
				eqcolors = eqcolors / colorequiv;

			/* determine bin that lo currently points to */
			lobin = (lo & ((szcpgcnt * eqcolors) - 1)) / szcpgcnt;

			/*
			 * set lo to point at appropriate color and set skip
			 * to arrive at the next szc page of the same color.
			 */
			lo += ((bin - lobin) & (eqcolors - 1)) * szcpgcnt;

			skip = eqcolors * szcpgcnt;
		} else {
			/* check all pages starting from lo */
			skip = szcpgcnt;
		}
		if (hi <= lo)
			/* mseg cannot satisfy color request */
			continue;

		/* randomly choose a point between lo and hi to begin search */

		randpfn = (pfn_t)GETTICK();
		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
		randpp = mseg->pages + (randpfn - mseg->pages_base);

		ASSERT(randpp->p_pagenum == randpfn);

		pp = randpp;
		endpp =  mseg->pages + (hi - mseg->pages_base);

		ASSERT(randpp + szcpgcnt <= endpp);

		/*
		 * Walk the candidates from the random starting point,
		 * wrapping around to lo at the end of the range, until the
		 * starting point is reached again.
		 */
		do {
			ASSERT(!(pp->p_pagenum & szcpgmask));
			ASSERT((flags & PG_MATCH_COLOR) == 0 ||
			    colorequiv > 1 ||
			    PP_2_BIN(pp) == bin);
			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
				/* pages unlocked by page_claim on failure */
				if (page_claim_contig_pages(pp, szc, flags)) {
					memsegs_unlock(0);
					return (pp);
				}
			}

			pp += skip;
			if (pp >= endpp) {
				/* start from the beginning */
				pp = mseg->pages + (lo - mseg->pages_base);
				ASSERT(pp->p_pagenum == lo);
				ASSERT(pp + szcpgcnt <= endpp);
			}
		} while (pp != randpp);
	}
	memsegs_unlock(0);
	return (NULL);
}


/*
 * controlling routine that searches through physical memory in an attempt to
 * claim a large page based on the input parameters.  It is used when no
 * suitable page could be found on the page free lists.
 *
 * calls page_geti_contig_pages with an initial pfn range from the mnode
 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
 * that overlaps with the kernel cage or does not match the requested page
 * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
 * page_geti_contig_pages may further limit the search range based on
 * previous failure counts (pgcpfailcnt[]).
 *
 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
 * pagesize page that satisfies mtype.
 *
 * Returns the claimed page, or NULL if the mnode has no memory in the mtype
 * range or no page could be claimed.
 */
page_t *
page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
	page_t		*pp;
	int		pfnflag = 0;	/* no limit on search if 0 */
	int		failcnt;

	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	/*
	 * High priority requests always search the whole range; everybody
	 * else is limited by the failure count when pgcplimitsearch is set.
	 */
	if (pgcplimitsearch && !(flags & PGI_PGCPHIPRI))
		pfnflag = pgcpfailcnt[szc];

	/*
	 * Color matching is dropped for high priority requests and for
	 * limited searches to improve the chances of success.
	 */
	if ((flags & PGI_PGCPHIPRI) || pfnflag != 0)
		flags &= ~PG_MATCH_COLOR;

	do {
		/* get pfn range based on mnode and mtype */
		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);

		ASSERT(pfnhi >= pfnlo);

		pp = page_geti_contig_pages(mnode, bin, szc, flags,
		    pfnlo, pfnhi, pfnflag);

		if (pp == NULL)
			continue;	/* try the next mtype, if any */

		/* success: halve the failure count to double the search size */
		failcnt = pgcpfailcnt[szc];
		if (failcnt != 0)
			pgcpfailcnt[szc] = failcnt >> 1;

		VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
		return (pp);
	/* LINTED */
	} while ((flags & PGI_MT_RANGE) &&
	    (MTYPE_NEXT(mnode, mtype, flags) >= 0));

	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
	return (NULL);
}


/*
 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking and accounting.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 *
 * Finds a page, removes it, THEN locks it.
 *
 * 'size' is converted to a page size code with page_szc(); an illegal size
 * panics.  'lgrp' selects where to look first; when it does not exist the
 * current thread's home lgroup is used.
 *
 * Search order:
 *  1. free lists of the memory nodes local to the lgroup;
 *  2. for plain PAGESIZE requests (szc 0 without PGI_PGCPSZC0) stop here and
 *     return NULL so that the caller can go on to page_get_cachelist();
 *  3. free lists of the remaining memory nodes;
 *  4. if allowed, repeat 1 and 3 using page_get_contig_pages() instead of
 *     page_get_mnode_freelist().
 *
 * Returns the page, or NULL if none could be found.
 */

/*ARGSUSED*/
page_t *
page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
	struct as	*as = seg->s_as;
	page_t		*pp = NULL;
	ulong_t		bin;
	uchar_t		szc;
	int		mnode;
	int		mtype;
	/* per-mnode allocator used by the search loops; switched on retry */
	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
	lgrp_mnode_cookie_t	lgrp_cookie;

	page_get_func = page_get_mnode_freelist;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (kcage_on) {
		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
		    kcage_freemem < kcage_throttlefree + btop(size) &&
		    curthread != kcage_cageout_thread) {
			/*
			 * Set a "reserve" of kcage_throttlefree pages for
			 * PG_PANIC and cageout thread allocations.
			 *
			 * Everybody else has to serialize in
			 * page_create_get_something() to get a cage page, so
			 * that we don't deadlock cageout!
			 */
			return (NULL);
		}
	} else {
		/* no cage: a NORELOC request is treated like any other */
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags);

	/*
	 * Convert size to page size code.
	 */
	if ((szc = page_szc(size)) == (uchar_t)-1)
		panic("page_get_freelist: illegal page size request");
	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/* bin is for base pagesize color - convert if larger pagesize. */
	if (szc)
		bin = page_convert_color(0, szc, bin);

	/*
	 * Try to get a local page first, but try remote if we can't
	 * get a page of the right color.
	 */
pgretry:
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}
	ASSERT(pp == NULL);

	/*
	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
	 * remote free lists.  Caller expected to call page_get_cachelist which
	 * will check local cache lists and remote free lists.
	 */
	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try to get a non-local freelist page.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
			return (pp);
		}
	}

	ASSERT(pp == NULL);

	/*
	 * when the cage is off chances are page_get_contig_pages() will fail
	 * to lock a large page chunk therefore when the cage is off it's not
	 * called by default.  this can be changed via /etc/system.
	 *
	 * page_get_contig_pages() also called to acquire a base pagesize page
	 * for page_create_get_something().
	 *
	 * The page_get_func comparison makes sure the retry happens at most
	 * once per call.
	 */
	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
	    (page_get_func != page_get_contig_pages)) {

		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
		page_get_func = page_get_contig_pages;
		goto pgretry;
	}

	/*
	 * Remember that the contig search failed; page_get_contig_pages()
	 * uses this count to limit later searches.
	 */
	if (pgcplimitsearch && page_get_func == page_get_contig_pages)
		pgcpfailcnt[szc]++;

	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
	return (NULL);
}

/*
 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 * Otherwise, scan the bins for ones with pages.  For each bin with pages,
 * try to lock one of them.  If no page can be locked, try the
 * next bin.  Return NULL if a page can not be found and locked.
 *
 * Finds a page, tries to lock it, then removes it.
 *
 * The cache lists of the memory nodes local to 'lgrp' are tried first.
 * After that, each remaining memory node is tried in turn, first its free
 * list and then its cache list.
 */

/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	struct as	*as = seg->s_as;
	page_t		*found;
	ulong_t		bin;
	int		mtype;
	/*LINTED*/
	int		mnode;
	lgrp_mnode_cookie_t	cookie;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* without a cage a NORELOC request is treated like any other */
	if (!kcage_on) {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	/*
	 * Reserve kcage_throttlefree pages for critical kernel
	 * threads.
	 *
	 * Everybody else has to go to page_create_get_something()
	 * to get a cage page, so we don't deadlock cageout.
	 */
	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
	    kcage_freemem <= kcage_throttlefree)
		return (NULL);

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	ASSERT(bin <= page_colors_mask);

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first
	 */
	LGRP_MNODE_COOKIE_INIT(cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&cookie)) >= 0) {
		found = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (found == NULL)
			continue;

		VM_STAT_ADD(vmm_vmstats.pgc_allocok);
		DTRACE_PROBE4(page__get,
		    lgrp_t *, lgrp,
		    int, mnode,
		    ulong_t, bin,
		    uint_t, flags);
		return (found);
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(cookie);
	while ((mnode = lgrp_memnode_choose(&cookie)) >= 0) {
		found = page_get_mnode_freelist(mnode, bin, mtype, 0, flags);
		if (found != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
		} else {
			found = page_get_mnode_cachelist(bin, flags, mnode,
			    mtype);
			if (found == NULL)
				continue;
			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
		}

		DTRACE_PROBE4(page__get,
		    lgrp_t *, lgrp,
		    int, mnode,
		    ulong_t, bin,
		    uint_t, flags);
		return (found);
	}

	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
	return (NULL);
}

/*
 * Try to take one page off the cache lists of memory node 'mnode'.
 *
 * 'bin' is the color bin to start with, 'mtype' the memory type and 'flags'
 * the page allocation flags (PG_MATCH_COLOR, PG_NORELOC, PGI_* ...).
 *
 * For each candidate bin the bin mutex is taken and the list is walked until
 * a page can be locked SE_EXCL with page_trylock().  That page is removed
 * from the list, the page counters are adjusted and the page is returned
 * still locked.  If nothing can be locked the next bin is tried.
 *
 * Once all bins have been tried the whole search may be repeated for another
 * mtype (see the platform specific code at the end).
 *
 * Returns the locked page, or NULL if the mnode has no memory in the mtype
 * range or no page could be found and locked.
 */
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint_t		bin_marker;
	int		nwaybins, nwaycnt;
	int		cpucolors;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

	/* nwaybins == 0 means "not computed yet", see below */
	nwaybins = 0;
	cpucolors = cpu_page_colors;
	/*
	 * adjust cpucolors to possibly check additional 'equivalent' bins
	 * to try to minimize fragmentation of large pages by delaying calls
	 * to page_freelist_fill.
	 */
	if (colorequiv > 1) {
		int equivcolors = page_colors / colorequiv;

		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
			cpucolors = equivcolors;
	}

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */

big_try_again:
	nwaycnt = 0;
	for (i = 0; i <= page_colors; i++) {
		/*
		 * Unlocked peek first; the list head is read again once the
		 * bin mutex is held.
		 */
		if (PAGE_CACHELISTS(mnode, bin, mtype)) {
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp != NULL) {
				first_pp = pp;
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				while (!page_trylock(pp, SE_EXCL)) {
					pp = pp->p_next;
					ASSERT(pp->p_szc == 0);
					if (pp == first_pp) {
						/*
						 * We have searched the
						 * complete list!
						 * And all of them (might
						 * only be one) are locked.
						 * This can happen since
						 * these pages can also be
						 * found via the hash list.
						 * When found via the hash
						 * list, they are locked
						 * first, then removed.
						 * We give up to let the
						 * other thread run.
						 */
						pp = NULL;
						break;
					}
					ASSERT(pp->p_vnode);
					ASSERT(PP_ISFREE(pp));
					ASSERT(PP_ISAGED(pp) == 0);
					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
							mnode);
				}

				if (pp) {
					page_t	**ppp;
					/*
					 * Found and locked a page.
					 * Pull it off the list.
					 */
					ASSERT(mtype == PP_2_MTYPE(pp));
					ppp = &PAGE_CACHELISTS(mnode, bin,
					    mtype);
					page_sub(ppp, pp);
					/*
					 * Subtract counters before releasing
					 * pcm mutex to avoid a race with
					 * page_freelist_coalesce and
					 * page_freelist_fill.
					 */
					page_ctr_sub(pp, PG_CACHE_LIST);
					mutex_exit(pcm);
					ASSERT(pp->p_vnode);
					ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
					ASSERT(!kcage_on ||
					    (flags & PG_NORELOC) == 0 ||
					    PP_ISNORELOC(pp));
					/* a cage page leaves the free pool */
					if (PP_ISNORELOC(pp)) {
						kcage_freemem_sub(1);
					}
#endif
					VM_STAT_ADD(vmm_vmstats.
					    pgmc_allocok);
					return (pp);
				}
			}
			mutex_exit(pcm);
		}

		/*
		 * Wow! The initial bin is empty or no page in the bin could
		 * be locked.
		 *
		 * If specific color is needed, check if page color may be in
		 * other bins.
		 */
		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
			/* compute nwaybins the first time through only */
			if (!nwaybins) {
				if (cpucolors < 0) {
					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
					ASSERT(cpucolors > 0);
					nwaybins = page_colors / cpucolors;
					if (nwaybins < 2)
						cpucolors = 0;
				} else {
					nwaybins = page_colors / cpucolors;
					ASSERT(nwaybins > 1);
				}
			}

			/* stop once every one of the nwaybins was visited */
			if (++nwaycnt >= nwaybins) {
				break;
			}
			bin = (bin + (page_colors / nwaybins)) &
			    page_colors_mask;
			continue;
		}

		/*
		 * Any color will do: advance to another bin.  bin_marker
		 * remembers where the current run of vac_colors sized steps
		 * started; when the run comes back to it, move on by one bin
		 * and start a new run.
		 */
		if (i == 0) {
			bin = (bin + BIN_STEP) & page_colors_mask;
			bin_marker = bin;
		} else {
			bin = (bin + vac_colors) & page_colors_mask;
			if (bin == bin_marker) {
				bin = (bin + 1) & page_colors_mask;
				bin_marker = bin;
			}
		}
	}

#if defined(__sparc)
	if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
		(kcage_freemem >= kcage_lotsfree)) {
		/*
		 * The Cage is ON and with plenty of free mem, and
		 * we're willing to check for a NORELOC page if we
		 * couldn't find a RELOC page, so spin again.
		 */
		flags |= PG_NORELOC;
		mtype = MTYPE_NORELOC;
		goto big_try_again;
	}
#else
	if (flags & PGI_MT_RANGE) {
		/* cycle through range of mtypes */
		MTYPE_NEXT(mnode, mtype, flags);
		if (mtype >= 0)
			goto big_try_again;
	}
#endif
	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}

/*
 * Statistics for page_get_replacement_page().  They are only collected
 * when REPL_PAGE_STATS is defined, which is the case for DEBUG kernels.
 */
#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;		/* calls to page_get_replacement_page */
	uint_t	ngets_noreloc;	/* ... where like_pp was a NORELOC page */
	uint_t	npgr_noreloc;	/* ... where PGR_NORELOC was requested */
	uint_t	nnopage_first;	/* not incremented in this part of the file */
	uint_t	nnopage;	/* requests that could not be satisfied */
	uint_t	nhashout;	/* cachelist pages hashed out for reuse */
	uint_t	nnofree;	/* local freelist had no suitable page */
	uint_t	nnext_pp;	/* replacement pages handed out */
} repl_page_stats;
/* atomically bump one of the counters above */
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
/* statistics compiled out: expands to nothing */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

/*
 * When non-zero, page_get_replacement_page() tries page_get_contig_pages()
 * for a large replacement page even if PGR_SAMESZC was not requested.
 */
int	pgrppgcp;

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp,
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 *
 * 'orig_like_pp' is the (exclusively locked) page to be replaced; the
 * replacement covers the whole large page it belongs to.
 * 'lgrp_target', if it exists, restricts the search to that lgroup.
 * Otherwise the search starts at the memory node of the page and then
 * widens to the other memory nodes.
 * 'pgrflags' may contain:
 *	PGR_NORELOC	ask for NORELOC replacement pages
 *	PGR_SAMESZC	only accept a replacement of the same size; no fall
 *			back to PAGESIZE pages (always set for kernel pages)
 *
 * Returns a list of replacement pages covering as many base pages as the
 * original, with the free and aged bits cleared, or NULL if not enough
 * pages could be found.  On failure everything collected so far is given
 * back with page_free_replacement_page().
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;
	/*
	 * lgrp is only assigned on some of the search paths below but is
	 * always handed to the page__get probe, so it must start out with a
	 * defined value.
	 */
	lgrp_t		*lgrp = NULL;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (like_pp->p_vnode == &kvp)
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode);

	/*
	 * Each pass of the outer loop obtains one replacement page of the
	 * current size code 'szc' (or gives up).  npgs counts the base pages
	 * that still need a replacement.
	 */
	while (npgs) {
		pplist = NULL;
		/*
		 * The inner loop runs at most twice: once for the original
		 * size and, if that fails and is allowed, once more for
		 * PAGESIZE pages (szc == 0).
		 */
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 *	 lgroup may disappear and reappear since there
			 *	 is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_mnode_freelist(
						mnode, bin, mtype, szc,
						    flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_mnode_cachelist(
						bin, flags, mnode, mtype);
				}
				if (pplist != NULL) {
					/*
					 * A cachelist page still has an
					 * identity; drop it and mark the
					 * page aged.
					 */
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or lgroup was removed by
			 * DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it for
			 * larger pages cause page_freelist_coalesce() already
			 * failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;


			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of the for loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
					lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
						mnode, bin, mtype, szc,
						    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			/*
			 * Move the pg_cnt constituent pages of the page just
			 * obtained onto the result list, marking them in use.
			 */
			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			/* the list must have held exactly pg_cnt pages */
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}

/*
 * Demote the free large page that 'pp' belongs to into its constituent
 * pages.  'pp' must be locked, free and part of a large page on entry; on
 * return its size code is 0.
 */
void
page_demote_free_pages(page_t *pp)
{
	int	mnode;
	pfn_t	basepfn;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);

	page_freelist_lock(mnode);
	/* re-check the size code now that the freelist locks are held */
	if (pp->p_szc != 0) {
		basepfn = PFN_BASE(pp->p_pagenum, pp->p_szc);
		(void) page_demote(mnode, basepfn, pp->p_szc, 0,
		    PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);

	ASSERT(pp->p_szc == 0);
}