/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#ifndef	_VM_DEP_H
#define	_VM_DEP_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#ifdef	__cplusplus
extern "C" {
#endif

#include <sys/clock.h>
#include <vm/hat_pte.h>

/*
 * WARNING: vm_dep.h is included by files in common. As such, macros
 * dependent upon PTE36 such as LARGEPAGESIZE cannot be used in this file.
 */

/* GETTICK() is implemented on this platform as a call to tsc_read(). */
#define	GETTICK()	tsc_read()

/* memranges in descending order */
extern pfn_t		*memranges;

/*
 * MEMRANGEHI(mtype) - highest pfn covered by memory range `mtype'.
 *	Range 0 is the topmost range and ends at physmax; every other
 *	range ends one pfn below memranges[mtype - 1], the start of the
 *	next-higher range (memranges[] is kept in descending order).
 * MEMRANGELO(mtype) - lowest pfn of memory range `mtype'.
 *
 * The argument is parenthesized at every use so that expression
 * arguments (e.g. a conditional expression) expand with the intended
 * precedence.  Note that MEMRANGEHI evaluates its argument twice.
 */
#define	MEMRANGEHI(mtype)						\
	(((mtype) > 0) ? memranges[(mtype) - 1] - 1 : physmax)
#define	MEMRANGELO(mtype)	(memranges[(mtype)])

/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 */
/*
 * One entry of mnoderanges[]; the page list macros index that array by
 * mtype.  The DEBUG-only members are maintained by PLCNT_INIT and
 * PLCNT_DO.
 */
typedef struct {
	/* pfn bounds of this entry, as returned by MNODETYPE_2_PFN */
	pfn_t	mnr_pfnlo;
	pfn_t	mnr_pfnhi;
	/* memory node of this entry; MNODETYPE_2_PFN asserts it matches */
	int	mnr_mnode;
	int	mnr_memrange;		/* index into memranges[] */
#ifdef DEBUG
	/* maintain page list stats */
	pgcnt_t	mnr_mt_pgmax;		/* mnode/mtype max page cnt */
	pgcnt_t	mnr_mt_pgcnt;		/* free cnt */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list free cnt */
	/*
	 * PLCNT_INIT points mnr_mts at an array of mmu_page_sizes elements
	 * (indexed by szc) and mnr_mtsc_pgcnt at an array of mnr_mts_colors
	 * counters (indexed by bin).
	 */
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;
		int	mnr_mts_colors;
		pgcnt_t *mnr_mtsc_pgcnt;
	} 	*mnr_mts;
#endif
} mnoderange_t;

#ifdef DEBUG
/*
 * DEBUG-only page list statistics, kept per mnoderanges[] entry.
 * In non-DEBUG kernels all three macros expand to nothing.
 *
 * Macro arguments are parenthesized wherever they are combined with an
 * operator so that expression arguments (for instance flags built with
 * `|') expand with the intended precedence.
 */

/*
 * PLCNT_SZ(ctrs_sz) - add to the lvalue ctrs_sz the number of bytes
 * PLCNT_INIT will consume: for every mnoderanges[] entry one
 * struct mnr_mts per page size plus one pgcnt_t per color of each
 * page size.
 */
#define	PLCNT_SZ(ctrs_sz) {						\
	int	szc, colors;						\
	(ctrs_sz) += mnoderangecnt * sizeof (struct mnr_mts) *		\
	    mmu_page_sizes;						\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		colors = page_get_pagecolors(szc);			\
		(ctrs_sz) += mnoderangecnt * sizeof (pgcnt_t) * colors;	\
	}								\
}

/*
 * PLCNT_INIT(addr) - carve the per-szc and per-color counter arrays for
 * every mnoderanges[] entry out of the memory at addr.  addr must be a
 * modifiable byte-granular pointer; it is advanced past the consumed
 * memory.
 */
#define	PLCNT_INIT(addr) {						\
	int	mt, szc, colors;					\
	for (mt = 0; mt < mnoderangecnt; mt++) {			\
		mnoderanges[mt].mnr_mts = (struct mnr_mts *)(addr);	\
		(addr) += (sizeof (struct mnr_mts) * mmu_page_sizes);	\
		for (szc = 0; szc < mmu_page_sizes; szc++) {		\
			colors = page_get_pagecolors(szc);		\
			mnoderanges[mt].mnr_mts[szc].mnr_mts_colors =	\
			    colors;					\
			mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =	\
			    (pgcnt_t *)(addr);				\
			(addr) += (sizeof (pgcnt_t) * colors);		\
		}							\
	}								\
}

/*
 * PLCNT_DO(pp, mtype, szc, cnt, flags) - apply the signed page count
 * cnt to the statistics of mnoderanges[mtype]:
 *   - mnr_mt_pgmax only when PG_LIST_ISINIT is set (plain add, not atomic)
 *   - mnr_mt_pgcnt always
 *   - mnr_mt_clpgcnt only when PG_CACHE_LIST is set
 *   - the per-szc total and the per-szc, per-bin counter, where the bin
 *     is PP_2_BIN(pp)
 */
#define	PLCNT_DO(pp, mtype, szc, cnt, flags) {				\
	int	bin = PP_2_BIN(pp);					\
	if ((flags) & PG_LIST_ISINIT)					\
		mnoderanges[mtype].mnr_mt_pgmax += (cnt);		\
	atomic_add_long(&mnoderanges[mtype].mnr_mt_pgcnt, (cnt));	\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt,	\
		    (cnt));						\
	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].		\
	    mnr_mts_pgcnt, (cnt));					\
	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].		\
	    mnr_mtsc_pgcnt[bin], (cnt));				\
}
#else
#define	PLCNT_SZ(ctrs_sz)
#define	PLCNT_INIT(base)
#define	PLCNT_DO(pp, mtype, szc, cnt, flags)
#endif

/*
 * PLCNT_INCR(pp, mnode, szc, flags) - account for a page of size code szc
 * being added to a page list.
 *
 * Adds the number of base pages in the large page (1 << PAGE_BSZS_SHIFT)
 * to mem_node_config[mnode].cursize and, when the 4g management code is
 * enabled and the page's mtype is at or below mtype4g, to freemem4g.
 * When PG_LIST_ISINIT is set the same count is also added (non-atomically)
 * to maxmem4g.  The DEBUG statistics are updated through PLCNT_DO.
 *
 * flags is parenthesized both in the test below and when handed to
 * PLCNT_DO so that an expression argument keeps its intended precedence.
 */
#define	PLCNT_INCR(pp, mnode, szc, flags) {				\
	long	cnt = (1L << PAGE_BSZS_SHIFT(szc));			\
	int	mtype = PP_2_MTYPE(pp);					\
	atomic_add_long(&mem_node_config[mnode].cursize, cnt);		\
	if (physmax4g && mtype <= mtype4g)				\
		atomic_add_long(&freemem4g, cnt);			\
	if ((flags) & PG_LIST_ISINIT) {					\
		if (physmax4g && mtype <= mtype4g)			\
			maxmem4g += cnt;				\
	}								\
	PLCNT_DO(pp, mtype, szc, cnt, (flags));				\
}

/*
 * PLCNT_DECR(pp, mnode, szc, flags) - account for a page of size code szc
 * being removed from a page list.
 *
 * Subtracts the number of base pages in the large page from
 * mem_node_config[mnode].cursize and, when the 4g management code is
 * enabled and the page's mtype is at or below mtype4g, from freemem4g.
 * The DEBUG statistics are updated through PLCNT_DO.
 *
 * The negative count is formed by negating a positive shift result;
 * left-shifting a negative value is undefined behavior in C.
 */
#define	PLCNT_DECR(pp, mnode, szc, flags) {				\
	long	cnt = -(1L << PAGE_BSZS_SHIFT(szc));			\
	int	mtype = PP_2_MTYPE(pp);					\
	atomic_add_long(&mem_node_config[mnode].cursize, cnt);		\
	if (physmax4g && mtype <= mtype4g)				\
		atomic_add_long(&freemem4g, cnt);			\
	PLCNT_DO(pp, mtype, szc, cnt, (flags));				\
}

/*
 * mnoderanges[] holds mnoderangecnt entries and is indexed by mtype in
 * the page list macros.
 */
extern mnoderange_t	*mnoderanges;
extern int		mnoderangecnt;
/* pages whose mtype is <= mtype4g are counted in freemem4g/maxmem4g */
extern int		mtype4g;

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * the BIOS may reserve large chunks of space below 4g for hot plug pci
 * devices, agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of free memory above 4g (freemem - freemem4g) is greater
 * than freemem4g. In this case, page_get_* routines will restrict below 4g
 * allocations for requests that don't specifically require it.
 */

extern int		physmax4g;
extern pgcnt_t		maxmem4g;
extern pgcnt_t		freemem4g;
extern int		lotsfree4gshift;
extern int		desfree4gshift;
/* thresholds expressed as maxmem4g divided by a power of two */
#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

/*
 * True when the 4g code is enabled (physmax4g), fewer than DESFREE4G
 * pages are free below 4g, and less than half of freemem is below 4g.
 */
#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

/* when set, the i386 MTYPE_INIT starts kernel heap requests at mtype4g */
extern int		restricted_kmemalloc;
/* NOTE(review): presumably maps a pfn to its memranges[] index - confirm */
extern int		memrange_num(pfn_t);
/* maps a pfn to its mtype; used by PP_2_MTYPE */
extern int		pfn_2_mtype(pfn_t);
/* mtype iterator behind MTYPE_START and MTYPE_NEXT */
extern int		mtype_func(int, int, uint_t);

#define	NUM_MEM_RANGES	4		/* memory range types */

/*
 * Per page size free lists. Allocated dynamically.
 * dimensions [mtype][mmu_page_sizes][colors]
 *
 * mtype specifies a physical memory range with a unique mnode.
 */

extern page_t ****page_freelists;

/*
 * Free list head (an lvalue) for the given mtype, page size code and
 * color.  The mnode argument is not used by the expansion.
 */
#define	PAGE_FREELISTS(mnode, szc, color, mtype)		\
	(page_freelists[mtype][szc][color])

/*
 * For now there is only a single size cache list. Allocated dynamically.
 * dimensions [mtype][colors]
 *
 * mtype specifies a physical memory range with a unique mnode.
 */
extern page_t ***page_cachelists;

/*
 * Cache list head (an lvalue) for the given mtype and color.  The mnode
 * argument is not used by the expansion.
 */
#define	PAGE_CACHELISTS(mnode, color, mtype) 		\
	(page_cachelists[mtype][color])

/*
 * There are mutexes for both the page freelist
 * and the page cachelist.  We want enough locks to make contention
 * reasonable, but not too many -- otherwise page_freelist_lock() gets
 * so expensive that it becomes the bottleneck!
 */

/*
 * Number of lock buckets per list type.  PC_BIN_MUTEX selects a bucket
 * with `bin & (NPC_MUTEX - 1)', so this must remain a power of two.
 */
#define	NPC_MUTEX	16

/*
 * fpc_mutex[] protects the free lists and cpc_mutex[] the cache lists
 * (see PC_BIN_MUTEX).  Each of the NPC_MUTEX slots points to an array
 * that is indexed by mnode.
 */
extern kmutex_t	*fpc_mutex[NPC_MUTEX];
extern kmutex_t	*cpc_mutex[NPC_MUTEX];

/* NOTE(review): argument meanings are not visible in this header */
extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t);
extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);

/*
 * Page-to-attribute helpers.  `pp' is a page_t pointer expression; it is
 * parenthesized before `->' is applied so that arguments such as `&pg'
 * or `pp + 1' expand correctly.  Several of these macros evaluate `pp'
 * more than once.
 */

/* Find the bin for the given page if it was of size szc */
#define	PP_2_BIN_SZC(pp, szc)						\
	((((pp)->p_pagenum) & page_colors_mask) >>			\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift))

/* Bin of the page at its current size code */
#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, (pp)->p_szc))

#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
#define	PP_2_MTYPE(pp)		(pfn_2_mtype((pp)->p_pagenum))
#define	PP_2_SZC(pp)		((pp)->p_szc)

/* Number of base pages in a page of size code szc (an int) */
#define	SZCPAGES(szc)		(1 << PAGE_BSZS_SHIFT(szc))
/*
 * Round pfnum down to a multiple of SZCPAGES(szc).  The mask is a
 * negative int, which sign-extends when pfnum is a wider type.
 */
#define	PFN_BASE(pfnum, szc)	((pfnum) & ~(SZCPAGES(szc) - 1))

#if defined(__amd64)

/*
 * set the mtype range (called from page_get_{free,cache}list)
 *   - set range to above 4g if the system has more than 4g of memory and the
 *   amount of memory below 4g runs low otherwise set range to all of memory
 *   starting from the hi pfns.
 *
 * page_get_anylist gets its mtype range from the specified ddi_dma_attr_t.
 *
 * mtype and flags must be modifiable lvalues: mtype is set to the last
 * mnoderanges[] index and one of the PGI_MT_RANGE* bits is or'ed into
 * flags.  vp and vaddr are not used by this variant.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags) {				\
	mtype = mnoderangecnt - 1;					\
	if (RESTRICT4G_ALLOC) {						\
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);			\
		/* here only for > 4g systems */			\
		flags |= PGI_MT_RANGE4G;				\
	} else {							\
		flags |= PGI_MT_RANGE0;					\
	}								\
}

#elif defined(__i386)

/*
 * set the mtype range
 *   - kmem requests needs to be below 4g if restricted_kmemalloc is set.
 *   - for non kmem requests, set range to above 4g if the amount of memory
 *   below 4g runs low.
 *
 * A request is treated as a kmem request when vp is &kvp and vaddr lies
 * in [kernelheap, ekernelheap); such requests start at mtype4g with
 * PGI_MT_RANGE0.  mtype and flags must be modifiable lvalues.
 */

#define	MTYPE_INIT(mtype, vp, vaddr, flags) {				\
	if (restricted_kmemalloc && (vp) == &kvp &&			\
	    (caddr_t)(vaddr) >= kernelheap &&				\
	    (caddr_t)(vaddr) < ekernelheap) {				\
		ASSERT(physmax4g);					\
		mtype = mtype4g;					\
		flags |= PGI_MT_RANGE0;					\
	} else {							\
		mtype = mnoderangecnt - 1;				\
		if (RESTRICT4G_ALLOC) {					\
			VM_STAT_ADD(vmm_vmstats.restrict4gcnt);		\
			/* here only for > 4g systems */		\
			flags |= PGI_MT_RANGE4G;			\
		} else {						\
			flags |= PGI_MT_RANGE0;				\
		}							\
	}								\
}

#endif	/* __i386 */

/*
 * macros to loop through the mtype range (page_get_mnode_{free,cache,any}list,
 * and page_get_contig_pages)
 *
 * MTYPE_START sets the initial mtype. -1 if the mtype range specified does
 * not contain mnode.
 *
 * MTYPE_NEXT sets the next mtype. -1 if there are no more valid
 * mtype in the range.
 */

/*
 * Both macros assign the result of mtype_func() back to mtype, which
 * must therefore be a modifiable lvalue, and yield that value.
 * MTYPE_NEXT or's PGI_MT_NEXT into the flags it passes on; flags is
 * parenthesized so that an expression argument keeps its precedence.
 */
#define	MTYPE_START(mnode, mtype, flags)				\
	((mtype) = mtype_func(mnode, mtype, flags))

#define	MTYPE_NEXT(mnode, mtype, flags)					\
	((mtype) = mtype_func(mnode, mtype, (flags) | PGI_MT_NEXT))

/*
 * mtype init for page_get_replacement_page
 *
 * Sets mtype to the last mnoderanges[] index and or's PGI_MT_RANGE0 into
 * flags; both must be modifiable lvalues.  pp and mnode are not used by
 * this implementation.
 */

#define	MTYPE_PGR_INIT(mtype, flags, pp, mnode) {			\
	mtype = mnoderangecnt - 1;					\
	flags |= PGI_MT_RANGE0;						\
}

/*
 * MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) - store the pfn bounds of
 * mnoderanges[mtype] into the lvalues pfnlo and pfnhi, after asserting
 * that the entry belongs to mnode.
 *
 * The statements are wrapped in a block, like the other multi-statement
 * macros in this file, so that the whole macro is governed by an
 * enclosing unbraced `if' or loop instead of only its first statement.
 * It may be invoked with or without a trailing semicolon.
 */
#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) {			\
	ASSERT(mnoderanges[mtype].mnr_mnode == (mnode));		\
	(pfnlo) = mnoderanges[mtype].mnr_pfnlo;				\
	(pfnhi) = mnoderanges[mtype].mnr_pfnhi;				\
}

/*
 * PC_BIN_MUTEX(mnode, bin, flags) - address of the mutex covering `bin'
 * on `mnode': taken from fpc_mutex[] when PG_FREE_LIST is set in flags,
 * otherwise from cpc_mutex[].  The bucket is bin modulo NPC_MUTEX.
 *
 * FPC_MUTEX/CPC_MUTEX(mnode, i) - address of bucket i's mutex for mnode.
 *
 * All arguments are parenthesized; in particular `flags' must be, or an
 * or'ed flags expression would be tested with the wrong precedence.
 */
#define	PC_BIN_MUTEX(mnode, bin, flags) (((flags) & PG_FREE_LIST) ?	\
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)] :			\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)])

#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[(i)][(mnode)])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[(i)][(mnode)])

/*
 * CHK_LPG(pp, szc) calls chk_lpg() in DEBUG kernels and expands to
 * nothing otherwise, so its arguments must not have required side effects.
 */
#ifdef DEBUG
#define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
extern void	chk_lpg(page_t *, uchar_t);
#else
#define	CHK_LPG(pp, szc)
#endif

/*
 * LEVEL_SIZE(rg_szc) shifted right by LEVEL_SHIFT(rg_szc - 1).
 * NOTE(review): presumably the number of next-smaller-size pages in one
 * region of size code rg_szc - confirm against LEVEL_SIZE/LEVEL_SHIFT.
 * rg_szc is parenthesized before the subtraction so that an expression
 * argument keeps its precedence.
 */
#define	FULL_REGION_CNT(rg_szc)	\
	(LEVEL_SIZE(rg_szc) >> LEVEL_SHIFT((rg_szc) - 1))

/* Return the leader for this mapping size */
/*
 * Step back from pp by its page number's offset within a SZCPAGES(szc)
 * group, giving the first page_t of the group.  pp is evaluated twice.
 */
#define	PP_GROUPLEADER(pp, szc) \
	((pp) + -(int)((pp)->p_pagenum & (SZCPAGES(szc) - 1)))

/* Return the root page for this page based on p_szc */
#define	PP_PAGEROOT(pp) ((pp)->p_szc != 0 ? \
	PP_GROUPLEADER((pp), (pp)->p_szc) : (pp))

/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 */

#define	PC_BASE_ALIGN		((pfn_t)1 << PAGE_BSZS_SHIFT(MMU_PAGE_SIZES-1))
#define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)

/*
 * cpu/mmu-dependent vm variables
 */
extern uint_t mmu_page_sizes;
extern uint_t mmu_exported_page_sizes;

/* For x86, userszc is the same as the kernel's szc */
#define	USERSZC_2_SZC(userszc)	(userszc)
#define	SZC_2_USERSZC(szc)	(szc)

/*
 * for hw_page_map_t, sized to hold the ratio of large page to base
 * pagesize (1024 max)
 */
typedef	short	hpmctr_t;

/*
 * L2 cache geometry.  The values are taken to be the same on every cpu
 * (x86 systems are assumed homogeneous).
 */
extern int	l2cache_sz;
extern int	l2cache_linesz;
extern int	l2cache_assoc;

#define	L2CACHE_ALIGN		l2cache_linesz
/*
 * Set size of the cache: total size divided by associativity, falling
 * back to MMU_PAGESIZE when the associativity is zero.
 */
#define	CPUSETSIZE()		\
	((l2cache_assoc == 0) ? MMU_PAGESIZE : (l2cache_sz / l2cache_assoc))

/*
 * Return the log2(pagesize(szc) / MMU_PAGESIZE) --- or the shift count
 * for the number of base pages in this pagesize
 */
#define	PAGE_BSZS_SHIFT(szc) (LEVEL_SHIFT(szc) - MMU_PAGESHIFT)

/*
 * Internal PG_ flags.
 */
#define	PGI_RELOCONLY	0x010000	/* opposite of PG_NORELOC */
#define	PGI_NOCAGE	0x020000	/* cage is disabled */
#define	PGI_PGCPHIPRI	0x040000	/* page_get_contig_page pri alloc */
#define	PGI_PGCPSZC0	0x080000	/* relocate base pagesize page */

/*
 * PGI range flags - should not overlap PGI flags
 */
#define	PGI_MT_RANGE0	0x1000000	/* mtype range to 0 */
#define	PGI_MT_RANGE4G	0x2000000	/* mtype range to 4g */
#define	PGI_MT_NEXT	0x4000000	/* get next mtype */
#define	PGI_MT_RANGE	(PGI_MT_RANGE0 | PGI_MT_RANGE4G)

/*
 * Hash the address space pointer and the virtual address to get a bin:
 * the page number of addr plus the as pointer shifted right by 4,
 * masked with page_colors_mask.  The result is assigned to bin, which
 * must be a modifiable lvalue.  seg and vp are not used by this
 * implementation.
 */

#define	AS_2_BIN(as, seg, vp, addr, bin)				\
	bin = ((((uintptr_t)(addr) >> PAGESHIFT) + ((uintptr_t)(as) >> 4)) \
	    & page_colors_mask)

/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look. Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target bin
 * twice to make sure we get all of the bins.
 */
#define	BIN_STEP	19

#ifdef VM_STATS
/*
 * Event counters for the page list code, present only when VM_STATS is
 * defined.  Within this header only restrict4gcnt is updated (by
 * MTYPE_INIT, through VM_STAT_ADD); the rest are maintained elsewhere.
 * NOTE(review): the arrays sized MMU_PAGE_SIZES are presumably indexed
 * by page size code - confirm against the code that bumps them.
 */
struct vmm_vmstats_str {
	ulong_t pc_list_add_pages[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages1[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages2[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages3[MMU_PAGE_SIZES];
	ulong_t pgf_alloc[MMU_PAGE_SIZES];
	ulong_t pgf_allocok[MMU_PAGE_SIZES];
	ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
	ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgf_allocdeferred;
	ulong_t	pgf_allocretry[MMU_PAGE_SIZES];
	ulong_t pgc_alloc;
	ulong_t pgc_allocok;
	ulong_t pgc_allocokrem;
	ulong_t pgc_allocokdeferred;
	ulong_t pgc_allocfailed;
	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];
	ulong_t	ptcp[MMU_PAGE_SIZES];
	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
	ulong_t	ptcpok[MMU_PAGE_SIZES];
	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];
	ulong_t	pgmc_alloc;
	ulong_t	pgmc_allocfailed;
	ulong_t	pgmc_allocempty;
	ulong_t	pgmc_allocok;
	ulong_t	ppr_reloc[MMU_PAGE_SIZES];
	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
	ulong_t ppr_relocok[MMU_PAGE_SIZES];
	ulong_t page_ctrs_coalesce;	/* page coalesce counter */
	ulong_t page_ctrs_cands_skip;	/* candidates useful */
	ulong_t page_ctrs_changed;	/* ctrs changed after locking */
	ulong_t page_ctrs_failed;	/* page_freelist_coalesce failed */
	ulong_t page_ctrs_coalesce_all;	/* page coalesce all counter */
	ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */
	/* times MTYPE_INIT found RESTRICT4G_ALLOC true */
	ulong_t	restrict4gcnt;
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif	/* VM_STATS */

/*
 * Page counter and free list support routines implemented outside this
 * header.  NOTE(review): their contracts are not visible here; consult
 * the definitions before relying on argument order or return values.
 */
extern size_t page_ctrs_sz(void);
extern caddr_t page_ctrs_alloc(caddr_t);
extern void page_ctr_sub(page_t *, int);
extern page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
/* used by PLCNT_SZ/PLCNT_INIT as the color count for a page size code */
extern uint_t page_get_pagecolors(uint_t);

#ifdef	__cplusplus
}
#endif

#endif	/* _VM_DEP_H */