xref: /titanic_44/usr/src/uts/i86pc/vm/vm_dep.h (revision e6ed03fcc10da912de5cab6b25f8bf3a8c5f14d9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * UNIX machine dependent virtual memory support.
28  */
29 
30 #ifndef	_VM_DEP_H
31 #define	_VM_DEP_H
32 
33 #pragma ident	"%Z%%M%	%I%	%E% SMI"
34 
35 #ifdef	__cplusplus
36 extern "C" {
37 #endif
38 
39 #include <sys/clock.h>
40 #include <vm/hat_pte.h>
41 #include <sys/param.h>
42 #include <sys/memnode.h>
43 
44 /*
45  * WARNING: vm_dep.h is included by files in common. As such, macros
46  * dependent upon PTE36 such as LARGEPAGESIZE cannot be used in this file.
47  */
48 
/* GETTICK() expands to a call to tsc_read(). */
#define	GETTICK()	(tsc_read())
50 
/*
 * memranges in descending order.  Indexed by the argument of
 * MEMRANGEHI()/MEMRANGELO().
 */
extern pfn_t		*memranges;
53 
/*
 * Bounds of memory range `mtype' as read from memranges[].
 *
 * MEMRANGELO(mtype) is memranges[mtype].  MEMRANGEHI(mtype) is one less than
 * the previous (higher) entry, memranges[mtype - 1] - 1, except for index 0
 * which has no previous entry and yields physmax.
 *
 * The argument is parenthesized so that an arbitrary expression (for
 * example a conditional expression) may be passed safely.
 */
#define	MEMRANGEHI(mtype)						\
	(((mtype) > 0) ? memranges[(mtype) - 1] - 1 : physmax)
#define	MEMRANGELO(mtype)	(memranges[(mtype)])
57 
/*
 * Sum of the three page list counters kept for mtype `mt': the cache list
 * count plus the small page and large page free list counts.
 */
#define	MTYPE_FREEMEM(mt)						\
	(mnoderanges[(mt)].mnr_mt_clpgcnt +				\
	mnoderanges[(mt)].mnr_mt_flpgcnt +				\
	mnoderanges[(mt)].mnr_mt_lgpgcnt)
62 
/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 *
 * The mnoderanges[] array holds mnoderangecnt of these and is indexed by
 * mtype.
 */
typedef struct {
	pfn_t	mnr_pfnlo;		/* low pfn, see MNODETYPE_2_PFN */
	pfn_t	mnr_pfnhi;		/* high pfn, see MNODETYPE_2_PFN */
	int	mnr_mnode;		/* mnode this range belongs to */
	int	mnr_memrange;		/* index into memranges[] */
	/* maintain page list stats */
	pgcnt_t	mnr_mt_pgmax;		/* mnode/mtype max page cnt */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t	mnr_mt_flpgcnt;		/* free list cnt - small pages */
	pgcnt_t	mnr_mt_lgpgcnt;		/* free list cnt - large pages */
#ifdef DEBUG
	/*
	 * Array of mmu_page_sizes entries indexed by szc; laid out by
	 * PLCNT_INIT and updated by the DEBUG version of PLCNT_DO.
	 */
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;	/* page cnt for this szc */
		int	mnr_mts_colors;	/* page_get_pagecolors(szc) */
		pgcnt_t *mnr_mtsc_pgcnt; /* per-bin cnt, mnr_mts_colors long */
	} 	*mnr_mts;
#endif
} mnoderange_t;
85 
/*
 * Page list counter maintenance.
 *
 * PLCNT_SZ(ctrs_sz)	adds to ctrs_sz the number of bytes needed for the
 *			DEBUG-only per-szc and per-color counters.
 * PLCNT_INIT(addr)	carves those counters out of the memory at addr,
 *			advancing addr past everything it consumed.
 * PLCNT_DO(...)	adds cnt to the cache list, large page or small page
 *			counter of mnoderanges[mtype], selected by flags and
 *			szc; the DEBUG version also updates the per-szc and
 *			per-bin counters.
 *
 * Both PLCNT_SZ and PLCNT_INIT are empty in non-DEBUG kernels.  ctrs_sz and
 * addr must be modifiable lvalues.  All other macro arguments are
 * parenthesized so that expressions such as `a | b' can be passed as flags
 * without being re-associated by operator precedence.
 */
#ifdef DEBUG
#define	PLCNT_SZ(ctrs_sz) {						\
	int	szc, colors;						\
	(ctrs_sz) += mnoderangecnt * sizeof (struct mnr_mts) *		\
	    mmu_page_sizes;						\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		colors = page_get_pagecolors(szc);			\
		(ctrs_sz) += mnoderangecnt * sizeof (pgcnt_t) * colors;	\
	}								\
}

#define	PLCNT_INIT(addr) {						\
	int	mt, szc, colors;					\
	for (mt = 0; mt < mnoderangecnt; mt++) {			\
		mnoderanges[mt].mnr_mts = (struct mnr_mts *)(addr);	\
		(addr) += (sizeof (struct mnr_mts) * mmu_page_sizes);	\
		for (szc = 0; szc < mmu_page_sizes; szc++) {		\
			colors = page_get_pagecolors(szc);		\
			mnoderanges[mt].mnr_mts[szc].mnr_mts_colors =	\
			    colors;					\
			mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =	\
			    (pgcnt_t *)(addr);				\
			(addr) += (sizeof (pgcnt_t) * colors);		\
		}							\
	}								\
}
#define	PLCNT_DO(pp, mtype, szc, cnt, flags) {				\
	int	bin = PP_2_BIN(pp);					\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&mnoderanges[(mtype)].			\
		    mnr_mt_clpgcnt, (cnt));				\
	else if (szc)							\
		atomic_add_long(&mnoderanges[(mtype)].			\
		    mnr_mt_lgpgcnt, (cnt));				\
	else								\
		atomic_add_long(&mnoderanges[(mtype)].			\
		    mnr_mt_flpgcnt, (cnt));				\
	atomic_add_long(&mnoderanges[(mtype)].mnr_mts[(szc)].		\
	    mnr_mts_pgcnt, (cnt));					\
	atomic_add_long(&mnoderanges[(mtype)].mnr_mts[(szc)].		\
	    mnr_mtsc_pgcnt[bin], (cnt));				\
}
#else
#define	PLCNT_SZ(ctrs_sz)
#define	PLCNT_INIT(addr)
#define	PLCNT_DO(pp, mtype, szc, cnt, flags) {				\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&mnoderanges[(mtype)].			\
		    mnr_mt_clpgcnt, (cnt));				\
	else if (szc)							\
		atomic_add_long(&mnoderanges[(mtype)].			\
		    mnr_mt_lgpgcnt, (cnt));				\
	else								\
		atomic_add_long(&mnoderanges[(mtype)].			\
		    mnr_mt_flpgcnt, (cnt));				\
}
#endif
143 
/*
 * PLCNT_INCR/PLCNT_DECR account for a page of size code szc being added to
 * or removed from a page list.  The adjustment is the number of base pages
 * in the page, 1 << PAGE_BSZS_SHIFT(szc), positive for INCR and negative
 * for DECR.  freemem4g is adjusted as well when physmax4g is set and
 * mtype <= mtype4g; the per-mtype counters are updated via PLCNT_DO.
 *
 * The mnode argument is accepted but not used.
 *
 * The negative count is formed by negating a positive shift result:
 * left-shifting a negative value is undefined behaviour in C.
 */
#define	PLCNT_INCR(pp, mnode, mtype, szc, flags) {			\
	long	cnt = (1L << PAGE_BSZS_SHIFT(szc));			\
	ASSERT((mtype) == PP_2_MTYPE(pp));				\
	if (physmax4g && (mtype) <= mtype4g)				\
		atomic_add_long(&freemem4g, cnt);			\
	PLCNT_DO(pp, mtype, szc, cnt, flags);				\
}

#define	PLCNT_DECR(pp, mnode, mtype, szc, flags) {			\
	long	cnt = -(1L << PAGE_BSZS_SHIFT(szc));			\
	ASSERT((mtype) == PP_2_MTYPE(pp));				\
	if (physmax4g && (mtype) <= mtype4g)				\
		atomic_add_long(&freemem4g, cnt);			\
	PLCNT_DO(pp, mtype, szc, cnt, flags);				\
}
159 
/*
 * macros to update page list max counts.
 *
 * PLCNT_XFER_NORELOC is a no-op on x86.
 *
 * PLCNT_MODIFY_MAX forwards to mtype_modify_max(pfn_t, long).  The count is
 * cast to long to match that prototype, and is parenthesized so the cast
 * applies to the whole argument expression.
 */
#define	PLCNT_XFER_NORELOC(pp)

#define	PLCNT_MODIFY_MAX(pfn, cnt)	mtype_modify_max((pfn), (long)(cnt))
166 
/* array of mnoderangecnt entries, indexed by mtype */
extern mnoderange_t	*mnoderanges;
extern int		mnoderangecnt;
/* PLCNT_INCR/PLCNT_DECR adjust freemem4g for mtypes <= mtype4g */
extern int		mtype4g;
170 
171 /*
172  * 4g memory management variables for systems with more than 4g of memory:
173  *
174  * physical memory below 4g is required for 32bit dma devices and, currently,
175  * for kmem memory. On systems with more than 4g of memory, the pool of memory
176  * below 4g can be depleted without any paging activity given that there is
177  * likely to be sufficient memory above 4g.
178  *
179  * physmax4g is set true if the largest pfn is over 4g. The rest of the
180  * 4g memory management code is enabled only when physmax4g is true.
181  *
182  * maxmem4g is the count of the maximum number of pages on the page lists
183  * with physical addresses below 4g. It can be a lot less than 4g given that
184  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
185  * agp aperture etc.
186  *
187  * freemem4g maintains the count of the number of available pages on the
188  * page lists with physical addresses below 4g.
189  *
190  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
191  * 6% (desfree4gshift = 4) of maxmem4g.
192  *
193  * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
194  * and the amount of free memory above 4g is greater than freemem4g.
195  * In this case, page_get_* routines will restrict below 4g allocations
196  * for requests that don't specifically require it.
197  */
198 
extern int		physmax4g;	/* true if largest pfn is over 4g */
extern pgcnt_t		maxmem4g;	/* max page list pages below 4g */
extern pgcnt_t		freemem4g;	/* available pages below 4g */
extern int		lotsfree4gshift; /* LOTSFREE4G shift of maxmem4g */
extern int		desfree4gshift;	/* DESFREE4G shift of maxmem4g */
/* below-4g thresholds, each a power-of-two fraction of maxmem4g */
#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

/*
 * True only when physmax4g is set, freemem4g is under DESFREE4G, and
 * twice freemem4g is still less than freemem.
 */
#define	RESTRICT4G_ALLOC						\
	(physmax4g &&							\
	(freemem4g < DESFREE4G) &&					\
	((freemem4g << 1) < freemem))
209 
210 /*
211  * 16m memory management:
212  *
213  * reserve some amount of physical memory below 16m for legacy devices.
214  *
215  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
216  * 16m or if the 16m pool drops below DESFREE16M.
217  *
218  * In this case, general page allocations via page_get_{free,cache}list
219  * routines will be restricted from allocating from the 16m pool. Allocations
220  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
221  * are not restricted.
222  */
223 
/*
 * FREEMEM16M is the free page count of mtype 0; DESFREE16M is the
 * desfree16m tunable.
 *
 * RESTRICT16M_ALLOC(freemem, pgcnt, flags) is true when all of:
 *   - freemem is non-zero,
 *   - PG_PANIC is not set in flags,
 *   - either freemem >= FREEMEM16M, or FREEMEM16M < DESFREE16M + pgcnt.
 *
 * Note that the `freemem' parameter deliberately shadows the global of the
 * same name so callers can substitute another count.  Every argument is
 * parenthesized so that compound expressions (e.g. `a | b' for flags or
 * `x - y' for freemem) are evaluated as a unit.
 */
#define	FREEMEM16M	MTYPE_FREEMEM(0)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
	(((freemem) != 0) && (((flags) & PG_PANIC) == 0) &&	\
	    (((freemem) >= (FREEMEM16M)) ||			\
	    ((FREEMEM16M) < ((DESFREE16M) + (pgcnt)))))
extern pgcnt_t		desfree16m;	/* value of DESFREE16M */

/* tested by the i386 version of MTYPE_INIT */
extern int		restricted_kmemalloc;
/* MNODE_MAX_MRANGE passes a mnode's physbase */
extern int		memrange_num(pfn_t);
/* used by PP_2_MTYPE */
extern int		pfn_2_mtype(pfn_t);
/* called by MTYPE_START/MTYPE_NEXT as (mnode, mtype, flags) */
extern int		mtype_func(int, int, uint_t);
/* called by PLCNT_MODIFY_MAX as (pfn, cnt) */
extern void		mtype_modify_max(pfn_t, long);
/* called by MNODE_PGCNT as (mnode) */
extern int		mnode_pgcnt(int);
/* called by MNODE_RANGE_CNT as (mnode) */
extern int		mnode_range_cnt(int);
239 
/* memory range types */
#define	NUM_MEM_RANGES	4

/*
 * candidate counters in vm_pagelist.c are indexed by color and range
 *
 * MNODE_RANGE_CNT	forwards to mnode_range_cnt().
 * MNODE_MAX_MRANGE	is memrange_num() of the mnode's physbase.
 * MTYPE_2_MRANGE	is the mtype's memranges[] index subtracted from
 *			mnode_maxmrange[mnode].
 */
#define	MAX_MNODE_MRANGES		NUM_MEM_RANGES
#define	MNODE_RANGE_CNT(mnode)		mnode_range_cnt(mnode)
#define	MNODE_MAX_MRANGE(mnode)						\
	(memrange_num(mem_node_config[mnode].physbase))
#define	MTYPE_2_MRANGE(mnode, mtype)					\
	(mnode_maxmrange[mnode] - mnoderanges[mtype].mnr_memrange)
250 
251 /*
252  * Per page size free lists. Allocated dynamically.
253  * dimensions [mtype][mmu_page_sizes][colors]
254  *
255  * mtype specifies a physical memory range with a unique mnode.
256  */
257 
258 extern page_t ****page_freelists;
259 
260 #define	PAGE_FREELISTS(mnode, szc, color, mtype)		\
261 	(*(page_freelists[mtype][szc] + (color)))
262 
263 /*
264  * For now there is only a single size cache list. Allocated dynamically.
265  * dimensions [mtype][colors]
266  *
267  * mtype specifies a physical memory range with a unique mnode.
268  */
269 extern page_t ***page_cachelists;
270 
271 #define	PAGE_CACHELISTS(mnode, color, mtype) 		\
272 	(*(page_cachelists[mtype] + (color)))
273 
274 /*
275  * There are mutexes for both the page freelist
276  * and the page cachelist.  We want enough locks to make contention
277  * reasonable, but not too many -- otherwise page_freelist_lock() gets
278  * so expensive that it becomes the bottleneck!
279  */
280 
/* number of mutex rows; PC_BIN_MUTEX masks the bin with NPC_MUTEX - 1 */
#define	NPC_MUTEX	16

/* each entry points to an array indexed by mnode (see FPC_MUTEX/CPC_MUTEX) */
extern kmutex_t	*fpc_mutex[NPC_MUTEX];
extern kmutex_t	*cpc_mutex[NPC_MUTEX];

extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t);
extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
288 
/* mem node iterator is not used on x86 */
#define	MEM_NODE_ITERATOR_DECL(it)
#define	MEM_NODE_ITERATOR_INIT(pfn, mnode, it)

/*
 * interleaved_mnodes mode is never set on x86, therefore,
 * simply return the limits of the given mnode, which then
 * determines the length of hpm_counters array for the mnode.
 *
 * pbase, pmax and first are output lvalues: they receive the mnode's
 * mem_node_config physbase and physmax, and the mnode itself.
 *
 * The output parameters are intentionally not spelled physbase/physmax.
 * A macro parameter with the same spelling as a mem_node_config member
 * would also be substituted into the member access, so the macro would
 * only compile for callers whose variables happen to use those names.
 */
#define	HPM_COUNTERS_LIMITS(mnode, pbase, pmax, first) 		\
	{							\
		(pbase) = mem_node_config[(mnode)].physbase;	\
		(pmax) = mem_node_config[(mnode)].physmax;	\
		(first) = (mnode);				\
	}
304 
/*
 * Take page_ctrs_rwlock[mnode] as writer, then the mnode's page freelist
 * lock.
 */
#define	PAGE_CTRS_WRITE_LOCK(mnode) {					\
	rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);		\
	page_freelist_lock(mnode);					\
}

/* Release in the reverse order: freelist lock first, then the rwlock. */
#define	PAGE_CTRS_WRITE_UNLOCK(mnode) {					\
	page_freelist_unlock(mnode);					\
	rw_exit(&page_ctrs_rwlock[(mnode)]);				\
}
316 
/*
 * Page size / color helpers, all derived from hw_page_array[].
 *
 * PAGE_GET_SHIFT	hp_shift of szc
 * PNUM_SHIFT		hp_shift of szc relative to szc 0
 * PNUM_SIZE		hp_pgcnt of szc
 * PAGE_GET_PAGECOLORS	hp_colors of szc
 * PAGE_GET_COLOR_SHIFT	difference in hp_shift between nszc and szc
 * PAGE_CONVERT_COLOR	ncolor shifted left by PAGE_GET_COLOR_SHIFT
 * PFN_2_COLOR		pfn masked by page_colors_mask, shifted right by
 *			PNUM_SHIFT(szc); the `it' argument is not used
 */
#define	PAGE_GET_SHIFT(szc)						\
	(hw_page_array[(szc)].hp_shift)
#define	PNUM_SHIFT(szc)							\
	(PAGE_GET_SHIFT(szc) - PAGE_GET_SHIFT(0))
#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_pgcnt)
#define	PAGE_GET_PAGECOLORS(szc)					\
	(hw_page_array[(szc)].hp_colors)

#define	PAGE_GET_COLOR_SHIFT(szc, nszc)					\
	(PAGE_GET_SHIFT(nszc) - PAGE_GET_SHIFT(szc))

#define	PAGE_CONVERT_COLOR(ncolor, szc, nszc)				\
	((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))

#define	PFN_2_COLOR(pfn, szc, it)					\
	(((pfn) & page_colors_mask) >> PNUM_SHIFT(szc))
335 
/*
 * This macro calculates the next sequential pfn with the specified
 * color using color equivalency mask
 *
 * pfn is updated in place and must be a modifiable lvalue.  ceq_mask must
 * be of the form 2^n - 1 and color must have no bits set outside it; both
 * conditions are asserted.  The color_mask and it arguments are not used.
 *
 * With spfn = pfn >> PAGE_BSZS_SHIFT(szc) and stride = ceq_mask + 1:
 *   - if spfn already matches color under ceq_mask, step one stride ahead;
 *   - otherwise substitute color into the masked bits of spfn, and if the
 *     result is not greater than spfn add one stride.
 * The result is shifted back up by PAGE_BSZS_SHIFT(szc).
 *
 * Everything, including the assertions, lives in a single brace block so
 * the macro behaves as one statement when used under an unbraced `if'.
 */
#define	PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it)    \
	{								      \
		uint_t	pfn_shift = PAGE_BSZS_SHIFT(szc);                     \
		pfn_t	spfn = (pfn) >> pfn_shift;                            \
		pfn_t	stride = (ceq_mask) + 1;                              \
		ASSERT(((color) & ~(ceq_mask)) == 0);                         \
		ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);                 \
		if (((spfn ^ (color)) & (ceq_mask)) == 0) {                   \
			(pfn) += stride << pfn_shift;                         \
		} else {                                                      \
			(pfn) = (spfn & ~(pfn_t)(ceq_mask)) | (color);        \
			(pfn) = ((pfn) > spfn ? (pfn) : (pfn) + stride) <<    \
			    pfn_shift;                                        \
		}                                                             \
	}
354 
/* get the color equivalency mask for the next szc */
#define	PAGE_GET_NSZ_MASK(szc, mask)                                         \
	((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))

/* get the color of the next szc */
#define	PAGE_GET_NSZ_COLOR(szc, color)                                       \
	((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))

/*
 * page_t accessors.  The pp argument is parenthesized before `->' so that
 * pointer expressions such as `pp + 1' or `*ppp' can be passed.
 */

/* Find the bin for the given page if it was of size szc */
#define	PP_2_BIN_SZC(pp, szc)	(PFN_2_COLOR((pp)->p_pagenum, szc, NULL))

/* bin of the page at its current size code */
#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, (pp)->p_szc))

#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
#define	PP_2_MTYPE(pp)		(pfn_2_mtype(pfn_to_mfn((pp)->p_pagenum)))
#define	PP_2_SZC(pp)		((pp)->p_szc)

/* number of base pages in a page of size code szc */
#define	SZCPAGES(szc)		(1 << PAGE_BSZS_SHIFT(szc))
/* pfnum rounded down to a multiple of SZCPAGES(szc) */
#define	PFN_BASE(pfnum, szc)	((pfnum) & ~(SZCPAGES(szc) - 1))
374 
/*
 * This structure is used for walking free page lists.  It controls when
 * to split large pages into smaller pages, and when to coalesce smaller
 * pages into larger pages.
 */
typedef struct page_list_walker {
	uint_t	plw_colors;		/* num of colors for szc */
	uint_t  plw_color_mask;		/* colors-1 */
	uint_t	plw_bin_step;		/* next bin: 1 or 2 */
	uint_t  plw_count;		/* loop count */
	uint_t	plw_bin0;		/* starting bin */
	uint_t  plw_bin_marker;		/* bin after initial jump */
	uint_t  plw_bin_split_prev;	/* last bin we tried to split */
	uint_t  plw_do_split;		/* set if OK to split */
	uint_t  plw_split_next;		/* next bin to split */
	uint_t	plw_ceq_dif;		/* number of different color groups */
					/* to check */
	uint_t	plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
	uint_t	plw_bins[MMU_PAGE_SIZES + 1];	/* num of bins */
} page_list_walker_t;

/* set up *plw for a walk; implemented outside this header */
void	page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
    int can_split, int use_ceq, page_list_walker_t *plw);

/* return the bin to visit after `bin' for the walk described by *plw */
uint_t	page_list_walk_next_bin(uchar_t szc, uint_t bin,
    page_list_walker_t *plw);

extern struct cpu	cpus[];
/* CPU0 is the first element of cpus[] */
#define	CPU0		cpus
404 
#if defined(__amd64)

/*
 * set the mtype range (called from page_get_{free,cache}list)
 *   - set range to above 4g if the system has more than 4g of memory and the
 *   amount of memory below 4g runs low. If not, set range to above 16m if
 *   16m threshold is reached otherwise set range to all of memory
 *   starting from the hi pfns.
 *
 * page_get_anylist gets its mtype range from the specified ddi_dma_attr_t.
 *
 * mtype and flags are written by the macro and must be lvalues: mtype is
 * set to the last mnoderanges[] index and exactly one PGI_MT_RANGE* bit is
 * or'ed into flags.  vp and vaddr are not used by this version.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) {			\
	mtype = mnoderangecnt - 1;					\
	if (RESTRICT4G_ALLOC) {						\
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);			\
		/* here only for > 4g systems */			\
		flags |= PGI_MT_RANGE4G;				\
	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), flags)) {	\
		flags |= PGI_MT_RANGE16M;				\
	} else {							\
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);		\
		VM_STAT_COND_ADD((flags & PG_PANIC),			\
		    vmm_vmstats.pgpanicalloc);				\
		flags |= PGI_MT_RANGE0;					\
	}								\
}

#elif defined(__i386)

/*
 * set the mtype range
 *   - kmem requests need to be below 4g if restricted_kmemalloc is set.
 *   - for non kmem requests, set range to above 4g if the amount of memory
 *   below 4g runs low.
 *
 * A request is treated as a kmem request when vp is the kernel address
 * space vnode (VN_ISKAS) and vaddr lies in [kernelheap, ekernelheap).
 * Such requests start at mtype4g and are checked against the 16m
 * threshold using freemem4g less the request size; all other requests
 * follow the same logic as the amd64 version.
 */

#define	MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) {			\
	if (restricted_kmemalloc && VN_ISKAS(vp) &&			\
	    (caddr_t)(vaddr) >= kernelheap &&				\
	    (caddr_t)(vaddr) < ekernelheap) {				\
		ASSERT(physmax4g);					\
		mtype = mtype4g;					\
		if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),		\
		    btop(pgsz), flags)) {				\
			flags |= PGI_MT_RANGE16M;			\
		} else {						\
			VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);	\
			VM_STAT_COND_ADD((flags & PG_PANIC),		\
			    vmm_vmstats.pgpanicalloc);			\
			flags |= PGI_MT_RANGE0;				\
		}							\
	} else {							\
		mtype = mnoderangecnt - 1;				\
		if (RESTRICT4G_ALLOC) {					\
			VM_STAT_ADD(vmm_vmstats.restrict4gcnt);		\
			/* here only for > 4g systems */		\
			flags |= PGI_MT_RANGE4G;			\
		} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz),	\
		    flags)) {						\
			flags |= PGI_MT_RANGE16M;			\
		} else {						\
			VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);	\
			VM_STAT_COND_ADD((flags & PG_PANIC),		\
			    vmm_vmstats.pgpanicalloc);			\
			flags |= PGI_MT_RANGE0;				\
		}							\
	}								\
}

#endif	/* __i386 */
475 
/*
 * macros to loop through the mtype range (page_get_mnode_{free,cache,any}list,
 * and page_get_contig_pages)
 *
 * MTYPE_START sets the initial mtype. -1 if the mtype range specified does
 * not contain mnode.
 *
 * MTYPE_NEXT sets the next mtype. -1 if there are no more valid
 * mtype in the range.
 *
 * In both macros mtype is assigned to and must be an lvalue.  MTYPE_NEXT
 * only consults mtype_func() (with PGI_MT_NEXT added to flags) when one of
 * the PGI_MT_RANGE bits is set; flags is parenthesized so that a compound
 * flags expression is tested as a whole.
 */

#define	MTYPE_START(mnode, mtype, flags)				\
	(mtype = mtype_func(mnode, mtype, flags))

#define	MTYPE_NEXT(mnode, mtype, flags) {				\
	if ((flags) & PGI_MT_RANGE) {					\
		mtype = mtype_func(mnode, mtype,			\
		    (flags) | PGI_MT_NEXT);				\
	} else {							\
		mtype = -1;						\
	}								\
}
497 
/*
 * mtype init for page_get_replacement_page
 *
 * Starts mtype at the last mnoderanges[] index.  If RESTRICT16M_ALLOC does
 * not hold for (freemem, pgcnt, flags), the unrestrict16mcnt statistic is
 * bumped and PGI_MT_RANGE0 is or'ed into flags; otherwise PGI_MT_RANGE16M
 * is.  The pp and mnode arguments are not used.
 */

#define	MTYPE_PGR_INIT(mtype, flags, pp, mnode, pgcnt) {		\
	mtype = mnoderangecnt - 1;					\
	if (!RESTRICT16M_ALLOC(freemem, pgcnt, flags)) {		\
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);		\
		flags |= PGI_MT_RANGE0;					\
	} else {							\
		flags |= PGI_MT_RANGE16M;				\
	}								\
}

/* forwards to mnode_pgcnt() */
#define	MNODE_PGCNT(mnode)		mnode_pgcnt(mnode)
511 
/*
 * Fetch the pfn range of mnoderanges[mtype] into pfnlo/pfnhi (both must be
 * lvalues), asserting that the range belongs to mnode.
 *
 * The statements are wrapped in a brace block so that the macro acts as a
 * single statement; as three bare statements, only the ASSERT would be
 * governed by an enclosing unbraced `if'.
 */
#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) {			\
	ASSERT(mnoderanges[(mtype)].mnr_mnode == (mnode));		\
	(pfnlo) = mnoderanges[(mtype)].mnr_pfnlo;			\
	(pfnhi) = mnoderanges[(mtype)].mnr_pfnhi;			\
}
516 
/*
 * Address of the mutex for (mnode, bin): taken from fpc_mutex when
 * PG_FREE_LIST is set in flags, from cpc_mutex otherwise.  The bin is
 * reduced modulo NPC_MUTEX.  flags is parenthesized so that a compound
 * flags expression is tested as a whole.
 */
#define	PC_BIN_MUTEX(mnode, bin, flags) (((flags) & PG_FREE_LIST) ?	\
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)] :			\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)])

/* address of free/cache list mutex number i for mnode */
#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[(i)][(mnode)])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[(i)][(mnode)])
523 
/* CHK_LPG calls chk_lpg() in DEBUG kernels and is empty otherwise */
#ifdef DEBUG
#define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
extern void	chk_lpg(page_t *, uchar_t);
#else
#define	CHK_LPG(pp, szc)
#endif

/*
 * LEVEL_SIZE of rg_szc divided by 2^LEVEL_SHIFT of the next smaller size
 * code.  rg_szc is parenthesized so `- 1' applies to the whole argument.
 */
#define	FULL_REGION_CNT(rg_szc)	\
	(LEVEL_SIZE(rg_szc) >> LEVEL_SHIFT((rg_szc) - 1))

/* Return the leader for this mapping size */
#define	PP_GROUPLEADER(pp, szc) \
	(&(pp)[-(int)((pp)->p_pagenum & (SZCPAGES(szc)-1))])

/* Return the root page for this page based on p_szc */
#define	PP_PAGEROOT(pp) ((pp)->p_szc == 0 ? (pp) : \
	PP_GROUPLEADER((pp), (pp)->p_szc))
541 
/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 *
 * PC_BASE_ALIGN is the number of base pages in the largest page size
 * (size code MMU_PAGE_SIZES - 1), as a pfn_t.
 */

#define	PC_BASE_ALIGN							\
	((pfn_t)1 << PAGE_BSZS_SHIFT(MMU_PAGE_SIZES - 1))
#define	PC_BASE_ALIGN_MASK						\
	(PC_BASE_ALIGN - 1)
554 
/*
 * cpu/mmu-dependent vm variables
 */
extern uint_t mmu_page_sizes;	/* bound on szc in PLCNT_SZ/PLCNT_INIT loops */
extern uint_t mmu_exported_page_sizes;

/*
 * For x86, userszc is the same as the kernel's szc, so both conversions
 * are the identity.
 */
#define	USERSZC_2_SZC(userszc)	(userszc)
#define	SZC_2_USERSZC(szc)	(szc)

/*
 * for hw_page_map_t, sized to hold the ratio of large page to base
 * pagesize (1024 max)
 */
typedef	short	hpmctr_t;
570 
/*
 * get the setsize of the current cpu - assume homogeneous for x86
 *
 * CPUSETSIZE() is the l2 cache size divided by its associativity, or
 * MMU_PAGESIZE when the associativity is zero.
 */
extern int	l2cache_sz;
extern int	l2cache_linesz;
extern int	l2cache_assoc;

#define	L2CACHE_ALIGN		l2cache_linesz
#define	L2CACHE_ALIGN_MAX	64
#define	CPUSETSIZE()							\
	(l2cache_assoc ? (l2cache_sz / l2cache_assoc) : MMU_PAGESIZE)

/*
 * Return the log2(pagesize(szc) / MMU_PAGESIZE) --- or the shift count
 * for the number of base pages in this pagesize
 */
#define	PAGE_BSZS_SHIFT(szc)						\
	(LEVEL_SHIFT(szc) - MMU_PAGESHIFT)
586 
/*
 * Internal PG_ flags.
 */
#define	PGI_RELOCONLY	0x00010000	/* opposite of PG_NORELOC */
#define	PGI_NOCAGE	0x00020000	/* cage is disabled */
#define	PGI_PGCPHIPRI	0x00040000	/* page_get_contig_page pri alloc */
#define	PGI_PGCPSZC0	0x00080000	/* relocate base pagesize page */

/*
 * PGI range flags - should not overlap PGI flags
 */
#define	PGI_MT_RANGE0	0x01000000	/* mtype range to 0 */
#define	PGI_MT_RANGE16M	0x02000000	/* mtype range to 16m */
#define	PGI_MT_RANGE4G	0x04000000	/* mtype range to 4g */
#define	PGI_MT_NEXT	0x08000000	/* get next mtype */

/* any of the three range selectors */
#define	PGI_MT_RANGE							\
	(PGI_MT_RANGE0 | PGI_MT_RANGE16M | PGI_MT_RANGE4G)
603 
/*
 * Maximum and default values for user heap, stack, private and shared
 * anonymous memory, and user text and initialized data.
 * Used by map_pgsz*() routines.
 */
extern size_t max_uheap_lpsize;
extern size_t default_uheap_lpsize;
extern size_t max_ustack_lpsize;
extern size_t default_ustack_lpsize;
extern size_t max_privmap_lpsize;
extern size_t max_uidata_lpsize;
extern size_t max_utext_lpsize;
extern size_t max_shm_lpsize;
extern size_t mcntl0_lpsize;

/*
 * Sanity control. Don't use large pages regardless of user
 * settings if there's less than priv or shm_lpg_min_physmem memory installed.
 * The units for this variable are 8K pages.
 *
 * NOTE(review): the 8K unit may have been inherited from another platform;
 * confirm the intended unit for x86.
 */
extern pgcnt_t privm_lpg_min_physmem;
extern pgcnt_t shm_lpg_min_physmem;
626 
/*
 * hash as and addr to get a bin.
 *
 * The page number of addr is added to the as pointer shifted right by 4,
 * masked with page_colors_mask, and shifted down by the difference between
 * the hp_shift of szc and that of size code 0.  The result is assigned to
 * bin, which must be an lvalue.  The seg and vp arguments are not used.
 */

#define	AS_2_BIN(as, seg, vp, addr, bin, szc)				    \
	bin = ((((uintptr_t)(as) >> 4) + ((uintptr_t)(addr) >> PAGESHIFT))  \
	    & page_colors_mask) >>					    \
	    (hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift)
635 
/*
 * cpu private vm data - accessed thru CPU->cpu_vm_data
 *	vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
 *	vc_pnext_memseg: tracks last memseg visited in page_nextn()
 *	vc_kmptr: original unaligned kmem pointer for this vm_cpu_data_t
 *	vc_kmsize: original kmem size for this vm_cpu_data_t
 */

typedef struct {
	struct memseg	*vc_pnum_memseg;
	struct memseg	*vc_pnext_memseg;
	void		*vc_kmptr;
	size_t		vc_kmsize;
} vm_cpu_data_t;

/*
 * allocation size to ensure vm_cpu_data_t resides in its own cache line:
 * sizeof (vm_cpu_data_t) rounded up to a multiple of L2CACHE_ALIGN_MAX
 */
#define	VM_CPU_DATA_PADSIZE						\
	(P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))

/* for boot cpu before kmem is initialized */
extern char	vm_cpu_data0[];
657 
/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look. Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target bin
 * twice to make sure we get all of the bins.
 */
#define	BIN_STEP	19
675 
#ifdef VM_STATS
/*
 * VM statistics counters, only present when VM_STATS is defined.
 * Array members sized MMU_PAGE_SIZES hold one counter per page size code;
 * those with a second MAX_MNODE_MRANGES dimension are further split by
 * memory range.  The restrict4gcnt, unrestrict16mcnt and pgpanicalloc
 * members are bumped by the MTYPE_INIT/MTYPE_PGR_INIT macros.
 */
struct vmm_vmstats_str {
	ulong_t pgf_alloc[MMU_PAGE_SIZES];	/* page_get_freelist */
	ulong_t pgf_allocok[MMU_PAGE_SIZES];
	ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
	ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgf_allocdeferred;
	ulong_t	pgf_allocretry[MMU_PAGE_SIZES];
	ulong_t pgc_alloc;			/* page_get_cachelist */
	ulong_t pgc_allocok;
	ulong_t pgc_allocokrem;
	ulong_t pgc_allocokdeferred;
	ulong_t pgc_allocfailed;
	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];	/* page_get_contig_pages */
	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];
	ulong_t	ptcp[MMU_PAGE_SIZES];		/* page_trylock_contig_pages */
	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
	ulong_t	ptcpok[MMU_PAGE_SIZES];
	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];	/* page_get_mnode_freelist */
	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];
	ulong_t	pgmc_alloc;			/* page_get_mnode_cachelist */
	ulong_t	pgmc_allocfailed;
	ulong_t	pgmc_allocempty;
	ulong_t	pgmc_allocok;
	ulong_t	pladd_free[MMU_PAGE_SIZES];	/* page_list_add/sub */
	ulong_t	plsub_free[MMU_PAGE_SIZES];
	ulong_t	pladd_cache;
	ulong_t	plsub_cache;
	ulong_t	plsubpages_szcbig;
	ulong_t	plsubpages_szc0;
	ulong_t	pfs_req[MMU_PAGE_SIZES];	/* page_freelist_split */
	ulong_t	pfs_demote[MMU_PAGE_SIZES];
	ulong_t	pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	ulong_t	ppr_reloc[MMU_PAGE_SIZES];	/* page_relocate */
	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
	ulong_t ppr_relocok[MMU_PAGE_SIZES];
	ulong_t ppr_copyfail;
	/* page coalesce counter */
	ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* candidates useful */
	ulong_t page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* ctrs changed after locking */
	ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* page_freelist_coalesce failed */
	ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	ulong_t page_ctrs_coalesce_all;	/* page coalesce all counter */
	ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */
	ulong_t	restrict4gcnt;
	ulong_t	unrestrict16mcnt;	/* non-DMA 16m allocs allowed */
	ulong_t	pgpanicalloc;		/* PG_PANIC allocation */
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif	/* VM_STATS */
739 
/*
 * Page counter and free list helpers implemented outside this header.
 * NOTE(review): parameter meanings are not visible from this file; see the
 * definitions before relying on argument order.
 */
extern size_t page_ctrs_sz(void);
extern caddr_t page_ctrs_alloc(caddr_t);
extern void page_ctr_sub(int, int, page_t *, int);
extern page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, page_list_walker_t *);
extern page_t *page_freelist_coalesce(int, uchar_t, uint_t, uint_t, int,
    pfn_t);
/* called by PLCNT_SZ/PLCNT_INIT with a page size code */
extern uint_t page_get_pagecolors(uint_t);
748 
749 #ifdef	__cplusplus
750 }
751 #endif
752 
753 #endif	/* _VM_DEP_H */
754