xref: /illumos-gate/usr/src/uts/i86pc/vm/vm_dep.h (revision f045d8d6fec1759551cc2bce1d26628931f14fce)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /*
25  * Copyright (c) 2010, Intel Corporation.
26  * All rights reserved.
27  */
28 
29 /*
30  * UNIX machine dependent virtual memory support.
31  */
32 
33 #ifndef	_VM_DEP_H
34 #define	_VM_DEP_H
35 
36 
37 #ifdef	__cplusplus
38 extern "C" {
39 #endif
40 
41 #include <sys/clock.h>
42 #include <vm/hat_pte.h>
43 #include <sys/param.h>
44 #include <sys/memnode.h>
45 
46 /*
47  * WARNING: vm_dep.h is included by files in common.
48  */
49 
50 #define	GETTICK()	tsc_read()
51 /*
52  * Do not use this function for obtaining clock tick.  This
53  * is called by callers who do not need to have a guarenteed
54  * correct tick value.  The proper routine to use is tsc_read().
55  */
56 
57 extern hrtime_t		randtick();
58 extern uint_t page_create_update_flags_x86(uint_t);
59 extern int kernel_page_update_flags_x86(uint_t *);
60 
61 extern size_t plcnt_sz(size_t);
62 #define	PLCNT_SZ(ctrs_sz) (ctrs_sz = plcnt_sz(ctrs_sz))
63 
64 extern caddr_t plcnt_init(caddr_t);
65 #define	PLCNT_INIT(addr) (addr = plcnt_init(addr))
66 
67 extern void plcnt_inc_dec(page_t *, int, int, long, int);
68 #define	PLCNT_INCR(pp, mnode, mtype, szc, flags)			\
69 	plcnt_inc_dec(pp, mtype, szc, 1l << PAGE_BSZS_SHIFT(szc), flags)
70 #define	PLCNT_DECR(pp, mnode, mtype, szc, flags)			\
71 	plcnt_inc_dec(pp, mtype, szc, -1l << PAGE_BSZS_SHIFT(szc), flags)
72 
73 /*
74  * macro to update page list max counts.  no-op on x86.
75  */
76 #define	PLCNT_XFER_NORELOC(pp)
77 
78 /*
79  * macro to modify the page list max counts when memory is added to
80  * the page lists during startup (add_physmem) or during a DR operation
81  * when memory is added (kphysm_add_memory_dynamic) or deleted
82  * (kphysm_del_cleanup).
83  */
84 #define	PLCNT_MODIFY_MAX(pfn, cnt)	mtype_modify_max(pfn, cnt)
85 
86 extern int memrange_num(pfn_t);
87 extern int pfn_2_mtype(pfn_t);
88 extern int mtype_func(int, int, uint_t);
89 extern void mtype_modify_max(pfn_t, long);
90 extern int mnode_pgcnt(int);
91 extern int mnode_range_cnt(int);
92 
/*
 * Candidate counters in vm_pagelist.c are indexed by color and range.
 *
 * MNODE_RANGE_CNT() forwards to mnode_range_cnt(); MNODE_MAX_MRANGE()
 * passes the mnode's physbase from mem_node_config[] to memrange_num().
 */
#define	NUM_MEM_RANGES		4		/* memory range types */
#define	MAX_MNODE_MRANGES	NUM_MEM_RANGES

#define	MNODE_RANGE_CNT(mnode)						\
	mnode_range_cnt(mnode)
#define	MNODE_MAX_MRANGE(mnode)						\
	memrange_num(mem_node_config[mnode].physbase)
100 
101 /*
102  * combined memory ranges from mnode and memranges[] to manage single
103  * mnode/mtype dimension in the page lists.
104  */
105 typedef struct {
106 	pfn_t	mnr_pfnlo;
107 	pfn_t	mnr_pfnhi;
108 	int	mnr_mnode;
109 	int	mnr_memrange;		/* index into memranges[] */
110 	int	mnr_next;		/* next lower PA mnoderange */
111 	int	mnr_exists;
112 	/* maintain page list stats */
113 	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
114 	pgcnt_t	mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
115 	pgcnt_t	mnr_mt_totcnt;		/* sum of cache and free lists */
116 #ifdef DEBUG
117 	struct mnr_mts {		/* mnode/mtype szc stats */
118 		pgcnt_t	mnr_mts_pgcnt;
119 		int	mnr_mts_colors;
120 		pgcnt_t *mnr_mtsc_pgcnt;
121 	} 	*mnr_mts;
122 #endif
123 } mnoderange_t;
124 
/*
 * Bounds of memory range `mtype' taken from memranges[]:
 * the low bound is memranges[mtype]; the high bound is one less than
 * the previous entry, or physmax when mtype is not greater than 0.
 */
#define	MEMRANGELO(mtype)	(memranges[(mtype)])
#define	MEMRANGEHI(mtype)						\
	(((mtype) <= 0) ? physmax : memranges[(mtype) - 1] - 1)

/* total (cache + free) page count recorded for mnoderange `mt' */
#define	MTYPE_FREEMEM(mt)	(mnoderanges[(mt)].mnr_mt_totcnt)
130 
/*
 * This was really badly defined: the macro implicitly uses
 * mnode_maxmrange[], which is a static in vm_pagelist.c, so it is only
 * usable where that array is in scope.
 */
extern int mtype_2_mrange(int);

#define	MTYPE_2_MRANGE(mnode, mtype)					\
	(mnode_maxmrange[mnode] - mtype_2_mrange(mtype))
138 
139 /*
140  * this structure is used for walking free page lists, it
141  * controls when to split large pages into smaller pages,
142  * and when to coalesce smaller pages into larger pages
143  */
144 typedef struct page_list_walker {
145 	uint_t	plw_colors;		/* num of colors for szc */
146 	uint_t  plw_color_mask;		/* colors-1 */
147 	uint_t	plw_bin_step;		/* next bin: 1 or 2 */
148 	uint_t  plw_count;		/* loop count */
149 	uint_t	plw_bin0;		/* starting bin */
150 	uint_t  plw_bin_marker;		/* bin after initial jump */
151 	uint_t  plw_bin_split_prev;	/* last bin we tried to split */
152 	uint_t  plw_do_split;		/* set if OK to split */
153 	uint_t  plw_split_next;		/* next bin to split */
154 	uint_t	plw_ceq_dif;		/* number of different color groups */
155 					/* to check */
156 	uint_t	plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
157 	uint_t	plw_bins[MMU_PAGE_SIZES + 1];	/* num of bins */
158 } page_list_walker_t;
159 
160 /*
161  * Page freelists are organized as two freelist types user and kernel, with
162  * their own policy and allocation routines. The definitions related to the
163  * freelist type structure are grouped below.
164  *
165  * The page size free lists themselves are allocated dynamically with
166  * dimensions [mtype][mmu_page_sizes][colors]
167  *
168  * mtype specifies a physical memory range with a unique mnode.
169  */
170 
/* capacity of the pflt_policy[] array in struct page_freelist_type */
#define	MAX_PFLT_POLICIES	3
/* number of freelist types enumerated below */
#define	MAX_PFLT_TYPE		2

enum freelist_types {
	PFLT_USER,
	PFLT_KMEM
};

/*
 * The kernel only needs a small number of page colors, far fewer than user
 * programs.
 */
#define	KFLT_PAGE_COLORS	16

typedef struct page_freelist_type page_freelist_type_t;

/* the user and kernel freelist type objects, and pointers to them */
extern page_freelist_type_t flt_user;
extern page_freelist_type_t flt_kern;
extern page_freelist_type_t *ufltp;
extern page_freelist_type_t *kfltp;
186 
187 void page_flt_init(page_freelist_type_t *, page_freelist_type_t *);
188 page_t *page_get_uflt(struct vnode *, u_offset_t, struct seg *, caddr_t,
189     size_t, uint_t, struct lgrp *);
190 page_t *page_get_kflt(struct vnode *, u_offset_t, struct seg *, caddr_t,
191     size_t, uint_t, struct lgrp *);
192 void page_kflt_walk_init(uchar_t, uint_t, uint_t, int, int,
193     page_list_walker_t *);
194 uint_t page_kflt_walk_next_bin(uchar_t, uint_t, page_list_walker_t *);
195 page_t *page_import_kflt(page_freelist_type_t *, uint_t, int, uchar_t,
196     uint_t, int *);
197 page_t *page_user_alloc_kflt(page_freelist_type_t *, int, uint_t, int, uchar_t,
198 	uint_t);
199 void kflt_expand(void);
200 
201 typedef page_t *(*pflt_get_func_p) (struct vnode *, u_offset_t, struct seg *,
202     caddr_t, size_t, uint_t, lgrp_t *);
203 typedef page_t *(*pflt_policy_func_p)(page_freelist_type_t *, int, uint_t, int,
204     uchar_t, uint_t);
205 typedef void (*pflt_list_walk_init_func_p)(uchar_t, uint_t, uint_t, int, int,
206     page_list_walker_t *);
207 typedef uint_t (*pflt_list_walk_next_func_p)(uchar_t, uint_t,
208     page_list_walker_t *);
209 
210 struct page_freelist_type {
211 	int pflt_type;			/* type is user or kernel */
212 	pflt_get_func_p pflt_get_free;  /* top-level alloc routine */
213 	pflt_list_walk_init_func_p pflt_walk_init;  /* walker routines */
214 	pflt_list_walk_next_func_p pflt_walk_next;
215 	int	pflt_num_policies;	/* the number of policy routines */
216 	/*
217 	 * the policy routines are called by the allocator routine
218 	 * to implement the actual allocation policies.
219 	 */
220 	pflt_policy_func_p pflt_policy[MAX_PFLT_POLICIES];
221 	page_t ****pflt_freelists;	/* the page freelist arrays */
222 };
223 
/*
 * PAGE_FREELISTP yields the address of a freelist head.
 * PAGE_GET_FREELISTS assigns to pp the result of the chosen freelist
 * type's pflt_get_free routine.
 *
 * On amd64 (non-xpv) both macros can select the kernel freelist type:
 * PAGE_FREELISTP when is_kflt is nonzero (indexing by mtype and color
 * only), PAGE_GET_FREELISTS when kflt_on is set and all PG_KFLT bits are
 * present in flags.  Elsewhere only the user freelist type is used.
 * The mnode argument of PAGE_FREELISTP is not used.
 */
#if defined(__amd64) && !defined(__xpv)

#define	PAGE_FREELISTP(is_kflt, mnode, szc, color, mtype)		\
	((is_kflt) ?							\
	(page_t **)&kfltp->pflt_freelists[mtype][(color)] :		\
	(&ufltp->pflt_freelists[mtype][szc][(color)]))

#define	PAGE_GET_FREELISTS(pp, vp, off, seg, vaddr, size, flags, lgrp)	     \
	{								     \
		if (!kflt_on || (((flags) & PG_KFLT) != PG_KFLT)) {	     \
			pp = ufltp->pflt_get_free(vp, off, seg, vaddr, size, \
			    flags, lgrp);				     \
		} else {						     \
			pp = kfltp->pflt_get_free(vp, off, seg, vaddr, size, \
			    flags, lgrp);				     \
		}							     \
	}

#else /* __amd64 && ! __xpv */

#define	PAGE_FREELISTP(is_kflt, mnode, szc, color, mtype)		\
	(&ufltp->pflt_freelists[mtype][szc][(color)])

/* NOTE: this variant is a bare statement that ends in its own `;' */
#define	PAGE_GET_FREELISTS(pp, vp, off, seg, vaddr, size, flags, lgrp)	     \
			pp = ufltp->pflt_get_free(vp, off, seg, vaddr, size, \
			    flags, lgrp);

#endif /* __amd64 && ! __xpv */
248 
/* the freelist head itself (an lvalue), via PAGE_FREELISTP */
#define	PAGE_FREELISTS(is_kflt, mnode, szc, color, mtype)		\
	(*(PAGE_FREELISTP(is_kflt, mnode, szc, color, mtype)))

/*
 * Accessors for a freelist type pointer `fp'.  fp is parenthesized so
 * that any pointer-valued expression (e.g. `tab + 1') may be passed.
 */

/* the i'th policy routine of fp */
#define	PAGE_GET_FREELISTS_POLICY(fp, i)				\
	((fp)->pflt_policy[i])

/* call fp's walker initialisation routine */
#define	PAGE_LIST_WALK_INIT(fp, szc, flags, bin, can_split, use_ceq, plw) \
	((fp)->pflt_walk_init(szc, flags, bin, can_split, use_ceq, plw))

/* call fp's walker next-bin routine; evaluates to its return value */
#define	PAGE_LIST_WALK_NEXT(fp, szc, bin, plw)				\
	((fp)->pflt_walk_next(szc, bin, plw))
260 
261 
262 /*
263  * For now there is only a single size cache list. Allocated dynamically.
264  * dimensions [mtype][colors]
265  *
266  * mtype specifies a physical memory range with a unique mnode.
267  */
268 extern page_t ***page_cachelists;
269 
270 #define	PAGE_CACHELISTS(mnode, color, mtype) 		\
271 	(*(page_cachelists[mtype] + (color)))
272 
273 /*
274  * There are mutexes for the user page freelist, the kernel page freelist
275  * and the page cachelist.  We want enough locks to make contention
276  * reasonable, but not too many -- otherwise page_freelist_lock() gets
277  * so expensive that it becomes the bottleneck!
278  */
279 
280 #define	NPC_MUTEX	16
281 
282 /*
283  * The kflt_disable variable is used to determine whether the kernel freelist
284  * is supported on this platform.
285  */
286 extern int kflt_disable;
287 
288 extern kmutex_t	*fpc_mutex[NPC_MUTEX];
289 extern kmutex_t	*kfpc_mutex[NPC_MUTEX];
290 extern kmutex_t	*cpc_mutex[NPC_MUTEX];
291 
/*
 * True when freelist type pointer `fltp' describes the kernel freelist.
 * fltp is parenthesized so any pointer-valued expression may be passed.
 */
#define	PC_ISKFLT(fltp)	((fltp)->pflt_type == PFLT_KMEM)

/* flag used by the kflt_export function when calling page_promote */
#define	PC_KFLT_EXPORT 0x4
295 
296 extern page_t *page_get_mnode_freelist(page_freelist_type_t *, int, uint_t,
297     int, uchar_t, uint_t);
298 extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
299 extern page_t *page_get_contig_pages(page_freelist_type_t *, int, uint_t, int,
300     uchar_t, uint_t);
301 extern void page_list_walk_init(uchar_t, uint_t, uint_t, int, int,
302     page_list_walker_t *);
303 extern uint_t page_list_walk_next_bin(uchar_t, uint_t, page_list_walker_t *);
304 
305 extern void kflt_evict_wakeup();
306 extern void kflt_freemem_add(pgcnt_t);
307 extern void kflt_freemem_sub(pgcnt_t);
308 
/* mem node iterator is not used on x86; both macros expand to nothing */
#define	MEM_NODE_ITERATOR_DECL(it)
#define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)

/*
 * interleaved_mnodes mode is never set on x86, therefore,
 * simply return the limits of the given mnode, which then
 * determines the length of hpm_counters array for the mnode.
 *
 * lo_pfn, hi_pfn and first_mn are lvalues that receive the mnode's
 * physbase, its physmax and the mnode itself.  The parameter names
 * deliberately differ from the mem_node_config member names: a macro
 * parameter called `physbase' would also be substituted into
 * `.physbase', so the macro would only compile when the caller's
 * variable happened to have that exact name.
 */
#define	HPM_COUNTERS_LIMITS(mnode, lo_pfn, hi_pfn, first_mn)	\
	{							\
		(lo_pfn) = mem_node_config[(mnode)].physbase;	\
		(hi_pfn) = mem_node_config[(mnode)].physmax;	\
		(first_mn) = (mnode);				\
	}
324 
/*
 * Take / drop the page counters write lock for `mnode'.
 *
 * Locking enters page_ctrs_rwlock[mnode] as a writer and then calls
 * page_freelist_lock(); unlocking undoes the two steps in reverse order.
 * Each macro expands to a brace-enclosed block.
 */
#define	PAGE_CTRS_WRITE_LOCK(mnode)					\
	{								\
		rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);	\
		page_freelist_lock(mnode);				\
	}

#define	PAGE_CTRS_WRITE_UNLOCK(mnode)					\
	{								\
		page_freelist_unlock(mnode);				\
		rw_exit(&page_ctrs_rwlock[(mnode)]);			\
	}
336 
/*
 * Macro to call page_ctrs_adjust() when memory is added
 * during a DR operation.
 *
 * Walks the pfn range [pfn, pfn + cnt) one memory node at a time: each
 * step covers the pages up to the end of the current node (its physmax)
 * or the end of the range, whichever comes first, and calls
 * page_ctrs_adjust() for that node.  rv receives each call's result and
 * the walk stops at the first nonzero one.  rv is left untouched when
 * the range is empty.
 */
#define	PAGE_CTRS_ADJUST(pfn, cnt, rv) {				       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	int _mn;							       \
	pgcnt_t _np;							       \
	pfn_t _pfn = (pfn);						       \
	pfn_t _endpfn = _pfn + _cnt;					       \
									       \
	while (_pfn < _endpfn) {					       \
		_mn = PFN_2_MEM_NODE(_pfn);				       \
		_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - _pfn;   \
		_pfn += _np;						       \
		if (((rv) = page_ctrs_adjust(_mn)) != 0)		       \
			break;						       \
	}								       \
}
355 
/*
 * Accessors for hw_page_array[], indexed by page size code (szc).
 */
#define	PAGE_GET_SHIFT(szc)						\
	(hw_page_array[(szc)].hp_shift)
#define	PAGE_GET_PAGECOLORS(szc)					\
	(hw_page_array[(szc)].hp_colors)
#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_pgcnt)

/* shift of size `szc' relative to the base page size (entry 0) */
#define	PNUM_SHIFT(szc)							\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)

/* difference in shift between page sizes nszc and szc */
#define	PAGE_GET_COLOR_SHIFT(szc, nszc)					\
	(hw_page_array[(nszc)].hp_shift - hw_page_array[(szc)].hp_shift)

/* scale a color of size nszc up by that shift difference */
#define	PAGE_CONVERT_COLOR(ncolor, szc, nszc)				\
	((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))

/*
 * Color of `pfn' for page size szc: mask with page_colors_mask, then
 * drop the bits below the size's shift.  The `it' argument is not used.
 */
#define	PFN_2_COLOR(pfn, szc, it)					\
	(((pfn) & page_colors_mask) >> PNUM_SHIFT(szc))
374 
/*
 * This macro calculates the next sequential pfn with the specified
 * color using color equivalency mask.
 *
 * pfn is updated in place and must be a modifiable lvalue.  Working in
 * units of size-szc pages (spfn): if the current page already matches
 * `color' under ceq_mask, advance by one full stride (ceq_mask + 1);
 * otherwise substitute the color bits and, if that did not move past
 * the current page, add one stride.  The color_mask and `it' arguments
 * are not used.
 */
#define	PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it)    \
	{								      \
		uint_t	pfn_shift = PAGE_BSZS_SHIFT(szc);		      \
		pfn_t	spfn = pfn >> pfn_shift;			      \
		pfn_t	stride = (ceq_mask) + 1;			      \
									      \
		ASSERT(((color) & ~(ceq_mask)) == 0);			      \
		ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);		      \
		if (((spfn ^ (color)) & (ceq_mask)) != 0) {		      \
			pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color);	      \
			if (!(pfn > spfn))				      \
				pfn = pfn + stride;			      \
			pfn = pfn << pfn_shift;				      \
		} else {						      \
			pfn += stride << pfn_shift;			      \
		}							      \
	}
393 
/* fold a user bin into the KFLT_PAGE_COLORS kernel freelist bins */
#define	USER_2_KMEM_BIN(bin)						\
	((bin) & (KFLT_PAGE_COLORS - 1))

/* get the color equivalency mask for the next szc */
#define	PAGE_GET_NSZ_MASK(szc, mask)					\
	((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))

/* get the color of the next szc */
#define	PAGE_GET_NSZ_COLOR(szc, color)					\
	((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc)))
403 
/*
 * Accessors taking a page pointer `pp'.  pp is parenthesized wherever it
 * is dereferenced so that any pointer-valued expression may be passed.
 */

/* Find the bin for the given page if it was of size szc */
#define	PP_2_BIN_SZC(pp, szc)	(PFN_2_COLOR((pp)->p_pagenum, szc, NULL))

/* bin for the page's own p_szc, folded to a kernel bin for KFLT pages */
#define	PP_2_BIN(pp)		((PP_ISKFLT(pp)) ?			\
	USER_2_KMEM_BIN(PP_2_BIN_SZC(pp, (pp)->p_szc)) :		\
	(PP_2_BIN_SZC(pp, (pp)->p_szc)))

#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
#define	PP_2_MTYPE(pp)		(pfn_2_mtype((pp)->p_pagenum))
#define	PP_2_SZC(pp)		((pp)->p_szc)

/* number of base pages in a page of size szc */
#define	SZCPAGES(szc)		(1 << PAGE_BSZS_SHIFT(szc))
/* pfnum rounded down to a multiple of SZCPAGES(szc) */
#define	PFN_BASE(pfnum, szc)	((pfnum) & ~(SZCPAGES(szc) - 1))
417 
418 void	page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
419     int can_split, int use_ceq, page_list_walker_t *plw);
420 
421 uint_t	page_list_walk_next_bin(uchar_t szc, uint_t bin,
422     page_list_walker_t *plw);
423 
424 extern struct cpu	cpus[];
425 #define	CPU0		cpus
426 
427 extern int mtype_init(vnode_t *, caddr_t, uint_t *, size_t);
428 #define	MTYPE_INIT(mtype, vp, vaddr, flags, pgsz)		\
429 	(mtype = mtype_init(vp, vaddr, &(flags), pgsz))
430 
/*
 * macros to loop through the mtype range (page_get_mnode_{free,cache,any}list,
 * and page_get_contig_pages)
 *
 * MTYPE_START sets the initial mtype. -1 if the mtype range specified does
 * not contain mnode.
 *
 * MTYPE_NEXT sets the next mtype. -1 if there are no more valid
 * mtype in the range.
 *
 * mtype must be a modifiable lvalue.  flags is parenthesized before
 * being combined with `&' and `|' so that compound flag expressions
 * (e.g. `a | b') are tested and passed on as a whole.
 */

#define	MTYPE_START(mnode, mtype, flags)				\
	((mtype) = mtype_func(mnode, mtype, flags))

#define	MTYPE_NEXT(mnode, mtype, flags) {				\
	if ((flags) & PGI_MT_RANGE) {					\
		(mtype) = mtype_func(mnode, mtype,			\
		    (flags) | PGI_MT_NEXT);				\
	} else {							\
		(mtype) = -1;						\
	}								\
}
452 
453 extern int mtype_pgr_init(int *, page_t *, int, pgcnt_t);
454 #define	MTYPE_PGR_INIT(mtype, flags, pp, mnode, pgcnt)			\
455 	(mtype = mtype_pgr_init(&flags, pp, mnode, pgcnt))
456 
457 #define	MNODE_PGCNT(mnode)		mnode_pgcnt(mnode)
458 
459 extern void mnodetype_2_pfn(int, int, pfn_t *, pfn_t *);
460 #define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)			\
461 	mnodetype_2_pfn(mnode, mtype, &pfnlo, &pfnhi)
462 
/*
 * Page list mutex selection.  The mutex arrays are indexed
 * [bin & (NPC_MUTEX - 1)][mnode].
 *
 * PC_FREELIST_BIN_MUTEX picks the kernel or user freelist mutex
 * depending on is_kflt (its flags argument is not used).
 * PC_BIN_MUTEX picks a freelist mutex when PG_FREE_LIST is set in flags
 * and the cachelist mutex otherwise; flags is parenthesized so that a
 * compound expression is tested as a whole.
 */
#define	PC_FREELIST_BIN_MUTEX(is_kflt, mnode, bin, flags)		\
	((is_kflt) ?							\
	(&kfpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode]) :			\
	(&fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode]))

#define	PC_BIN_MUTEX(is_kflt, mnode, bin, flags)			\
	(((flags) & PG_FREE_LIST) ?					\
	PC_FREELIST_BIN_MUTEX(is_kflt, mnode, bin, flags) :		\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode])

/* direct access by mutex index i */
#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[i][mnode])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[i][mnode])
#define	KFPC_MUTEX(mnode, i)	(&kfpc_mutex[i][mnode])
476 
/* CHK_LPG calls chk_lpg() in DEBUG builds and expands to nothing otherwise */
#ifdef DEBUG
#define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
extern void	chk_lpg(page_t *, uchar_t);
#else
#define	CHK_LPG(pp, szc)
#endif

/*
 * LEVEL_SIZE(rg_szc) shifted right by the LEVEL_SHIFT of the next
 * smaller size.  rg_szc is parenthesized before the subtraction so that
 * a compound argument is decremented as a whole.
 */
#define	FULL_REGION_CNT(rg_szc)	\
	(LEVEL_SIZE(rg_szc) >> LEVEL_SHIFT((rg_szc) - 1))

/* Return the leader for this mapping size */
#define	PP_GROUPLEADER(pp, szc) \
	(&(pp)[-(int)((pp)->p_pagenum & (SZCPAGES(szc)-1))])

/* Return the root page for this page based on p_szc */
#define	PP_PAGEROOT(pp) ((pp)->p_szc == 0 ? (pp) : \
	PP_GROUPLEADER((pp), (pp)->p_szc))
494 
/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 *
 * PC_BASE_ALIGN is the number of base pages in the largest page size
 * (szc MMU_PAGE_SIZES - 1), expressed as a pfn_t.
 */
#define	PC_BASE_ALIGN							\
	(((pfn_t)1) << PAGE_BSZS_SHIFT(MMU_PAGE_SIZES - 1))
#define	PC_BASE_ALIGN_MASK						\
	(PC_BASE_ALIGN - 1)
507 
508 /*
509  * cpu/mmu-dependent vm variables
510  */
511 extern uint_t mmu_page_sizes;
512 extern uint_t mmu_exported_page_sizes;
513 /*
514  * page sizes that legacy applications can see via getpagesizes(3c).
515  * Used to prevent legacy applications from inadvertantly using the
516  * 'new' large pagesizes (1g and above).
517  */
518 extern uint_t mmu_legacy_page_sizes;
519 
/*
 * For x86, userszc is the same as the kernel's szc, so both conversions
 * are the identity.
 */
#define	USERSZC_2_SZC(userszc)	(userszc)
#define	SZC_2_USERSZC(szc)	(szc)

/*
 * Counter type for hw_page_map_t, sized to hold the ratio of large page
 * to base pagesize (1024 max).
 */
typedef	short	hpmctr_t;
529 
/*
 * Get the setsize of the current cpu - assume homogeneous for x86.
 *
 * CPUSETSIZE() is l2cache_sz divided by l2cache_assoc, or MMU_PAGESIZE
 * when l2cache_assoc is zero.
 */
extern int	l2cache_sz, l2cache_linesz, l2cache_assoc;

#define	L2CACHE_ALIGN		l2cache_linesz
#define	L2CACHE_ALIGN_MAX	64
#define	CPUSETSIZE()							\
	((l2cache_assoc != 0) ? (l2cache_sz / l2cache_assoc) : MMU_PAGESIZE)
539 
/*
 * Return the log2(pagesize(szc) / MMU_PAGESIZE) --- or the shift count
 * for the number of base pages in this pagesize
 */
#define	PAGE_BSZS_SHIFT(szc)	(LEVEL_SHIFT(szc) - MMU_PAGESHIFT)

/*
 * Internal PG_ flags.
 */
#define	PGI_RELOCONLY	0x00010000	/* opposite of PG_NORELOC */
#define	PGI_NOCAGE	0x00020000	/* cage is disabled */
#define	PGI_PGCPHIPRI	0x00040000	/* page_get_contig_page pri alloc */
#define	PGI_PGCPSZC0	0x00080000	/* relocate base pagesize page */

/*
 * PGI range flags - should not overlap PGI flags
 */
#define	PGI_MT_RANGE0	0x01000000	/* mtype range to 0 */
#define	PGI_MT_RANGE16M	0x02000000	/* mtype range to 16m */
#define	PGI_MT_RANGE4G	0x04000000	/* mtype range to 4g */
#define	PGI_MT_NEXT	0x08000000	/* get next mtype */
/* any of the three range flags */
#define	PGI_MT_RANGE	(PGI_MT_RANGE0 | PGI_MT_RANGE16M | PGI_MT_RANGE4G)

/* Flag to avoid allocating a page in page_import_kflt() */
#define	PGI_NOPGALLOC	0x10000000
565 
566 /*
567  * Maximum and default values for user heap, stack, private and shared
568  * anonymous memory, and user text and initialized data.
569  * Used by map_pgsz*() routines.
570  */
571 extern size_t max_uheap_lpsize;
572 extern size_t default_uheap_lpsize;
573 extern size_t max_ustack_lpsize;
574 extern size_t default_ustack_lpsize;
575 extern size_t max_privmap_lpsize;
576 extern size_t max_uidata_lpsize;
577 extern size_t max_utext_lpsize;
578 extern size_t max_shm_lpsize;
579 extern size_t mcntl0_lpsize;
580 
581 /*
582  * Sanity control. Don't use large pages regardless of user
583  * settings if there's less than priv or shm_lpg_min_physmem memory installed.
584  * The units for this variable are 8K pages.
585  */
586 extern pgcnt_t privm_lpg_min_physmem;
587 extern pgcnt_t shm_lpg_min_physmem;
588 
/*
 * Hash as and addr to get a bin.
 *
 * AS_2_USER_BIN assigns to bin: the page number of addr plus (as >> 4),
 * masked with page_colors_mask and shifted down by the difference
 * between the size-szc shift and the base page shift.  The seg and vp
 * arguments are not used.
 *
 * AS_2_BIN does the same and then, when is_kflt is nonzero, folds the
 * result with USER_2_KMEM_BIN().
 */
#define	AS_2_USER_BIN(as, seg, vp, addr, bin, szc)			    \
	bin = (((((uintptr_t)(addr) >> PAGESHIFT) +			    \
	    ((uintptr_t)(as) >> 4)) & page_colors_mask) >>		    \
	    (hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))

#define	AS_2_BIN(is_kflt, as, seg, vp, addr, bin, szc) {		    \
	AS_2_USER_BIN(as, seg, vp, addr, bin, szc);			    \
	if (is_kflt) {							    \
		bin = USER_2_KMEM_BIN(bin);				    \
	}								    \
}
/*
 * cpu private vm data - accessed thru CPU->cpu_vm_data
 *	vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
 *	vc_pnext_memseg: tracks last memseg visited in page_nextn()
 *	vc_kmptr: original unaligned kmem pointer for this vm_cpu_data_t
 *	vc_kmsize: original kmem size for this vm_cpu_data_t
 */
typedef struct {
	struct memseg	*vc_pnum_memseg;
	struct memseg	*vc_pnext_memseg;
	void		*vc_kmptr;
	size_t		vc_kmsize;
} vm_cpu_data_t;

/*
 * Allocation size to ensure vm_cpu_data_t resides in its own cache line:
 * the structure size rounded up to a multiple of L2CACHE_ALIGN_MAX.
 */
#define	VM_CPU_DATA_PADSIZE						\
	(P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))

/* for boot cpu before kmem is initialized */
extern char	vm_cpu_data0[];
625 
/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look.  Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target
 * bin twice to make sure we get all of the bins.
 */
#define	BIN_STEP	19
643 
#ifdef VM_STATS
/*
 * VM statistics counters, present only when VM_STATS is defined.
 * Each group below is headed by the routine(s) named in the original
 * per-member comments.
 */
struct vmm_vmstats_str {
	/* page_get_uflt and page_get_kflt */
	ulong_t pgf_alloc[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
	ulong_t pgf_allocok[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
	ulong_t pgf_allocokrem[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
	ulong_t pgf_allocfailed[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
	ulong_t	pgf_allocdeferred;
	ulong_t	pgf_allocretry[MMU_PAGE_SIZES][MAX_PFLT_TYPE];

	/* page_import_kflt */
	ulong_t pgik_allocok;
	ulong_t pgik_allocfailed;

	/* kflt_expand */
	ulong_t pgkx_allocok;
	ulong_t pgkx_allocfailed;

	/* page_user_alloc_kflt */
	ulong_t puak_allocok;
	ulong_t puak_allocfailed;

	/* kflt_export */
	ulong_t pgexportok;
	ulong_t pgexportfail;

	/* kflt_user_evict */
	ulong_t pgkflt_disable;

	/* page_get_cachelist */
	ulong_t pgc_alloc;
	ulong_t pgc_allocok;
	ulong_t pgc_allocokrem;
	ulong_t pgc_allocokdeferred;
	ulong_t pgc_allocfailed;

	/* page_get_contig_pages */
	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];

	/* page_trylock_contig_pages */
	ulong_t	ptcp[MMU_PAGE_SIZES];
	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
	ulong_t	ptcpfailkflt[MMU_PAGE_SIZES];
	ulong_t	ptcpok[MMU_PAGE_SIZES];

	/* page_get_mnode_freelist */
	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];

	/* page_get_mnode_cachelist */
	ulong_t	pgmc_alloc;
	ulong_t	pgmc_allocfailed;
	ulong_t	pgmc_allocempty;
	ulong_t	pgmc_allocok;

	/* page_list_add/sub */
	ulong_t	pladd_free[MMU_PAGE_SIZES];
	ulong_t	plsub_free[MMU_PAGE_SIZES];
	ulong_t	pladd_cache;
	ulong_t	plsub_cache;
	ulong_t	plsubpages_szcbig;
	ulong_t	plsubpages_szc0;

	/* page_freelist_split */
	ulong_t	pfs_req[MMU_PAGE_SIZES];
	ulong_t	pfs_demote[MMU_PAGE_SIZES];
	ulong_t	pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];

	/* page_relocate */
	ulong_t	ppr_reloc[MMU_PAGE_SIZES];
	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
	ulong_t ppr_relocok[MMU_PAGE_SIZES];
	ulong_t ppr_copyfail;

	/* page coalesce counter */
	ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* candidates useful */
	ulong_t page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* ctrs changed after locking */
	ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* page_freelist_coalesce failed */
	ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	/* page coalesce all counter */
	ulong_t page_ctrs_coalesce_all;
	/* candidates useful for all func */
	ulong_t page_ctrs_cands_skip_all;

	ulong_t	restrict4gcnt;
	ulong_t	unrestrict16mcnt;	/* non-DMA 16m allocs allowed */
	ulong_t	pgpanicalloc;		/* PG_PANIC allocation */

	/* page_chk_freelist */
	ulong_t	pcf_deny[MMU_PAGE_SIZES];
	ulong_t	pcf_allow[MMU_PAGE_SIZES];
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif	/* VM_STATS */
720 
721 extern size_t page_ctrs_sz(void);
722 extern caddr_t page_ctrs_alloc(caddr_t);
723 extern void page_ctr_sub(int, int, page_t *, int);
724 extern page_t *page_freelist_split(uchar_t,
725     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
726 extern page_t *page_freelist_coalesce(int, uchar_t, uint_t, uint_t, int,
727     pfn_t);
728 extern void page_freelist_coalesce_all(int);
729 extern uint_t page_get_pagecolors(uint_t);
730 extern void pfnzero(pfn_t, uint_t, uint_t);
731 
732 #ifdef	__cplusplus
733 }
734 #endif
735 
736 #endif	/* _VM_DEP_H */
737