xref: /titanic_52/usr/src/uts/sun4/vm/vm_dep.h (revision ea46d7619be99679c4c99ed47508abe31d5e0979)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * UNIX machine dependent virtual memory support.
27  */
28 
29 #ifndef	_VM_DEP_H
30 #define	_VM_DEP_H
31 
32 #ifdef	__cplusplus
33 extern "C" {
34 #endif
35 
36 #include <vm/hat_sfmmu.h>
37 #include <sys/archsystm.h>
38 #include <sys/memnode.h>
39 
/* Expands to a call to gettick(). */
#define	GETTICK()	gettick()

/*
 * Alias kept so that common code can stay architecturally neutral;
 * it expands to the same gettick() call as GETTICK().
 */
#define	randtick()	GETTICK()
44 
45 /*
46  * Per page size free lists. Allocated dynamically.
47  */
/* Memory type (mtype) indices. */
#define	MAX_MEM_TYPES	2	/* 0 = reloc, 1 = noreloc */
#define	MTYPE_RELOC	0
#define	MTYPE_NORELOC	1

/* mtype of a page: MTYPE_NORELOC if PP_ISNORELOC(pp), else MTYPE_RELOC */
#define	PP_2_MTYPE(pp)	(PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC)

/*
 * Set mtype from the PG_NORELOC bit of flags.  vp, vaddr and pgsz are
 * accepted but not used by this implementation.
 *
 * flags is parenthesized so that an expression argument (e.g. "a | b")
 * is masked as a whole.  The expansion keeps its trailing semicolon so
 * existing call sites are unaffected.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags, pgsz)			\
	(mtype) = ((flags) & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;

/*
 * mtype init for page_get_replacement_page.  Same rule as MTYPE_INIT;
 * pp, mnode and pgcnt are accepted but not used here.
 */
#define	MTYPE_PGR_INIT(mtype, flags, pp, mnode, pgcnt)			\
	(mtype) = ((flags) & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;
60 
/*
 * Load pfnlo and pfnhi with the physbase and physmax recorded in
 * mem_node_config[] for mnode.  mtype is not consulted.  Note that the
 * expansion is two complete statements.
 */
#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)			\
	(pfnlo) = mem_node_config[(mnode)].physbase;			\
	(pfnhi) = mem_node_config[(mnode)].physmax;
64 
/*
 * Candidate counters in vm_pagelist.c are indexed by color and range.
 * Here there is exactly one range per memory type, so the range count
 * is MAX_MEM_TYPES and an mtype maps to itself.
 */
#define	MAX_MNODE_MRANGES		MAX_MEM_TYPES
#define	MNODE_RANGE_CNT(mnode)		MAX_MNODE_MRANGES
#define	MNODE_MAX_MRANGE(mnode)		(MAX_MNODE_MRANGES - 1)
#define	MTYPE_2_MRANGE(mnode, mtype)	(mtype)
72 
73 /*
74  * Internal PG_ flags.
75  */
76 #define	PGI_RELOCONLY	0x10000	/* acts in the opposite sense to PG_NORELOC */
77 #define	PGI_NOCAGE	0x20000	/* indicates Cage is disabled */
78 #define	PGI_PGCPHIPRI	0x40000	/* page_get_contig_page priority allocation */
79 #define	PGI_PGCPSZC0	0x80000	/* relocate base pagesize page */
80 
81 /*
82  * PGI mtype flags - should not overlap PGI flags
83  */
84 #define	PGI_MT_RANGE	0x1000000	/* mtype range */
85 #define	PGI_MT_NEXT	0x2000000	/* get next mtype */
86 
87 extern page_t ***page_cachelists[MAX_MEM_TYPES];
88 
/* The page_cachelists[] entry selected by mtype, mnode and color. */
#define	PAGE_CACHELISTS(mnode, color, mtype) \
	(page_cachelists[mtype][mnode][(color)])
91 
92 /*
93  * There are 'page_colors' colors/bins.  Spread them out under a
94  * couple of locks.  There are mutexes for both the page freelist
95  * and the page cachelist.  We want enough locks to make contention
96  * reasonable, but not too many -- otherwise page_freelist_lock() gets
97  * so expensive that it becomes the bottleneck!
98  */
99 #define	NPC_MUTEX	16
100 
101 extern kmutex_t	*fpc_mutex[NPC_MUTEX];
102 extern kmutex_t	*cpc_mutex[NPC_MUTEX];
103 
104 /*
105  * Iterator provides the info needed to convert RA to PA.
106  * MEM_NODE_ITERATOR_INIT() should be called before
107  * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous
108  * PAGE_NEXT_PFN_FOR_COLOR() call. Iterator caches color 2 hash
109  * translations requiring initializer call if color or ceq_mask changes,
110  * even if pfn doesn't. MEM_NODE_ITERATOR_INIT() must also be called before
111  * PFN_2_COLOR() that uses a valid iterator argument.
112  *
113  * plat_mem_node_iterator_init() starts from last mblock in continuation
114  * case which may be invalid because of memory DR.  To detect this situation
115  * mi_genid is checked against mpo_genid which is incremented after a
116  * memory DR operation.  See also plat_slice_add()/plat_slice_del().
117  */
118 #ifdef	sun4v
119 
120 typedef struct mem_node_iterator {
121 	uint_t mi_mnode;		/* mnode in which to iterate */
122 	int mi_init;			/* set to 1 when first init */
123 	int mi_genid;			/* set/checked against mpo_genid */
124 	int mi_last_mblock;		/* last mblock visited */
125 	uint_t mi_hash_ceq_mask;	/* cached copy of ceq_mask */
126 	uint_t mi_hash_color;		/* cached copy of color */
127 	uint_t mi_mnode_mask;		/* number of mask bits */
128 	uint_t mi_mnode_pfn_shift;	/* mnode position in pfn */
129 	pfn_t mi_mblock_base;		/* first valid pfn in current mblock */
130 	pfn_t mi_mblock_end;		/* last valid pfn in current mblock */
131 	pfn_t mi_ra_to_pa;		/* ra adjustment for current mblock */
132 	pfn_t mi_mnode_pfn_mask;	/* mask to obtain mnode id bits */
133 } mem_node_iterator_t;
134 
135 #define	MEM_NODE_ITERATOR_DECL(it) \
136 	mem_node_iterator_t it
137 #define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it) \
138 	(pfn) = plat_mem_node_iterator_init((pfn), (mnode), (szc), (it), 1)
139 
140 extern pfn_t plat_mem_node_iterator_init(pfn_t, int, uchar_t,
141     mem_node_iterator_t *, int);
142 extern pfn_t plat_rapfn_to_papfn(pfn_t);
143 extern int interleaved_mnodes;
144 
145 #else	/* sun4v */
146 
147 #define	MEM_NODE_ITERATOR_DECL(it) \
148 	void *it = NULL
149 #define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)
150 
151 #endif	/* sun4v */
152 
153 /*
154  * Return the mnode limits so that hpc_counters length and base
155  * index can be determined. When interleaved_mnodes is set, we
156  * create an array only for the first mnode that exists. All other
157  * mnodes will share the array in this case.
158  * If interleaved_mnodes is not set, simply return the limits for
159  * the given mnode.
160  */
161 #define	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first)		\
162 	if (!interleaved_mnodes) {					\
163 		(physbase) = mem_node_config[(mnode)].physbase;		\
164 		(physmax) = mem_node_config[(mnode)].physmax;		\
165 		(first) = (mnode);					\
166 	} else if ((first) < 0) {					\
167 		mem_node_max_range(&(physbase), &(physmax));		\
168 		(first) = (mnode);					\
169 	}
170 
/*
 * Enter page_ctrs_rwlock[] as writer and take the page freelist lock.
 * With interleaved_mnodes set the hpm_counters are shared, so this is
 * done for every mnode from 0 to max_mem_nodes - 1 rather than just
 * for the given one.
 */
#define	PAGE_CTRS_WRITE_LOCK(mnode)					\
	if (interleaved_mnodes) {					\
		/* changing shared hpm_counters */			\
		int _n;							\
		for (_n = 0; _n < max_mem_nodes; _n++) {		\
			rw_enter(&page_ctrs_rwlock[_n], RW_WRITER);	\
			page_freelist_lock(_n);				\
		}							\
	} else {							\
		rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);	\
		page_freelist_lock(mnode);				\
	}

/*
 * Undo PAGE_CTRS_WRITE_LOCK(): for each mnode involved, drop the page
 * freelist lock and then exit page_ctrs_rwlock[].
 */
#define	PAGE_CTRS_WRITE_UNLOCK(mnode)					\
	if (interleaved_mnodes) {					\
		int _n;							\
		for (_n = 0; _n < max_mem_nodes; _n++) {		\
			page_freelist_unlock(_n);			\
			rw_exit(&page_ctrs_rwlock[_n]);			\
		}							\
	} else {							\
		page_freelist_unlock(mnode);				\
		rw_exit(&page_ctrs_rwlock[(mnode)]);			\
	}
195 
196 /*
197  * cpu specific color conversion functions
198  */
199 extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
200 #pragma weak page_get_nsz_color_mask_cpu
201 
202 extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
203 #pragma weak page_get_nsz_color_cpu
204 
205 extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
206 #pragma weak page_get_color_shift_cpu
207 
208 extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t);
209 #pragma weak page_convert_color_cpu
210 
211 extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
212     uchar_t, uint_t, uint_t, uint_t, void *);
213 #pragma weak page_next_pfn_for_color_cpu
214 
215 extern uint_t  page_pfn_2_color_cpu(pfn_t, uchar_t, void *);
216 #pragma weak page_pfn_2_color_cpu
217 
/*
 * Color shift between page size codes szc and nszc.  If the weak
 * symbol page_get_color_shift_cpu is not present, the shift is the
 * difference of the two hw_page_array[] hp_shift values; otherwise the
 * cpu specific routine decides.
 */
#define	PAGE_GET_COLOR_SHIFT(szc, nszc)				\
	((&page_get_color_shift_cpu == NULL) ?			\
	    (hw_page_array[(nszc)].hp_shift -			\
		hw_page_array[(szc)].hp_shift) :		\
	    page_get_color_shift_cpu(szc, nszc))

/*
 * Convert ncolor between page size codes.  If the weak symbol
 * page_convert_color_cpu is not present, ncolor is shifted left by
 * PAGE_GET_COLOR_SHIFT(szc, nszc); otherwise the cpu specific routine
 * is used.
 */
#define	PAGE_CONVERT_COLOR(ncolor, szc, nszc)			\
	((&page_convert_color_cpu == NULL) ?			\
	    ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))) :	\
	    page_convert_color_cpu(ncolor, szc, nszc))
228 
/*
 * Color of pfn for page size code szc.  When the weak symbol
 * page_pfn_2_color_cpu is present it is called with (pfn, szc, it);
 * otherwise the color is the pfn masked with the number of base page
 * colors (hw_page_array[0].hp_colors - 1) and shifted right by the
 * difference between the szc and base page shifts.
 *
 * pfn and szc are parenthesized in the fallback path so that an
 * expression argument (e.g. "a | b") is masked as a whole.
 */
#define	PFN_2_COLOR(pfn, szc, it)				\
	((&page_pfn_2_color_cpu != NULL) ?			\
	    page_pfn_2_color_cpu(pfn, szc, it) :		\
	    (((pfn) & (hw_page_array[0].hp_colors - 1)) >>	\
		(hw_page_array[(szc)].hp_shift -		\
		    hw_page_array[0].hp_shift)))
235 
/* Accessors for the hw_page_array[] entry of page size code szc. */
#define	PAGE_GET_SHIFT(szc)						\
	(hw_page_array[(szc)].hp_shift)
#define	PAGE_GET_PAGECOLORS(szc)					\
	(hw_page_array[(szc)].hp_colors)
#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_pgcnt)
/* shift of szc relative to the base page size (entry 0) */
#define	PNUM_SHIFT(szc)							\
	(PAGE_GET_SHIFT(szc) - PAGE_GET_SHIFT(0))
244 
/*
 * Advance pfn to the next sequential pfn that has the requested color
 * under the color equivalency mask ceq_mask (which must be of the form
 * 2^n - 1, and color must lie within it).
 *
 * When the weak symbol page_next_pfn_for_color_cpu is present the work
 * is delegated to it.  Otherwise pfn is viewed in units of the szc
 * page size: if the current unit already has the color, step forward
 * by one stride (ceq_mask + 1) of units; if not, move to the unit with
 * the requested color bits, adding a stride when that unit would not
 * lie beyond the current one.
 */
#define	PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it)   \
	{								     \
		ASSERT(((color) & ~(ceq_mask)) == 0);			     \
		if (&page_next_pfn_for_color_cpu != NULL) {		     \
			pfn = page_next_pfn_for_color_cpu(pfn, szc, color,   \
			    ceq_mask, color_mask, it);			     \
		} else {						     \
			uint_t	_shift = PAGE_BSZS_SHIFT(szc);		     \
			pfn_t	_cur = pfn >> _shift;			     \
			pfn_t	_step = (ceq_mask) + 1;			     \
			ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);	     \
			if (((_cur ^ (color)) & (ceq_mask)) != 0) {	     \
				pfn = (_cur & ~(pfn_t)(ceq_mask)) | (color); \
				pfn = ((pfn > _cur) ? pfn : pfn + _step) <<  \
				    _shift;				     \
			} else {					     \
				pfn += _step << _shift;			     \
			}						     \
		}							     \
	}
269 
/*
 * Color equivalency mask for the next larger szc.  Delegates to the
 * weak symbol page_get_nsz_color_mask_cpu when it is present; else the
 * mask is shifted right by the difference between the (szc + 1) and
 * szc page shifts.
 */
#define	PAGE_GET_NSZ_MASK(szc, mask)					     \
	((&page_get_nsz_color_mask_cpu != NULL) ?			     \
	    page_get_nsz_color_mask_cpu(szc, mask) :			     \
	    ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))))

/*
 * Color in the next larger szc.  Delegates to the weak symbol
 * page_get_nsz_color_cpu when it is present; else the color is shifted
 * right by the difference between the (szc + 1) and szc page shifts.
 */
#define	PAGE_GET_NSZ_COLOR(szc, color)					     \
	((&page_get_nsz_color_cpu != NULL) ?				     \
	    page_get_nsz_color_cpu(szc, color) :			     \
	    ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))))
281 
/*
 * Find the bin for the given page if it was of size szc.  The color is
 * computed from the page's p_pagenum; (void *)(-1) is passed as the
 * iterator argument of PFN_2_COLOR().
 *
 * pp is parenthesized before "->" in all three macros so that an
 * expression argument such as "&page" or "pp + 1" selects the member
 * of the intended page.
 */
#define	PP_2_BIN_SZC(pp, szc)	\
	(PFN_2_COLOR((pp)->p_pagenum, szc, (void *)(-1)))

/* Bin of the page at its own page size code (p_szc). */
#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, (pp)->p_szc))

/* Memory node of the page, derived from its p_pagenum. */
#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
288 
/*
 * Mutex covering the given bin on mnode: taken from fpc_mutex[] when
 * PG_FREE_LIST is set in flags, from cpc_mutex[] otherwise.  Bins are
 * folded onto the NPC_MUTEX locks by masking with (NPC_MUTEX - 1).
 * iskflt is accepted but not used by this implementation.
 *
 * flags is parenthesized so that an expression argument (e.g. "a | b")
 * is tested against PG_FREE_LIST as a whole.
 */
#define	PC_BIN_MUTEX(iskflt, mnode, bin, flags) (((flags) & PG_FREE_LIST) ? \
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] :			    \
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode])

/* i-th freelist / cachelist mutex of mnode */
#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[i][mnode])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[i][mnode])
295 
/*
 * Round pfnum down to a multiple of the number of base pages in a page
 * of size code szc (1 << PAGE_BSZS_SHIFT(szc)).  pfnum is parenthesized
 * so that an expression argument is masked as a whole.
 */
#define	PFN_BASE(pfnum, szc)	\
	((pfnum) & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1))
297 
298 /*
299  * this structure is used for walking free page lists
300  * controls when to split large pages into smaller pages,
301  * and when to coalesce smaller pages into larger pages
302  */
303 typedef struct page_list_walker {
304 	uint_t	plw_colors;		/* num of colors for szc */
305 	uint_t  plw_color_mask;		/* colors-1 */
306 	uint_t	plw_bin_step;		/* next bin: 1 or 2 */
307 	uint_t  plw_count;		/* loop count */
308 	uint_t	plw_bin0;		/* starting bin */
309 	uint_t  plw_bin_marker;		/* bin after initial jump */
310 	uint_t  plw_bin_split_prev;	/* last bin we tried to split */
311 	uint_t  plw_do_split;		/* set if OK to split */
312 	uint_t  plw_split_next;		/* next bin to split */
313 	uint_t	plw_ceq_dif;		/* number of different color groups */
314 					/* to check */
315 	uint_t	plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
316 	uint_t	plw_bins[MMU_PAGE_SIZES + 1];	/* num of bins */
317 } page_list_walker_t;
318 
319 void	page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
320     int can_split, int use_ceq, page_list_walker_t *plw);
321 
322 /*
323  * Page freelists have a single freelist type, the user page freelist. The
324  * kernel page freelist is disabled on SPARC platforms. The definitions related
325  * to the freelist type structure are grouped below.
326  */
327 
328 #define	MAX_PFLT_POLICIES 3
329 #define	MAX_PFLT_TYPE 2
330 enum freelist_types {PFLT_USER, PFLT_KMEM};
331 
332 /*
333  * The kernel only needs a small number of page colors, far fewer than user
334  * programs.
335  */
336 #define	KFLT_PAGE_COLORS 16
337 	/* flag used by the kflt_export function when calling page_promote */
338 #define	PC_KFLT_EXPORT 0x4
/*
 * True when the freelist type pointed to by fltp is PFLT_KMEM.  fltp is
 * parenthesized so that an expression argument (e.g. "&flt") works.
 */
#define	PC_ISKFLT(fltp) ((fltp)->pflt_type == PFLT_KMEM)
340 
341 typedef struct page_freelist_type page_freelist_type_t;
342 extern page_freelist_type_t flt_user;
343 extern page_freelist_type_t *ufltp;
344 
345 typedef page_t *(*pflt_get_func_p) (struct vnode *, u_offset_t, struct seg *,
346     caddr_t, size_t, uint_t, lgrp_t *);
347 typedef page_t *(*pflt_policy_func_p)(page_freelist_type_t *, int, uint_t, int,
348     uchar_t, uint_t);
349 typedef void (*pflt_list_walk_init_func_p)(uchar_t, uint_t, uint_t, int, int,
350     page_list_walker_t *);
351 typedef uint_t (*pflt_list_walk_next_func_p)(uchar_t, uint_t,
352     page_list_walker_t *);
353 
354 page_t *page_get_uflt(struct vnode *, u_offset_t, struct seg *, caddr_t,
355     size_t, uint_t, struct lgrp *);
356 extern page_t *page_get_mnode_freelist(page_freelist_type_t *, int, uint_t,
357     int, uchar_t, uint_t);
358 extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
359 extern page_t *page_get_contig_pages(page_freelist_type_t *, int, uint_t, int,
360     uchar_t, uint_t);
361 extern void page_list_walk_init(uchar_t, uint_t, uint_t, int, int,
362     page_list_walker_t *);
363 extern uint_t page_list_walk_next_bin(uchar_t, uint_t, page_list_walker_t *);
364 
365 /*
366  * Page freelists are organized as freelist types, on Sparc systems there
367  * is only a single user freelist type as the kernel cage provides a
368  * similar function to kernel freelist in that it prevents memory
369  * fragmentation.
370  *
371  * The page freelists have fixed page size and memory type dimensions.
372  * the 3rd (max_mem_nodes) and 4th (page coloring bins) dimensions are
373  * allocated dynamically.
374  */
375 struct page_freelist_type {
376 	int pflt_type;
377 	pflt_get_func_p pflt_get_free;
378 	pflt_list_walk_init_func_p pflt_walk_init;
379 	pflt_list_walk_next_func_p pflt_walk_next;
380 	int	pflt_num_policies;
381 	pflt_policy_func_p pflt_policy[MAX_PFLT_POLICIES];
382 	page_t ***pflt_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
383 };
384 
/*
 * Address of the user freelist slot selected by szc, mtype, mnode and
 * color.  is_kflt is accepted but not used; ufltp is always consulted.
 */
#define	PAGE_FREELISTP(is_kflt, mnode, szc, color, mtype)		\
	(&ufltp->pflt_freelists[szc][mtype][mnode][(color)])

/* Contents of that same freelist slot. */
#define	PAGE_FREELISTS(is_kflt, mnode, szc, color, mtype)		\
	(ufltp->pflt_freelists[szc][mtype][mnode][(color)])

/*
 * Assign to pp the result of the user freelist type's pflt_get_free
 * routine.  The expansion is a complete statement (it carries its own
 * semicolon).
 */
#define	PAGE_GET_FREELISTS(pp, vp, off, seg, vaddr, size, flags, lgrp)	\
	pp = ufltp->pflt_get_free(vp, off, seg, vaddr, size, flags, lgrp);
394 
/*
 * Accessors for the per freelist type hooks.  fp is a pointer to a
 * page_freelist_type_t; it is parenthesized before "->" so that an
 * expression argument (e.g. "&flt" or "tab + 1") selects the member of
 * the intended object.
 */

/* i-th entry of fp's pflt_policy[] table */
#define	PAGE_GET_FREELISTS_POLICY(fp, i) 				\
	((fp)->pflt_policy[i])

/* invoke fp's pflt_walk_init hook */
#define	PAGE_LIST_WALK_INIT(fp, szc, flags, bin, can_split, use_ceq, plw) \
	(fp)->pflt_walk_init(szc, flags, bin, can_split, use_ceq, plw)

/* invoke fp's pflt_walk_next hook and yield its result */
#define	PAGE_LIST_WALK_NEXT(fp, szc, bin, plw) 				\
	(fp)->pflt_walk_next(szc, bin, plw)
403 
404 typedef	char	hpmctr_t;
405 
406 #ifdef DEBUG
407 #define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
408 extern void	chk_lpg(page_t *, uchar_t);
409 #else
410 #define	CHK_LPG(pp, szc)
411 #endif
412 
413 /*
414  * page list count per mnode and type.
415  */
416 typedef	struct {
417 	pgcnt_t	plc_mt_pgmax;		/* max page cnt */
418 	pgcnt_t plc_mt_clpgcnt;		/* cache list cnt */
419 	pgcnt_t plc_mt_flpgcnt;		/* free list cnt - small pages */
420 	pgcnt_t plc_mt_lgpgcnt;		/* free list cnt - large pages */
421 #ifdef DEBUG
422 	struct {
423 		pgcnt_t plc_mts_pgcnt;	/* per page size count */
424 		int	plc_mts_colors;
425 		pgcnt_t	*plc_mtsc_pgcnt; /* per color bin count */
426 	} plc_mts[MMU_PAGE_SIZES];
427 #endif
428 } plcnt_t[MAX_MEM_NODES][MAX_MEM_TYPES];
429 
430 #ifdef DEBUG
431 
/*
 * Add to ctrs_sz the bytes needed for the per color bin counters: for
 * every page size, one pgcnt_t per color for each (mnode, mtype) pair.
 */
#define	PLCNT_SZ(ctrs_sz) {						\
	int	_szc;							\
	for (_szc = 0; _szc < mmu_page_sizes; _szc++) {			\
		int	_ncolors = page_get_pagecolors(_szc);		\
		ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *		\
		    _ncolors * sizeof (pgcnt_t));			\
	}								\
}

/*
 * Carve the per color bin counter arrays out of the memory at base.
 * For each page size, then each mnode, then each mtype, the color
 * count is recorded, plc_mtsc_pgcnt is pointed at base, and base is
 * advanced past that array.  On completion base has moved by the
 * amount PLCNT_SZ() computes.
 */
#define	PLCNT_INIT(base) {						\
	int	_mn, _mt, _szc, _ncolors;				\
	for (_szc = 0; _szc < mmu_page_sizes; _szc++) {			\
		_ncolors = page_get_pagecolors(_szc);			\
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		\
			for (_mt = 0; _mt < MAX_MEM_TYPES; _mt++) {	\
				plcnt[_mn][_mt].plc_mts[_szc].		\
				    plc_mts_colors = _ncolors;		\
				plcnt[_mn][_mt].plc_mts[_szc].		\
				    plc_mtsc_pgcnt = (pgcnt_t *)base;	\
				base += (_ncolors * sizeof (pgcnt_t));	\
			}						\
		}							\
	}								\
}
456 
/*
 * Add cnt (which may be negative) to the page list counters of
 * (mn, mtype).  Exactly one of the cache list, large page free list or
 * small page free list totals is updated, chosen by PG_CACHE_LIST in
 * flags and then by szc being non-zero.  This variant also updates the
 * per page size count and the count of the page's color bin.
 *
 * flags is parenthesized so that an expression argument (e.g. "a | b")
 * is tested against PG_CACHE_LIST as a whole.
 */
#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	int	bin = PP_2_BIN(pp);					\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].plc_mts_pgcnt,	\
	    cnt);							\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].			\
	    plc_mtsc_pgcnt[bin], cnt);					\
}
470 
471 #else
472 
473 #define	PLCNT_SZ(ctrs_sz)
474 
475 #define	PLCNT_INIT(base)
476 
/*
 * Add cnt (which may be negative) to the page list counters of
 * (mn, mtype).  Exactly one total is updated: the cache list count when
 * PG_CACHE_LIST is set in flags, else the large page free list count
 * when szc is non-zero, else the small page free list count.
 *
 * PG_FREE_LIST may not be explicitly set in flags for large pages,
 * which is why only PG_CACHE_LIST is tested.  flags is parenthesized so
 * that an expression argument (e.g. "a | b") is tested as a whole.
 */

#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
}
487 
488 #endif
489 
/*
 * Credit the page list counters with the number of base pages in a
 * page of size code szc (1 << PAGE_BSZS_SHIFT(szc)).
 */
#define	PLCNT_INCR(pp, mn, mtype, szc, flags) {				\
	long	cnt = (1L << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, mtype, szc, cnt, flags);			\
}

/*
 * Debit the page list counters by the same amount.  The count is formed
 * by negating the positive shift result; left-shifting -1 directly
 * would be undefined behavior in C.
 */
#define	PLCNT_DECR(pp, mn, mtype, szc, flags) {				\
	long	cnt = -(1L << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, mtype, szc, cnt, flags);			\
}
499 
/*
 * Update the page list max counts when pages are transferred from the
 * RELOC to the NORELOC mtype (kcage_init or kcage_assimilate_page):
 * the number of base pages spanned by pp, per its p_szc, is added to
 * the NORELOC max of pp's memory node and taken from the RELOC max.
 */

#define	PLCNT_XFER_NORELOC(pp) {					\
	long	_npgs = (1 << PAGE_BSZS_SHIFT((pp)->p_szc));		\
	int	_node = PP_2_MEM_NODE(pp);				\
	atomic_add_long(&plcnt[_node][MTYPE_NORELOC].plc_mt_pgmax,	\
	    _npgs);							\
	atomic_add_long(&plcnt[_node][MTYPE_RELOC].plc_mt_pgmax,	\
	    -_npgs);							\
}
511 
/*
 * Adjust the RELOC page list max counts for the pfn range starting at
 * pfn and spanning |cnt| pages; a negative cnt subtracts.  Used when
 * memory is added to the page lists during startup (add_physmem) or by
 * a DR operation that adds (kphysm_add_memory_dynamic) or deletes
 * (kphysm_del_cleanup) memory.
 *
 * If the weak symbol plat_mem_node_intersect_range is absent, the range
 * is walked node by node using PFN_2_MEM_NODE() and each node's physmax
 * to find how many pages fall in it.  Otherwise every mnode is asked
 * how many pages of the range it holds and only non-zero answers are
 * applied.
 */
#define	PLCNT_MODIFY_MAX(pfn, cnt) {					       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	pgcnt_t _acnt = ABS(_cnt);					       \
	int _mn;							       \
	pgcnt_t _np;							       \
	if (&plat_mem_node_intersect_range == NULL) {			       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _acnt;				       \
		while (_pfn < _endpfn) {				       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			_pfn += _np;					       \
			atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
			    (_cnt < 0) ? -_np : _np);			       \
		}							       \
	} else {							       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
			if (_np != 0) {					       \
				atomic_add_long(			       \
				    &plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax,     \
				    (_cnt < 0) ? -_np : _np);		       \
			}						       \
		}							       \
	}								       \
}
544 
/*
 * Call page_ctrs_adjust() for each memory node that holds part of the
 * pfn range [pfn, pfn + cnt) when memory is added during a DR
 * operation.  rv receives the result of each call; the walk stops at
 * the first non-zero result.
 *
 * If the weak symbol plat_mem_node_intersect_range is absent, the range
 * is walked node by node using PFN_2_MEM_NODE() and each node's
 * physmax.  Otherwise every mnode is asked how many pages of the range
 * it holds and those reporting none are skipped.
 */
#define	PAGE_CTRS_ADJUST(pfn, cnt, rv) {				       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	int _mn;							       \
	pgcnt_t _np;							       \
	if (&plat_mem_node_intersect_range == NULL) {			       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _cnt;				       \
		while (_pfn < _endpfn) {				       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			_pfn += _np;					       \
			if ((rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	} else {							       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _cnt, _mn, &_np); \
			if (_np != 0 &&					       \
			    (rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	}								       \
}
574 
575 extern plcnt_t	plcnt;
576 
/*
 * Pages currently counted on the lists of (mn, mtype): cache list plus
 * small page free list plus large page free list.
 */
#define	MNODETYPE_PGCNT(mn, mtype)					\
	(plcnt[mn][mtype].plc_mt_clpgcnt +				\
	    plcnt[mn][mtype].plc_mt_flpgcnt +				\
	    plcnt[mn][mtype].plc_mt_lgpgcnt)

/* The same total for mn, summed over both memory types. */
#define	MNODE_PGCNT(mn)							\
	(MNODETYPE_PGCNT(mn, MTYPE_RELOC) +				\
	    MNODETYPE_PGCNT(mn, MTYPE_NORELOC))
589 
/*
 * Macros to loop through the mtype range.
 *
 * MTYPE_START leaves mtype alone when (mnode, mtype) has a non-zero
 * plc_mt_pgmax.  Otherwise it moves on via MTYPE_NEXT, which may set
 * mtype to -1 when there is nothing further to try.
 */
#define	MTYPE_START(mnode, mtype, flags) {				\
	if (plcnt[mnode][mtype].plc_mt_pgmax == 0) {			\
		ASSERT((mtype) == MTYPE_RELOC ||			\
		    MNODETYPE_PGCNT(mnode, mtype) == 0 ||		\
		    plcnt[mnode][mtype].plc_mt_pgmax != 0);		\
		MTYPE_NEXT(mnode, mtype, flags);			\
	}								\
}

/*
 * If allocation from the RELOC pool failed and there is sufficient cage
 * memory (kcage_freemem >= kcage_lotsfree), attempt to allocate from
 * the NORELOC pool: mtype becomes MTYPE_NORELOC and PG_NORELOC is set
 * in flags.  mtype becomes -1 instead when flags already carries any of
 * PG_NORELOC, PGI_NOCAGE or PGI_RELOCONLY, when cage memory is short,
 * or when the mnode's NORELOC plc_mt_pgmax is zero.
 */
#define	MTYPE_NEXT(mnode, mtype, flags) { 				\
	if ((flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) ||	\
	    (kcage_freemem < kcage_lotsfree)) {				\
		mtype = -1;						\
	} else if (plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax == 0) {	\
		ASSERT(MNODETYPE_PGCNT(mnode, MTYPE_NORELOC) == 0 ||	\
		    plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax != 0);	\
		mtype = -1;						\
	} else {							\
		mtype = MTYPE_NORELOC;					\
		flags |= PG_NORELOC;					\
	}								\
}
622 
623 /*
624  * get the ecache setsize for the current cpu.
625  */
626 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
627 
extern struct cpu	cpu0;
/*
 * Pointer to cpu0.  The expansion is parenthesized so that it can be
 * followed directly by "->" or used inside larger expressions.
 */
#define	CPU0		(&cpu0)
630 
631 #define	PAGE_BSZS_SHIFT(szc)	TTE_BSZS_SHIFT(szc)
632 /*
633  * For sfmmu each larger page is 8 times the size of the previous
634  * size page.
635  */
636 #define	FULL_REGION_CNT(rg_szc)	(8)
637 
638 /*
639  * The counter base must be per page_counter element to prevent
640  * races when re-indexing, and the base page size element should
641  * be aligned on a boundary of the given region size.
642  *
643  * We also round up the number of pages spanned by the counters
644  * for a given region to PC_BASE_ALIGN in certain situations to simplify
645  * the coding for some non-performance critical routines.
646  */
647 #define	PC_BASE_ALIGN		((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1))
648 #define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)
649 
650 extern int ecache_alignsize;
651 #define	L2CACHE_ALIGN		ecache_alignsize
652 #define	L2CACHE_ALIGN_MAX	512
653 
654 extern int update_proc_pgcolorbase_after_fork;
655 extern int consistent_coloring;
656 extern uint_t vac_colors_mask;
657 extern int vac_size;
658 extern int vac_shift;
659 
660 /*
661  * Kernel mem segment in 64-bit space
662  */
663 extern caddr_t kmem64_base, kmem64_end, kmem64_aligned_end;
664 extern int kmem64_alignsize, kmem64_szc;
665 extern uint64_t kmem64_pabase;
666 extern int max_bootlp_tteszc;
667 
668 /*
669  * Maximum and default values for user heap, stack, private and shared
670  * anonymous memory, and user text and initialized data.
671  *
672  * Initial values are defined in architecture specific mach_vm_dep.c file.
673  * Used by map_pgsz*() routines.
674  */
675 extern size_t max_uheap_lpsize;
676 extern size_t default_uheap_lpsize;
677 extern size_t max_ustack_lpsize;
678 extern size_t default_ustack_lpsize;
679 extern size_t max_privmap_lpsize;
680 extern size_t max_uidata_lpsize;
681 extern size_t max_utext_lpsize;
682 extern size_t max_shm_lpsize;
683 
684 /*
685  * For adjusting the default lpsize, for DTLB-limited page sizes.
686  */
687 extern void adjust_data_maxlpsize(size_t ismpagesize);
688 
689 /*
690  * Sanity control. Don't use large pages regardless of user
691  * settings if there's less than priv or shm_lpg_min_physmem memory installed.
692  * The units for this variable are 8K pages.
693  */
694 extern pgcnt_t privm_lpg_min_physmem;
695 extern pgcnt_t shm_lpg_min_physmem;
696 
/*
 * AS_2_BIN macro controls the page coloring policy, selected by
 * consistent_coloring:
 * 0 (default) uses various vaddr bits
 * 1 virtual=paddr
 * 2 bin hopping
 * Any other value logs a warning and is handled as policy 0.
 *
 * The chosen bin is stored in bin, which must therefore be an lvalue;
 * the trailing ASSERT checks it against page_get_pagecolors(szc).
 * kflt is accepted but not referenced by this implementation.
 *
 * addr, seg and vp are parenthesized where they meet a cast, "->" or a
 * comparison so that expression arguments are evaluated as a whole.
 */
#define	AS_2_BIN(kflt, as, seg, vp, addr, bin, szc)			\
switch (consistent_coloring) {						\
	default:                                                        \
		cmn_err(CE_WARN,					\
			"AS_2_BIN: bad consistent coloring value");	\
		/* assume default algorithm -> continue */		\
		/* FALLTHROUGH */					\
	case 0: {                                                       \
		int slew = 0;						\
		pfn_t pfn;                                              \
                                                                        \
		/* only swap-backed segvn mappings get the per-as slew */ \
		if ((vp) != NULL && IS_SWAPVP(vp) &&			\
		    (seg)->s_ops == &segvn_ops)				\
			slew = as_color_bin(as);			\
                                                                        \
		pfn = ((uintptr_t)(addr) >> MMU_PAGESHIFT) +		\
			(((uintptr_t)(addr) >> page_coloring_shift) <<	\
			(vac_shift - MMU_PAGESHIFT));			\
		if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) {	\
			pfn += slew;					\
			bin = PFN_2_COLOR(pfn, szc, NULL);		\
		} else {						\
			bin = PFN_2_COLOR(pfn, szc, NULL);		\
			bin += slew >> (vac_shift - MMU_PAGESHIFT);	\
			bin &= hw_page_array[(szc)].hp_colors - 1;	\
		}							\
		break;                                                  \
	}                                                               \
	case 1:                                                         \
		bin = PFN_2_COLOR(((uintptr_t)(addr) >> MMU_PAGESHIFT),	\
		    szc, NULL);						\
		break;                                                  \
	case 2: {                                                       \
		int cnt = as_color_bin(as);				\
		uint_t color_mask = page_get_pagecolors(0) - 1;		\
                                                                        \
		/* make sure physical color aligns with vac color */	\
		while ((cnt & vac_colors_mask) !=			\
		    addr_to_vcolor(addr)) {				\
			cnt++;						\
		}                                                       \
		bin = cnt = cnt & color_mask;			        \
		bin >>= PAGE_GET_COLOR_SHIFT(0, szc);                   \
		/* update per as page coloring fields */		\
		cnt = (cnt + 1) & color_mask;			        \
		if (cnt == (as_color_start(as) & color_mask)) {	        \
			cnt = as_color_start(as) = as_color_start(as) + \
				PGCLR_LOOPFACTOR;			\
		}                                                       \
		as_color_bin(as) = cnt & color_mask;		        \
		break;                                                  \
	}								\
}									\
	ASSERT((bin) < page_get_pagecolors(szc));
757 
758 /*
759  * cpu private vm data - accessed thru CPU->cpu_vm_data
760  *	vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
761  *	vc_pnext_memseg: tracks last memseg visited in page_nextn()
762  *	vc_kmptr: unaligned kmem pointer for this vm_cpu_data_t
763  *	vc_kmsize: original kmem size for this vm_cpu_data_t
764  */
765 
766 typedef struct {
767 	struct memseg	*vc_pnum_memseg;
768 	struct memseg	*vc_pnext_memseg;
769 	void		*vc_kmptr;
770 	size_t		vc_kmsize;
771 } vm_cpu_data_t;
772 
773 /* allocation size to ensure vm_cpu_data_t resides in its own cache line */
774 #define	VM_CPU_DATA_PADSIZE						\
775 	(P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))
776 
777 /* for boot cpu before kmem is initialized */
778 extern char	vm_cpu_data0[];
779 
780 /*
781  * Function to get an ecache color bin: F(as, cnt, vcolor).
782  * the goal of this function is to:
783  * - to spread a processes' physical pages across the entire ecache to
784  *	maximize its use.
785  * - to minimize vac flushes caused when we reuse a physical page on a
786  *	different vac color than it was previously used.
787  * - to prevent all processes from using the exact same colors and thrashing
788  *	each other.
789  *
790  * cnt is a bin ptr kept on a per as basis.  As we page_create we increment
791  * the ptr so we spread out the physical pages to cover the entire ecache.
792  * The virtual color is made a subset of the physical color in order to
793  * minimize virtual cache flushing.
794  * We add in the as to spread out different as.	 This happens when we
795  * initialize the start count value.
796  * sizeof(struct as) is 60 so we shift by 3 to get into the bit range
797  * that will tend to change.  For example, on spitfire based machines
798  * (vcshft == 1) contiguous as are spread by ~6 bins.
799  * vcshft provides for proper virtual color alignment.
800  * In theory cnt should be updated using cas only but if we are off by one
801  * or 2 it is no big deal.
802  * We also keep a start value which is used to randomize on what bin we
803  * start counting when it is time to start another loop. This avoids
804  * contiguous allocations of ecache size to point to the same bin.
805  * Why 3? Seems to work ok. Better than 7 or anything larger.
806  */
807 #define	PGCLR_LOOPFACTOR 3
808 
809 /*
810  * When a bin is empty, and we can't satisfy a color request correctly,
811  * we scan.  If we assume that the programs have reasonable spatial
812  * behavior, then it will not be a good idea to use the adjacent color.
813  * Using the adjacent color would result in virtually adjacent addresses
814  * mapping into the same spot in the cache.  So, if we stumble across
815  * an empty bin, skip a bunch before looking.  After the first skip,
816  * then just look one bin at a time so we don't miss our cache on
817  * every look. Be sure to check every bin.  Page_create() will panic
818  * if we miss a page.
819  *
820  * This also explains the `<=' in the for loops in both page_get_freelist()
821  * and page_get_cachelist().  Since we checked the target bin, skipped
822  * a bunch, then continued one at a time, we wind up checking the target bin
823  * twice to make sure we get all of the bins.
824  */
/* NOTE(review): presumably the size of that first skip, in bins -- confirm */
825 #define	BIN_STEP	20
826 
827 #ifdef VM_STATS
/*
 * VM statistics counters, declared only when VM_STATS is defined.
 * Every member is a ulong_t or an array of ulong_t.  The trailing comment
 * on the first member of a group names the routine(s) that group belongs
 * to.  The array dimensions (MMU_PAGE_SIZES, MAX_PFLT_TYPE and
 * MAX_MNODE_MRANGES) are defined elsewhere, as is the single instance
 * vmm_vmstats declared after the struct.
 * NOTE(review): arrays dimensioned by MMU_PAGE_SIZES are presumably indexed
 * by page size code -- confirm against the code that updates them.
 */
828 struct vmm_vmstats_str {
829 				/* page_get_uflt and page_get_kflt */
830 	ulong_t pgf_alloc[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
831 	ulong_t pgf_allocok[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
832 	ulong_t pgf_allocokrem[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
833 	ulong_t pgf_allocfailed[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
834 	ulong_t	pgf_allocdeferred;
835 	ulong_t	pgf_allocretry[MMU_PAGE_SIZES][MAX_PFLT_TYPE];
836 	ulong_t pgik_allocok;			/* page_import_kflt */
837 	ulong_t pgik_allocfailed;
838 	ulong_t pgkx_allocok;			/* kflt_expand */
839 	ulong_t pgkx_allocfailed;
840 	ulong_t puak_allocok;			/* page_user_alloc_kflt */
841 	ulong_t puak_allocfailed;
842 	ulong_t pgexportok;			/* kflt_export */
843 	ulong_t pgexportfail;
844 	ulong_t pgkflt_disable;			/* kflt_user_evict */
845 	ulong_t pgc_alloc;			/* page_get_cachelist */
846 	ulong_t pgc_allocok;
847 	ulong_t pgc_allocokrem;
848 	ulong_t pgc_allocokdeferred;
849 	ulong_t pgc_allocfailed;
850 	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];	/* page_get_contig_pages */
851 	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
852 	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
853 	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];
854 	ulong_t	ptcp[MMU_PAGE_SIZES];		/* page_trylock_contig_pages */
855 	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
856 	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
857 	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
858 	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
859 	ulong_t	ptcpfailkflt[MMU_PAGE_SIZES];
860 	ulong_t	ptcpok[MMU_PAGE_SIZES];
861 	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];	/* page_get_mnode_freelist */
862 	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
863 	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
864 	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];
865 	ulong_t	pgmc_alloc;			/* page_get_mnode_cachelist */
866 	ulong_t	pgmc_allocfailed;
867 	ulong_t	pgmc_allocempty;
868 	ulong_t	pgmc_allocok;
869 	ulong_t	pladd_free[MMU_PAGE_SIZES];	/* page_list_add/sub */
870 	ulong_t	plsub_free[MMU_PAGE_SIZES];
871 	ulong_t	pladd_cache;
872 	ulong_t	plsub_cache;
873 	ulong_t	plsubpages_szcbig;
874 	ulong_t	plsubpages_szc0;
875 	ulong_t	pfs_req[MMU_PAGE_SIZES];	/* page_freelist_split */
876 	ulong_t	pfs_demote[MMU_PAGE_SIZES];
877 	ulong_t	pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
878 	ulong_t	ppr_reloc[MMU_PAGE_SIZES];	/* page_relocate */
879 	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
880 	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
881 	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
882 	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
883 	ulong_t ppr_relocok[MMU_PAGE_SIZES];
884 	ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
885 	ulong_t ppr_copyfail;
886 	/* page coalesce counter */
887 	ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
888 	/* candidates useful */
889 	ulong_t page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
890 	/* ctrs changed after locking */
891 	ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
892 	/* page_freelist_coalesce failed */
893 	ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
894 	ulong_t page_ctrs_coalesce_all;	/* page coalesce all counter */
895 	ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */
896 };
897 extern struct vmm_vmstats_str vmm_vmstats;
898 #endif	/* VM_STATS */
899 
900 /*
901  * Used to hold off page relocations into the cage until OBP has completed
902  * its boot-time handoff of its resources to the kernel.
903  */
/* NOTE(review): defined elsewhere; presumably nonzero once ready -- confirm */
904 extern int page_relocate_ready;
905 
906 /*
907  * cpu/mmu-dependent vm variables, defined elsewhere; may be reset at bootup.
908  */
909 extern uint_t mmu_page_sizes;
910 extern uint_t max_mmu_page_sizes;
911 extern uint_t mmu_hashcnt;
912 extern uint_t max_mmu_hashcnt;
913 extern size_t mmu_ism_pagesize;
914 extern int mmu_exported_pagesize_mask;
915 extern uint_t mmu_exported_page_sizes;
916 extern uint_t szc_2_userszc[];	/* indexed by szc via SZC_2_USERSZC() */
917 extern uint_t userszc_2_szc[];	/* indexed by userszc via USERSZC_2_SZC() */
918 
/*
 * mmu_legacy_page_sizes is another name for mmu_exported_page_sizes.
 * USERSZC_2_SZC() and SZC_2_USERSZC() index userszc_2_szc[] and
 * szc_2_userszc[] directly, with no range check on the argument.
 */
919 #define	mmu_legacy_page_sizes	mmu_exported_page_sizes
920 #define	USERSZC_2_SZC(userszc)	(userszc_2_szc[userszc])
921 #define	SZC_2_USERSZC(szc)	(szc_2_userszc[szc])
922 
923 /*
924  * Platform specific page routines; the implementations are elsewhere.
925  */
926 extern void mach_page_add(page_t **, page_t *);
927 extern void mach_page_sub(page_t **, page_t *);
928 extern uint_t page_get_pagecolors(uint_t);
929 extern void ppcopy_kernel__relocatable(page_t *, page_t *);
/* ppcopy_kernel(p1, p2) expands to ppcopy_kernel__relocatable(p1, p2) */
930 #define	ppcopy_kernel(p1, p2)	ppcopy_kernel__relocatable(p1, p2)
931 
932 /*
933  * platform specific large pages for kernel heap support
 *
 * All of these are defined elsewhere.  mmu_init_kcontext() takes no
 * arguments and is declared with an explicit (void) parameter list so that
 * the declaration is a real prototype and calls are argument-checked.
934  */
935 extern size_t get_segkmem_lpsize(size_t lpsize);
936 extern size_t mmu_get_kernel_lpsize(size_t lpsize);
937 extern void mmu_init_kernel_pgsz(struct hat *hat);
938 extern void mmu_init_kcontext(void);
939 extern uint64_t kcontextreg;
940 
941 /*
942  * Nucleus data page allocator routines.  Each takes a struct memlist * as
 * its first argument; the implementations are elsewhere.
943  */
944 extern void ndata_alloc_init(struct memlist *, uintptr_t, uintptr_t);
945 extern void *ndata_alloc(struct memlist *, size_t, size_t);
946 extern void *ndata_extra_base(struct memlist *, size_t, caddr_t);
947 extern size_t ndata_maxsize(struct memlist *);
948 extern size_t ndata_spare(struct memlist *, size_t, size_t);
949 
950 #ifdef	__cplusplus
951 }
952 #endif
953 
954 #endif	/* _VM_DEP_H */
955