xref: /illumos-gate/usr/src/uts/sun4/vm/vm_dep.h (revision 5fbc1fe0da7f34cf8155bf7624c94583cc98e47c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 Joyent, Inc.
24  */
25 
26 /*
27  * UNIX machine dependent virtual memory support.
28  */
29 
30 #ifndef	_VM_DEP_H
31 #define	_VM_DEP_H
32 
33 #ifdef	__cplusplus
34 extern "C" {
35 #endif
36 
37 #include <vm/hat_sfmmu.h>
38 #include <sys/archsystm.h>
39 #include <sys/memnode.h>
40 
41 #define	GETTICK()	gettick()
42 
43 /* tick value that should be used for random values */
44 extern u_longlong_t randtick(void);
45 
46 /*
47  * Per page size free lists. Allocated dynamically.
48  */
49 #define	MAX_MEM_TYPES	2	/* 0 = reloc, 1 = noreloc */
50 #define	MTYPE_RELOC	0
51 #define	MTYPE_NORELOC	1
52 
53 #define	PP_2_MTYPE(pp)	(PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC)
54 
/*
 * Choose the memory type for an allocation: mtype is set to MTYPE_NORELOC
 * when PG_NORELOC is present in flags and to MTYPE_RELOC otherwise.  The
 * vp, vaddr and pgsz arguments are accepted but not used here.
 *
 * The expansion keeps its trailing semicolon so existing call sites are
 * unaffected; arguments are parenthesized so that an expression such as
 * (a | b) passed as flags is tested as a whole.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags, pgsz)			\
	(mtype) = ((flags) & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;
57 
/*
 * mtype init for page_get_replacement_page: MTYPE_NORELOC when PG_NORELOC
 * is set in flags, MTYPE_RELOC otherwise.  pp and pgcnt are not used here.
 * The trailing semicolon is part of the expansion (kept for existing
 * callers); flags is parenthesized so compound expressions are tested
 * as a whole.
 */
#define	MTYPE_PGR_INIT(mtype, flags, pp, pgcnt)			\
	(mtype) = ((flags) & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;
61 
/*
 * Load the pfn limits recorded in mem_node_config[] for mnode into pfnlo
 * (physbase) and pfnhi (physmax).  mtype is accepted but not used here.
 *
 * Both assignments live in one compound statement so that a caller's
 * unbraced if/for/while governs the pair rather than just the first.
 */
#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)			\
{									\
	(pfnlo) = mem_node_config[(mnode)].physbase;			\
	(pfnhi) = mem_node_config[(mnode)].physmax;			\
}
65 
66 /*
67  * candidate counters in vm_pagelist.c are indexed by color and range
68  */
69 #define	MAX_MNODE_MRANGES		MAX_MEM_TYPES
70 #define	MNODE_RANGE_CNT(mnode)		MAX_MNODE_MRANGES
71 #define	MNODE_MAX_MRANGE(mnode)		(MAX_MEM_TYPES - 1)
72 #define	MTYPE_2_MRANGE(mnode, mtype)	(mtype)
73 
74 /*
75  * Internal PG_ flags.
76  */
77 #define	PGI_RELOCONLY	0x10000	/* acts in the opposite sense to PG_NORELOC */
78 #define	PGI_NOCAGE	0x20000	/* indicates Cage is disabled */
79 #define	PGI_PGCPHIPRI	0x40000	/* page_get_contig_page priority allocation */
80 #define	PGI_PGCPSZC0	0x80000	/* relocate base pagesize page */
81 
82 /*
83  * PGI mtype flags - should not overlap PGI flags
84  */
85 #define	PGI_MT_RANGE	0x1000000	/* mtype range */
86 #define	PGI_MT_NEXT	0x2000000	/* get next mtype */
87 
88 extern page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
89 extern page_t ***page_cachelists[MAX_MEM_TYPES];
90 
/*
 * Select the list-head slot for the given color within
 * page_freelists[szc][mtype][mnode].  The result is an lvalue.
 */
#define	PAGE_FREELISTS(mnode, szc, color, mtype) \
	(page_freelists[szc][mtype][mnode][color])

/*
 * Select the list-head slot for the given color within
 * page_cachelists[mtype][mnode].  The result is an lvalue.
 */
#define	PAGE_CACHELISTS(mnode, color, mtype) \
	(page_cachelists[mtype][mnode][color])
96 
97 /*
98  * There are 'page_colors' colors/bins.  Spread them out under a
99  * couple of locks.  There are mutexes for both the page freelist
100  * and the page cachelist.  We want enough locks to make contention
101  * reasonable, but not too many -- otherwise page_freelist_lock() gets
102  * so expensive that it becomes the bottleneck!
103  */
104 #define	NPC_MUTEX	16
105 
106 extern kmutex_t	*fpc_mutex[NPC_MUTEX];
107 extern kmutex_t	*cpc_mutex[NPC_MUTEX];
108 
/*
 * Iterator provides the info needed to convert RA to PA.
 * MEM_NODE_ITERATOR_INIT() should be called before
 * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous
 * PAGE_NEXT_PFN_FOR_COLOR() call. Iterator caches color-to-hash
 * translations, requiring an initializer call if color or ceq_mask changes,
 * even if pfn doesn't. MEM_NODE_ITERATOR_INIT() must also be called before
 * PFN_2_COLOR() that uses a valid iterator argument.
 *
 * plat_mem_node_iterator_init() starts from the last mblock in the
 * continuation case, which may be invalid because of memory DR.  To detect
 * this situation mi_genid is checked against mpo_genid, which is
 * incremented after a memory DR operation.  See also
 * plat_slice_add()/plat_slice_del().
 */
123 #ifdef	sun4v
124 
125 typedef struct mem_node_iterator {
126 	uint_t mi_mnode;		/* mnode in which to iterate */
127 	int mi_init;			/* set to 1 when first init */
128 	int mi_genid;			/* set/checked against mpo_genid */
129 	int mi_last_mblock;		/* last mblock visited */
130 	uint_t mi_hash_ceq_mask;	/* cached copy of ceq_mask */
131 	uint_t mi_hash_color;		/* cached copy of color */
132 	uint_t mi_mnode_mask;		/* number of mask bits */
133 	uint_t mi_mnode_pfn_shift;	/* mnode position in pfn */
134 	pfn_t mi_mblock_base;		/* first valid pfn in current mblock */
135 	pfn_t mi_mblock_end;		/* last valid pfn in current mblock */
136 	pfn_t mi_ra_to_pa;		/* ra adjustment for current mblock */
137 	pfn_t mi_mnode_pfn_mask;	/* mask to obtain mnode id bits */
138 } mem_node_iterator_t;
139 
140 #define	MEM_NODE_ITERATOR_DECL(it) \
141 	mem_node_iterator_t it
142 #define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it) \
143 	(pfn) = plat_mem_node_iterator_init((pfn), (mnode), (szc), (it), 1)
144 
145 extern pfn_t plat_mem_node_iterator_init(pfn_t, int, uchar_t,
146     mem_node_iterator_t *, int);
147 extern pfn_t plat_rapfn_to_papfn(pfn_t);
148 extern int interleaved_mnodes;
149 
150 #else	/* sun4v */
151 
152 #define	MEM_NODE_ITERATOR_DECL(it) \
153 	void *it = NULL
154 #define	MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it)
155 
156 #endif	/* sun4v */
157 
158 /*
159  * Return the mnode limits so that hpc_counters length and base
160  * index can be determined. When interleaved_mnodes is set, we
161  * create an array only for the first mnode that exists. All other
162  * mnodes will share the array in this case.
163  * If interleaved_mnodes is not set, simply return the limits for
164  * the given mnode.
165  */
166 #define	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first)		\
167 	if (!interleaved_mnodes) {					\
168 		(physbase) = mem_node_config[(mnode)].physbase;		\
169 		(physmax) = mem_node_config[(mnode)].physmax;		\
170 		(first) = (mnode);					\
171 	} else if ((first) < 0) {					\
172 		mem_node_max_range(&(physbase), &(physmax));		\
173 		(first) = (mnode);					\
174 	}
175 
/*
 * Enter page_ctrs_rwlock[] as RW_WRITER and then call
 * page_freelist_lock().  With interleaved_mnodes set this is done for
 * every index below max_mem_nodes, in ascending order; otherwise only
 * for mnode.
 */
#define	PAGE_CTRS_WRITE_LOCK(mnode)					\
	if (interleaved_mnodes) {					\
		/* changing shared hpm_counters */			\
		int _mn;						\
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		\
			rw_enter(&page_ctrs_rwlock[_mn], RW_WRITER);	\
			page_freelist_lock(_mn);			\
		}							\
	} else {							\
		rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER);	\
		page_freelist_lock(mnode);				\
	}
188 
/*
 * Call page_freelist_unlock() and then rw_exit() on page_ctrs_rwlock[].
 * With interleaved_mnodes set this is done for every index below
 * max_mem_nodes, in ascending order; otherwise only for mnode.
 */
#define	PAGE_CTRS_WRITE_UNLOCK(mnode)					\
	if (interleaved_mnodes) {					\
		int _mn;						\
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		\
			page_freelist_unlock(_mn);			\
			rw_exit(&page_ctrs_rwlock[_mn]);		\
		}							\
	} else {							\
		page_freelist_unlock(mnode);				\
		rw_exit(&page_ctrs_rwlock[(mnode)]);			\
	}
200 
201 /*
202  * cpu specific color conversion functions
203  */
204 extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t);
205 #pragma weak page_get_nsz_color_mask_cpu
206 
207 extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t);
208 #pragma weak page_get_nsz_color_cpu
209 
210 extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t);
211 #pragma weak page_get_color_shift_cpu
212 
213 extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t);
214 #pragma weak page_convert_color_cpu
215 
216 extern pfn_t page_next_pfn_for_color_cpu(pfn_t,
217     uchar_t, uint_t, uint_t, uint_t, void *);
218 #pragma weak page_next_pfn_for_color_cpu
219 
220 extern uint_t  page_pfn_2_color_cpu(pfn_t, uchar_t, void *);
221 #pragma weak page_pfn_2_color_cpu
222 
/*
 * Color shift between page sizes szc and nszc.  When the weak symbol
 * page_get_color_shift_cpu is not present the result is the difference
 * of the two hw_page_array[] hp_shift values; otherwise the cpu specific
 * routine is called.
 */
#define	PAGE_GET_COLOR_SHIFT(szc, nszc)				\
	((&page_get_color_shift_cpu == NULL) ?			\
	    (hw_page_array[(nszc)].hp_shift -			\
		hw_page_array[(szc)].hp_shift) :		\
	    page_get_color_shift_cpu(szc, nszc))

/*
 * Convert ncolor between page sizes szc and nszc.  Without a
 * page_convert_color_cpu implementation, ncolor is shifted left by
 * PAGE_GET_COLOR_SHIFT(szc, nszc).
 */
#define	PAGE_CONVERT_COLOR(ncolor, szc, nszc)			\
	((&page_convert_color_cpu == NULL) ?			\
	    ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc))) :	\
	    page_convert_color_cpu(ncolor, szc, nszc))
233 
/*
 * Color of pfn for page size szc.  If page_pfn_2_color_cpu (a weak
 * symbol) is present it is called with the iterator argument it;
 * otherwise the pfn is masked with the number of base page colors minus
 * one and shifted right by the hp_shift difference between szc and
 * size 0.
 *
 * pfn and szc are parenthesized so that expression arguments (for
 * example "a | b") are evaluated as a unit in the fallback path.
 */
#define	PFN_2_COLOR(pfn, szc, it)				\
	((&page_pfn_2_color_cpu != NULL) ?			\
	    page_pfn_2_color_cpu(pfn, szc, it) :		\
	    (((pfn) & (hw_page_array[0].hp_colors - 1)) >>	\
		(hw_page_array[(szc)].hp_shift -		\
		    hw_page_array[0].hp_shift)))
240 
241 #define	PNUM_SIZE(szc)							\
242 	(hw_page_array[(szc)].hp_pgcnt)
243 #define	PNUM_SHIFT(szc)							\
244 	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)
245 #define	PAGE_GET_SHIFT(szc)						\
246 	(hw_page_array[(szc)].hp_shift)
247 #define	PAGE_GET_PAGECOLORS(szc)					\
248 	(hw_page_array[(szc)].hp_colors)
249 
/*
 * This macro calculates the next sequential pfn with the specified
 * color using color equivalency mask.
 *
 * pfn is updated in place.  color must have no bits outside ceq_mask and
 * ceq_mask + 1 must be a power of two (both are ASSERTed).  When
 * page_next_pfn_for_color_cpu is present the work is delegated to it,
 * passing color_mask and it through; otherwise:
 *   - if the current pfn, shifted down by PAGE_BSZS_SHIFT(szc), already
 *     matches color under ceq_mask, pfn advances by one stride
 *     (ceq_mask + 1) of szc-sized pages;
 *   - if not, pfn becomes the first szc-aligned pfn above the current one
 *     whose masked bits equal color.
 *
 * Temporaries carry a leading underscore so they cannot capture
 * identifiers appearing in the caller's argument expressions.
 */
#define	PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it)   \
	{                                                                    \
		ASSERT(((color) & ~(ceq_mask)) == 0);                        \
		if (&page_next_pfn_for_color_cpu == NULL) {                  \
			uint_t	_pfn_shift = PAGE_BSZS_SHIFT(szc);           \
			pfn_t	_spfn = (pfn) >> _pfn_shift;                 \
			pfn_t	_stride = (ceq_mask) + 1;                    \
			ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0);        \
			if (((_spfn ^ (color)) & (ceq_mask)) == 0) {         \
				(pfn) += _stride << _pfn_shift;              \
			} else {                                             \
				(pfn) = (_spfn & ~(pfn_t)(ceq_mask)) |       \
				    (color);                                 \
				(pfn) = ((pfn) > _spfn ? (pfn) :             \
				    (pfn) + _stride) << _pfn_shift;          \
			}                                                    \
		} else {                                                     \
			(pfn) = page_next_pfn_for_color_cpu(pfn, szc, color, \
			    ceq_mask, color_mask, it);                       \
		}                                                            \
	}
274 
/*
 * Color equivalency mask for the next larger page size (szc + 1).
 * Uses page_get_nsz_color_mask_cpu when that weak symbol is present;
 * otherwise mask is shifted right by the page shift difference between
 * szc + 1 and szc.
 */
#define	PAGE_GET_NSZ_MASK(szc, mask)                                         \
	((&page_get_nsz_color_mask_cpu != NULL) ?                            \
	    page_get_nsz_color_mask_cpu(szc, mask) :                         \
	    ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))))

/*
 * Color for the next larger page size (szc + 1).  Uses
 * page_get_nsz_color_cpu when that weak symbol is present; otherwise
 * color is shifted right by the page shift difference between szc + 1
 * and szc.
 */
#define	PAGE_GET_NSZ_COLOR(szc, color)                                       \
	((&page_get_nsz_color_cpu != NULL) ?                                 \
	    page_get_nsz_color_cpu(szc, color) :                             \
	    ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))))
286 
/*
 * Find the bin for the given page if it was of size szc.  The color is
 * computed from pp's p_pagenum with (void *)(-1) passed as the iterator
 * argument of PFN_2_COLOR().
 */
#define	PP_2_BIN_SZC(pp, szc)	\
	(PFN_2_COLOR((pp)->p_pagenum, szc, (void *)(-1)))

/* Bin of pp at its current size code, p_szc. */
#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, (pp)->p_szc))

/* Memory node holding pp, derived from its p_pagenum. */
#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
293 
/*
 * Address of the mutex covering (mnode, bin): taken from fpc_mutex[] when
 * PG_FREE_LIST is set in flags, from cpc_mutex[] otherwise.  bin is
 * reduced modulo NPC_MUTEX by masking with NPC_MUTEX - 1.  flags is
 * parenthesized so compound flag expressions are tested as a whole.
 */
#define	PC_BIN_MUTEX(mnode, bin, flags) (((flags) & PG_FREE_LIST) ?	\
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)] :			\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)])

/* Address of entry mnode in the i'th fpc_mutex / cpc_mutex array. */
#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[(i)][(mnode)])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[(i)][(mnode)])
300 
/*
 * Round pfnum down to a multiple of (1 << PAGE_BSZS_SHIFT(szc)).
 * pfnum is parenthesized so expression arguments are masked as a whole.
 */
#define	PFN_BASE(pfnum, szc)	((pfnum) & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1))
302 
303 /*
304  * this structure is used for walking free page lists
305  * controls when to split large pages into smaller pages,
306  * and when to coalesce smaller pages into larger pages
307  */
308 typedef struct page_list_walker {
309 	uint_t	plw_colors;		/* num of colors for szc */
310 	uint_t  plw_color_mask;		/* colors-1 */
311 	uint_t	plw_bin_step;		/* next bin: 1 or 2 */
312 	uint_t  plw_count;		/* loop count */
313 	uint_t	plw_bin0;		/* starting bin */
314 	uint_t  plw_bin_marker;		/* bin after initial jump */
315 	uint_t  plw_bin_split_prev;	/* last bin we tried to split */
316 	uint_t  plw_do_split;		/* set if OK to split */
317 	uint_t  plw_split_next;		/* next bin to split */
318 	uint_t	plw_ceq_dif;		/* number of different color groups */
319 					/* to check */
320 	uint_t	plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */
321 	uint_t	plw_bins[MMU_PAGE_SIZES + 1];	/* num of bins */
322 } page_list_walker_t;
323 
324 void	page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin,
325     int can_split, int use_ceq, page_list_walker_t *plw);
326 
327 typedef	char	hpmctr_t;
328 
329 #ifdef DEBUG
330 #define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
331 extern void	chk_lpg(page_t *, uchar_t);
332 #else
333 #define	CHK_LPG(pp, szc)
334 #endif
335 
336 /*
337  * page list count per mnode and type.
338  */
339 typedef	struct {
340 	pgcnt_t	plc_mt_pgmax;		/* max page cnt */
341 	pgcnt_t plc_mt_clpgcnt;		/* cache list cnt */
342 	pgcnt_t plc_mt_flpgcnt;		/* free list cnt - small pages */
343 	pgcnt_t plc_mt_lgpgcnt;		/* free list cnt - large pages */
344 #ifdef DEBUG
345 	struct {
346 		pgcnt_t plc_mts_pgcnt;	/* per page size count */
347 		int	plc_mts_colors;
348 		pgcnt_t	*plc_mtsc_pgcnt; /* per color bin count */
349 	} plc_mts[MMU_PAGE_SIZES];
350 #endif
351 } plcnt_t[MAX_MEM_NODES][MAX_MEM_TYPES];
352 
353 #ifdef DEBUG
354 
/*
 * DEBUG only: add to ctrs_sz the bytes needed for one pgcnt_t per color,
 * for every page size below mmu_page_sizes, memory node below
 * max_mem_nodes and memory type.
 */
#define	PLCNT_SZ(ctrs_sz) {						\
	int	_szc = 0;						\
	while (_szc < mmu_page_sizes) {					\
		int	_ncolors = page_get_pagecolors(_szc);		\
		ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *		\
		    _ncolors * sizeof (pgcnt_t));			\
		_szc++;							\
	}								\
}
363 
/*
 * DEBUG only: hand out the per color counter arrays from the memory at
 * base.  For each page size, memory node and memory type (in that
 * nesting order) the plc_mts[] entry records the number of colors and
 * the current base as its plc_mtsc_pgcnt array, after which base is
 * advanced by colors * sizeof (pgcnt_t).
 */
#define	PLCNT_INIT(base) {						\
	int	_mn, _mt, _szc, _ncolors;				\
	for (_szc = 0; _szc < mmu_page_sizes; _szc++) {			\
		_ncolors = page_get_pagecolors(_szc);			\
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		\
			_mt = 0;					\
			while (_mt < MAX_MEM_TYPES) {			\
				plcnt[_mn][_mt].plc_mts[_szc].		\
				    plc_mtsc_pgcnt = (pgcnt_t *)base;	\
				plcnt[_mn][_mt].plc_mts[_szc].		\
				    plc_mts_colors = _ncolors;		\
				base += (_ncolors * sizeof (pgcnt_t));	\
				_mt++;					\
			}						\
		}							\
	}								\
}
379 
/*
 * DEBUG variant: add cnt, via atomic_add_long(), to the counters kept
 * for (mn, mtype).  Exactly one of the summary counters is updated:
 * the cache list count when PG_CACHE_LIST is set in flags, otherwise the
 * large page free count when szc is non-zero, otherwise the small page
 * free count.  The per page size count and the per color bin count
 * (bin taken from PP_2_BIN(pp)) for szc are always updated as well.
 *
 * flags is parenthesized so compound flag expressions are tested as a
 * whole; the temporary is underscore-prefixed to avoid capturing a
 * caller identifier.
 */
#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	int	_bin = PP_2_BIN(pp);					\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].plc_mts_pgcnt,	\
	    cnt);							\
	atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].			\
	    plc_mtsc_pgcnt[_bin], cnt);					\
}
393 
394 #else
395 
396 #define	PLCNT_SZ(ctrs_sz)
397 
398 #define	PLCNT_INIT(base)
399 
/*
 * Non-DEBUG variant: add cnt, via atomic_add_long(), to exactly one of
 * the counters kept for (mn, mtype): the cache list count when
 * PG_CACHE_LIST is set in flags, otherwise the large page free count when
 * szc is non-zero, otherwise the small page free count.  pp is unused.
 *
 * PG_FREE_LIST may not be explicitly set in flags for large pages, which
 * is why only PG_CACHE_LIST is tested.  flags is parenthesized so
 * compound flag expressions are tested as a whole.
 */

#define	PLCNT_DO(pp, mn, mtype, szc, cnt, flags) {			\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt);	\
	else if (szc)							\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt);	\
	else								\
		atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt);	\
}
410 
411 #endif
412 
/*
 * Add the number of base pages in a page of size szc, that is
 * 1 << PAGE_BSZS_SHIFT(szc), to the page list counters through
 * PLCNT_DO().  The shift is done in long so it is not limited by the
 * width of int.
 */
#define	PLCNT_INCR(pp, mn, mtype, szc, flags) {				\
	long	cnt = (1L << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, mtype, szc, cnt, flags);			\
}

/*
 * Subtract the number of base pages in a page of size szc from the page
 * list counters through PLCNT_DO().  The delta is formed directly as a
 * negative long rather than by converting a shifted ULONG_MAX.
 */
#define	PLCNT_DECR(pp, mn, mtype, szc, flags) {				\
	long	cnt = -(1L << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, mtype, szc, cnt, flags);			\
}
422 
/*
 * Update the page list max counts when pages are transferred from the
 * RELOC to the NORELOC mtype (kcage_init or kcage_assimilate_page).
 *
 * The base page count of pp, 1 << PAGE_BSZS_SHIFT(pp->p_szc), is added
 * to the NORELOC plc_mt_pgmax of pp's memory node and subtracted from
 * the RELOC one.
 */

#define	PLCNT_XFER_NORELOC(pp) {					\
	long	_npgs = (1 << PAGE_BSZS_SHIFT((pp)->p_szc));		\
	int	_node = PP_2_MEM_NODE(pp);				\
	atomic_add_long(&plcnt[_node][MTYPE_NORELOC].plc_mt_pgmax,	\
	    _npgs);							\
	atomic_add_long(&plcnt[_node][MTYPE_RELOC].plc_mt_pgmax,	\
	    -_npgs);							\
}
434 
/*
 * Modify the page list max counts when memory is added to the page
 * lists during startup (add_physmem) or during a DR operation when
 * memory is added (kphysm_add_memory_dynamic) or deleted
 * (kphysm_del_cleanup).
 *
 * cnt is signed: its magnitude is the number of pages starting at pfn and
 * its sign selects whether the RELOC plc_mt_pgmax of each affected memory
 * node is raised or lowered.  If plat_mem_node_intersect_range is not
 * present, the range is walked node by node using PFN_2_MEM_NODE() and
 * each node's physmax; otherwise every node below max_mem_nodes is asked
 * how many pages of the range it holds.
 */
#define	PLCNT_MODIFY_MAX(pfn, cnt) {					       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	pgcnt_t _acnt = ABS(_cnt);					       \
	int _mn;							       \
	pgcnt_t _np;							       \
	if (&plat_mem_node_intersect_range == NULL) {			       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _acnt;				       \
		for (; _pfn < _endpfn; _pfn += _np) {			       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \
			    (_cnt < 0) ? -_np : _np);			       \
		}							       \
	} else {							       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\
			if (_np != 0) {					       \
				atomic_add_long(			       \
				    &plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax,     \
				    (_cnt < 0) ? -_np : _np);		       \
			}						       \
		}							       \
	}								       \
}
467 
/*
 * Call page_ctrs_adjust() when memory is added during a DR operation.
 *
 * rv is first cleared, then page_ctrs_adjust() is called for each memory
 * node that holds part of the cnt pages starting at pfn.  The first
 * non-zero return value is left in rv and ends the walk.  Nodes are found
 * through plat_mem_node_intersect_range() when that symbol is present,
 * and otherwise by stepping through the range with PFN_2_MEM_NODE() and
 * each node's physmax.
 */
#define	PAGE_CTRS_ADJUST(pfn, cnt, rv) {				       \
	spgcnt_t _cnt = (spgcnt_t)(cnt);				       \
	int _mn;							       \
	pgcnt_t _np;							       \
	rv = 0;								       \
	if (&plat_mem_node_intersect_range == NULL) {			       \
		pfn_t _pfn = (pfn);					       \
		pfn_t _endpfn = _pfn + _cnt;				       \
		for (; _pfn < _endpfn; _pfn += _np) {			       \
			_mn = PFN_2_MEM_NODE(_pfn);			       \
			_np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \
			    _pfn;					       \
			if ((rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	} else {							       \
		for (_mn = 0; _mn < max_mem_nodes; _mn++) {		       \
			plat_mem_node_intersect_range((pfn), _cnt, _mn, &_np); \
			if (_np != 0 &&					       \
			    (rv = page_ctrs_adjust(_mn)) != 0)		       \
				break;					       \
		}							       \
	}								       \
}
498 
499 extern plcnt_t	plcnt;
500 
/*
 * Sum of the cache list, small page free list and large page free list
 * counts recorded in plcnt[] for memory node mn and memory type mtype.
 */
#define	MNODETYPE_PGCNT(mn, mtype)					\
	(plcnt[mn][mtype].plc_mt_lgpgcnt +				\
	    plcnt[mn][mtype].plc_mt_flpgcnt +				\
	    plcnt[mn][mtype].plc_mt_clpgcnt)

/*
 * The same sum for memory node mn taken over both memory types,
 * MTYPE_RELOC and MTYPE_NORELOC.
 */
#define	MNODE_PGCNT(mn)							\
	(MNODETYPE_PGCNT(mn, MTYPE_RELOC) +				\
	    MNODETYPE_PGCNT(mn, MTYPE_NORELOC))
513 
/*
 * Macros to loop through the mtype range.  MTYPE_START leaves mtype
 * alone when plcnt[mnode][mtype].plc_mt_pgmax is non-zero; when it is
 * zero the choice is handed to MTYPE_NEXT, which may move on to the NEXT
 * mtype or return -1 in mtype if there is nothing to try.
 */
#define	MTYPE_START(mnode, mtype, flags) {				\
	if (!plcnt[mnode][mtype].plc_mt_pgmax) {			\
		ASSERT(MNODETYPE_PGCNT(mnode, mtype) == 0 ||		\
		    mtype == MTYPE_RELOC ||				\
		    plcnt[mnode][mtype].plc_mt_pgmax != 0);		\
		MTYPE_NEXT(mnode, mtype, flags);			\
	}								\
}
526 
/*
 * If allocation from the RELOC pool failed and there is sufficient cage
 * memory, attempt to allocate from the NORELOC pool.
 *
 * mtype becomes MTYPE_NORELOC (and PG_NORELOC is or'ed into flags) only
 * when none of PG_NORELOC, PGI_NOCAGE and PGI_RELOCONLY is set in flags,
 * kcage_freemem is at least kcage_lotsfree, and the NORELOC plc_mt_pgmax
 * of mnode is non-zero.  In every other case mtype is set to -1.
 */
#define	MTYPE_NEXT(mnode, mtype, flags) {				\
	if ((flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) ||	\
	    (kcage_freemem < kcage_lotsfree)) {				\
		mtype = -1;						\
	} else if (plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax == 0) {	\
		ASSERT(MNODETYPE_PGCNT(mnode, MTYPE_NORELOC) == 0 ||	\
		    plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax != 0);	\
		mtype = -1;						\
	} else {							\
		mtype = MTYPE_NORELOC;					\
		flags |= PG_NORELOC;					\
	}								\
}
546 
547 /*
548  * get the ecache setsize for the current cpu.
549  */
550 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
551 
/*
 * CPU0 is the address of cpu0.  The expansion is parenthesized so that
 * it behaves as a single operand wherever it is used, e.g. CPU0->field.
 */
extern struct cpu	cpu0;
#define	CPU0		(&cpu0)
554 
555 #define	PAGE_BSZS_SHIFT(szc)	TTE_BSZS_SHIFT(szc)
556 /*
557  * For sfmmu each larger page is 8 times the size of the previous
558  * size page.
559  */
560 #define	FULL_REGION_CNT(rg_szc)	(8)
561 
562 /*
563  * The counter base must be per page_counter element to prevent
564  * races when re-indexing, and the base page size element should
565  * be aligned on a boundary of the given region size.
566  *
567  * We also round up the number of pages spanned by the counters
568  * for a given region to PC_BASE_ALIGN in certain situations to simplify
569  * the coding for some non-performance critical routines.
570  */
571 #define	PC_BASE_ALIGN		((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1))
572 #define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)
573 
574 extern int ecache_alignsize;
575 #define	L2CACHE_ALIGN		ecache_alignsize
576 #define	L2CACHE_ALIGN_MAX	512
577 
578 extern int update_proc_pgcolorbase_after_fork;
579 extern int consistent_coloring;
580 extern uint_t vac_colors_mask;
581 extern int vac_size;
582 extern int vac_shift;
583 
584 /*
585  * Kernel mem segment in 64-bit space
586  */
587 extern caddr_t kmem64_base, kmem64_end, kmem64_aligned_end;
588 extern int kmem64_alignsize, kmem64_szc;
589 extern uint64_t kmem64_pabase;
590 extern int max_bootlp_tteszc;
591 
592 /*
593  * Maximum and default values for user heap, stack, private and shared
594  * anonymous memory, and user text and initialized data.
595  *
596  * Initial values are defined in architecture specific mach_vm_dep.c file.
597  * Used by map_pgsz*() routines.
598  */
599 extern size_t max_uheap_lpsize;
600 extern size_t default_uheap_lpsize;
601 extern size_t max_ustack_lpsize;
602 extern size_t default_ustack_lpsize;
603 extern size_t max_privmap_lpsize;
604 extern size_t max_uidata_lpsize;
605 extern size_t max_utext_lpsize;
606 extern size_t max_shm_lpsize;
607 
608 /*
609  * For adjusting the default lpsize, for DTLB-limited page sizes.
610  */
611 extern void adjust_data_maxlpsize(size_t ismpagesize);
612 
613 /*
614  * Sanity control. Don't use large pages regardless of user
615  * settings if there's less than priv or shm_lpg_min_physmem memory installed.
616  * The units for this variable are 8K pages.
617  */
618 extern pgcnt_t privm_lpg_min_physmem;
619 extern pgcnt_t shm_lpg_min_physmem;
620 
/*
 * AS_2_BIN macro controls the page coloring policy.
 * 0 (default) uses various vaddr bits
 * 1 virtual=paddr
 * 2 bin hopping
 *
 * The policy is selected by consistent_coloring; any other value logs a
 * warning and falls through to policy 0.  The chosen color is stored in
 * bin, which is then ASSERTed to be below page_get_pagecolors(szc).
 * Policy 2 also updates as_color_bin(as) and, when the bin counter wraps
 * around to the start value, advances as_color_start(as) by
 * PGCLR_LOOPFACTOR.
 *
 * The whole expansion, including the trailing ASSERT, is one compound
 * statement so that a caller's unbraced if/else governs all of it.
 * Address, segment and vnode arguments are parenthesized so that the
 * casts and member accesses apply to the whole argument expression.
 */
#define	AS_2_BIN(as, seg, vp, addr, bin, szc)				\
{									\
	switch (consistent_coloring) {					\
	default:							\
		cmn_err(CE_WARN,					\
			"AS_2_BIN: bad consistent coloring value");	\
		/* assume default algorithm -> continue */		\
		/* FALLTHROUGH */					\
	case 0: {							\
		int slew = 0;						\
		pfn_t pfn;						\
									\
		if ((vp) != NULL && IS_SWAPVP(vp) &&			\
		    (seg)->s_ops == &segvn_ops)				\
			slew = as_color_bin(as);			\
									\
		pfn = ((uintptr_t)(addr) >> MMU_PAGESHIFT) +		\
			(((uintptr_t)(addr) >> page_coloring_shift) <<	\
			(vac_shift - MMU_PAGESHIFT));			\
		if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) {	\
			pfn += slew;					\
			bin = PFN_2_COLOR(pfn, szc, NULL);		\
		} else {						\
			bin = PFN_2_COLOR(pfn, szc, NULL);		\
			bin += slew >> (vac_shift - MMU_PAGESHIFT);	\
			bin &= hw_page_array[(szc)].hp_colors - 1;	\
		}							\
		break;							\
	}								\
	case 1:								\
		bin = PFN_2_COLOR(((uintptr_t)(addr) >> MMU_PAGESHIFT),	\
		    szc, NULL);						\
		break;							\
	case 2: {							\
		int cnt = as_color_bin(as);				\
		uint_t color_mask = page_get_pagecolors(0) - 1;		\
									\
		/* make sure physical color aligns with vac color */	\
		while ((cnt & vac_colors_mask) !=			\
		    addr_to_vcolor(addr)) {				\
			cnt++;						\
		}							\
		bin = cnt = cnt & color_mask;				\
		bin >>= PAGE_GET_COLOR_SHIFT(0, szc);			\
		/* update per as page coloring fields */		\
		cnt = (cnt + 1) & color_mask;				\
		if (cnt == (as_color_start(as) & color_mask)) {		\
			cnt = as_color_start(as) = as_color_start(as) +	\
				PGCLR_LOOPFACTOR;			\
		}							\
		as_color_bin(as) = cnt & color_mask;			\
		break;							\
	}								\
	}								\
	ASSERT(bin < page_get_pagecolors(szc));				\
}
682 
/*
 * cpu private vm data - accessed thru CPU->cpu_vm_data
 *	vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock()
 *	vc_pnext_memseg: tracks last memseg visited in page_nextn()
 *	vc_kmptr: unaligned kmem pointer for this vm_cpu_data_t
 *	vc_kmsize: original kmem size for this vm_cpu_data_t
 */
690 
691 typedef struct {
692 	struct memseg	*vc_pnum_memseg;
693 	struct memseg	*vc_pnext_memseg;
694 	void		*vc_kmptr;
695 	size_t		vc_kmsize;
696 } vm_cpu_data_t;
697 
698 /* allocation size to ensure vm_cpu_data_t resides in its own cache line */
699 #define	VM_CPU_DATA_PADSIZE						\
700 	(P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX))
701 
/*
 * Function to get an ecache color bin: F(as, cnt, vcolor).
 * The goal of this function is:
 * - to spread a process's physical pages across the entire ecache to
 *	maximize its use.
 * - to minimize vac flushes caused when we reuse a physical page on a
 *	different vac color than it was previously used.
 * - to prevent all processes from using the exact same colors and
 *	thrashing each other.
 *
 * cnt is a bin ptr kept on a per as basis.  As we page_create we increment
 * the ptr so we spread out the physical pages to cover the entire ecache.
 * The virtual color is made a subset of the physical color in order to
 * minimize virtual cache flushing.
 * We add in the as to spread out different as.	 This happens when we
 * initialize the start count value.
 * sizeof(struct as) is 60 so we shift by 3 to get into the bit range
 * that will tend to change.  For example, on spitfire based machines
 * (vcshft == 1) contiguous as are spread by ~6 bins.
 * vcshft provides for proper virtual color alignment.
 * In theory cnt should be updated using cas only but if we are off by one
 * or 2 it is no big deal.
 * We also keep a start value which is used to randomize on what bin we
 * start counting when it is time to start another loop. This avoids
 * contiguous allocations of ecache size pointing to the same bin.
 * Why 3? Seems to work ok. Better than 7 or anything larger.
 */
729 #define	PGCLR_LOOPFACTOR 3
730 
/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look. Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target bin
 * twice to make sure we get all of the bins.
 */
747 #define	BIN_STEP	20
748 
749 #ifdef VM_STATS
750 struct vmm_vmstats_str {
751 	ulong_t pgf_alloc[MMU_PAGE_SIZES];	/* page_get_freelist */
752 	ulong_t pgf_allocok[MMU_PAGE_SIZES];
753 	ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
754 	ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
755 	ulong_t pgf_allocdeferred;
756 	ulong_t	pgf_allocretry[MMU_PAGE_SIZES];
757 	ulong_t pgc_alloc;			/* page_get_cachelist */
758 	ulong_t pgc_allocok;
759 	ulong_t pgc_allocokrem;
760 	ulong_t	pgc_allocokdeferred;
761 	ulong_t pgc_allocfailed;
762 	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];	/* page_get_contig_pages */
763 	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
764 	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
765 	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];
766 	ulong_t	ptcp[MMU_PAGE_SIZES];		/* page_trylock_contig_pages */
767 	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
768 	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
769 	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
770 	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
771 	ulong_t	ptcpok[MMU_PAGE_SIZES];
772 	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];	/* page_get_mnode_freelist */
773 	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
774 	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
775 	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];
776 	ulong_t	pgmc_alloc;			/* page_get_mnode_cachelist */
777 	ulong_t	pgmc_allocfailed;
778 	ulong_t	pgmc_allocempty;
779 	ulong_t	pgmc_allocok;
780 	ulong_t	pladd_free[MMU_PAGE_SIZES];	/* page_list_add/sub */
781 	ulong_t	plsub_free[MMU_PAGE_SIZES];
782 	ulong_t	pladd_cache;
783 	ulong_t	plsub_cache;
784 	ulong_t	plsubpages_szcbig;
785 	ulong_t	plsubpages_szc0;
786 	ulong_t	pfs_req[MMU_PAGE_SIZES];	/* page_freelist_split */
787 	ulong_t	pfs_demote[MMU_PAGE_SIZES];
788 	ulong_t	pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
789 	ulong_t ppr_reloc[MMU_PAGE_SIZES];	/* page_relocate */
790 	ulong_t ppr_relocok[MMU_PAGE_SIZES];
791 	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
792 	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
793 	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
794 	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
795 	ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
796 	ulong_t ppr_copyfail;
797 	/* page coalesce counter */
798 	ulong_t	page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
799 	/* candidates useful */
800 	ulong_t	page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
801 	/* ctrs changed after locking */
802 	ulong_t	page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
803 	/* page_freelist_coalesce failed */
804 	ulong_t	page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
805 	ulong_t	page_ctrs_coalesce_all;	/* page coalesce all counter */
806 	ulong_t	page_ctrs_cands_skip_all; /* candidates useful for all func */
807 };
808 extern struct vmm_vmstats_str vmm_vmstats;
809 #endif	/* VM_STATS */
810 
811 /*
812  * Used to hold off page relocations into the cage until OBP has completed
813  * its boot-time handoff of its resources to the kernel.
814  */
815 extern int page_relocate_ready;
816 
817 /*
818  * cpu/mmu-dependent vm variables may be reset at bootup.
819  */
820 extern uint_t mmu_page_sizes;
821 extern uint_t max_mmu_page_sizes;
822 extern uint_t mmu_hashcnt;
823 extern uint_t max_mmu_hashcnt;
824 extern size_t mmu_ism_pagesize;
825 extern int mmu_exported_pagesize_mask;
826 extern uint_t mmu_exported_page_sizes;
827 extern uint_t szc_2_userszc[];
828 extern uint_t userszc_2_szc[];
829 
830 #define	mmu_legacy_page_sizes	mmu_exported_page_sizes
831 #define	USERSZC_2_SZC(userszc)	(userszc_2_szc[userszc])
832 #define	SZC_2_USERSZC(szc)	(szc_2_userszc[szc])
833 
834 /*
835  * Platform specific page routines
836  */
837 extern void mach_page_add(page_t **, page_t *);
838 extern void mach_page_sub(page_t **, page_t *);
839 extern uint_t page_get_pagecolors(uint_t);
840 extern void ppcopy_kernel__relocatable(page_t *, page_t *);
841 #define	ppcopy_kernel(p1, p2)	ppcopy_kernel__relocatable(p1, p2)
842 
/*
 * platform specific large pages for kernel heap support
 *
 * mmu_init_kcontext is declared with a (void) parameter list so that it
 * is a full prototype and calls with arguments are diagnosed.
 */
extern size_t get_segkmem_lpsize(size_t lpsize);
extern size_t mmu_get_kernel_lpsize(size_t lpsize);
extern void mmu_init_kernel_pgsz(struct hat *hat);
extern void mmu_init_kcontext(void);
extern uint64_t kcontextreg;
851 
852 /*
853  * Nucleus data page allocator routines
854  */
855 extern void ndata_alloc_init(struct memlist *, uintptr_t, uintptr_t);
856 extern void *ndata_alloc(struct memlist *, size_t, size_t);
857 extern void *ndata_extra_base(struct memlist *, size_t, caddr_t);
858 extern size_t ndata_maxsize(struct memlist *);
859 extern size_t ndata_spare(struct memlist *, size_t, size_t);
860 
861 #ifdef	__cplusplus
862 }
863 #endif
864 
865 #endif	/* _VM_DEP_H */
866