/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#ifndef	_VM_DEP_H
#define	_VM_DEP_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#ifdef	__cplusplus
extern "C" {
#endif

#include <vm/hat_sfmmu.h>
#include <sys/archsystm.h>
#include <sys/memnode.h>

#define	GETTICK()	gettick()

/*
 * Per page size free lists. Allocated dynamically.
 */
#define	MAX_MEM_TYPES	2	/* 0 = reloc, 1 = noreloc */
#define	MTYPE_RELOC	0
#define	MTYPE_NORELOC	1

#define	PP_2_MTYPE(pp)	(PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC)

#define	MTYPE_INIT(mtype, vp, vaddr, flags)				\
	mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;

/*
 * macros to loop through the mtype range - noops for sparc
 */
#define	MTYPE_START(mnode, mtype, flags)
#define	MTYPE_NEXT(mnode, mtype, flags)		(-1)

/* mtype init for page_get_replacement_page */

#define	MTYPE_PGR_INIT(mtype, flags, pp, mnode)				\
	mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC;

#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi)			\
	ASSERT(mtype != MTYPE_NORELOC);					\
	pfnlo = mem_node_config[mnode].physbase;			\
	pfnhi = mem_node_config[mnode].physmax;

/*
 * Internal PG_ flags.
 */
#define	PGI_RELOCONLY	0x10000	/* acts in the opposite sense to PG_NORELOC */
#define	PGI_NOCAGE	0x20000	/* indicates Cage is disabled */
#define	PGI_PGCPHIPRI	0x40000	/* page_get_contig_page priority allocation */
#define	PGI_PGCPSZC0	0x80000	/* relocate base pagesize page */

/*
 * PGI mtype flags - should not overlap PGI flags
 */
#define	PGI_MT_RANGE	0x1000000	/* mtype range */
#define	PGI_MT_NEXT	0x2000000	/* get next mtype */
extern page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
extern page_t ***page_cachelists[MAX_MEM_TYPES];

#define	PAGE_FREELISTS(mnode, szc, color, mtype) \
	(*(page_freelists[szc][mtype][mnode] + (color)))

#define	PAGE_CACHELISTS(mnode, color, mtype) \
	(*(page_cachelists[mtype][mnode] + (color)))
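
/*
 * Illustrative lookup sketch (not actual kernel code; the local names are
 * hypothetical): the dynamically allocated arrays are indexed
 * [szc][mtype][mnode], with the color as the final offset, so a caller
 * wanting the head of a free list would typically do something like
 *
 *	page_t **ppp;
 *	int mnode = PP_2_MEM_NODE(pp);	(convenience macro defined below)
 *
 *	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, PP_2_BIN(pp), PP_2_MTYPE(pp));
 *
 * with the analogous PAGE_CACHELISTS() form for the cache list.
 */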

/*
 * There are 'page_colors' colors/bins.  Spread them out under a
 * couple of locks.  There are mutexes for both the page freelist
 * and the page cachelist.  We want enough locks to make contention
 * reasonable, but not too many -- otherwise page_freelist_lock() gets
 * so expensive that it becomes the bottleneck!
 */
#define	NPC_MUTEX	16

extern kmutex_t	*fpc_mutex[NPC_MUTEX];
extern kmutex_t	*cpc_mutex[NPC_MUTEX];

/* Find the bin for the given page if it was of size szc */
#define	PP_2_BIN_SZC(pp, szc)                                           \
	(((pp->p_pagenum) & page_colors_mask) >>                        \
	(hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift))
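
/*
 * Worked example of PP_2_BIN_SZC() under assumed values (not taken from
 * this file): with page_colors = 512 (page_colors_mask = 0x1ff), an 8K
 * base page (hp_shift 13) and szc 1 being 64K (hp_shift 16), a page with
 * p_pagenum 0x12345 yields
 *
 *	(0x12345 & 0x1ff) >> (16 - 13) = 0x145 >> 3 = 40
 *
 * i.e. the page falls in bin 40 when treated as part of a 64K page.
 */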

#define	PP_2_BIN(pp)		(PP_2_BIN_SZC(pp, pp->p_szc))

#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE(pp->p_pagenum))

#define	PC_BIN_MUTEX(mnode, bin, flags) ((flags & PG_FREE_LIST) ?	\
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] :			\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode])

#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[i][mnode])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[i][mnode])

#define	PFN_BASE(pfnum, szc)	(pfnum & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1))

typedef	char	hpmctr_t;

#ifdef DEBUG
#define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
extern void	chk_lpg(page_t *, uchar_t);
#else
#define	CHK_LPG(pp, szc)
#endif

#ifdef DEBUG

/* page list count */
typedef	struct {
	pgcnt_t	plc_m_pgmax;
	pgcnt_t	plc_m_pgcnt;
	pgcnt_t	plc_m_clpgcnt;		/* cache list cnt */
	struct {
		pgcnt_t	plc_mt_pgmax;
		pgcnt_t plc_mt_pgcnt;
		struct {
			pgcnt_t plc_mts_pgcnt;
			int	plc_mts_colors;
			pgcnt_t	*plc_mtsc_pgcnt;
		} plc_mts[MMU_PAGE_SIZES];
	} plc_mt[MAX_MEM_TYPES];
} plcnt_t[MAX_MEM_NODES];

extern plcnt_t	plcnt;

#define	PLCNT_SZ(ctrs_sz) {						\
	int	szc;							\
	for (szc = 0; szc <= mmu_page_sizes; szc++) {			\
		int	colors = page_get_pagecolors(szc);		\
		ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES *		\
		    colors * sizeof (pgcnt_t));				\
	}								\
}

#define	PLCNT_INIT(base) {						\
	int	mn, mt, szc, colors;					\
	for (szc = 0; szc < mmu_page_sizes; szc++) {			\
		colors = page_get_pagecolors(szc);			\
		for (mn = 0; mn < max_mem_nodes; mn++) {		\
			for (mt = 0; mt < MAX_MEM_TYPES; mt++) {	\
				plcnt[mn].plc_mt[mt].plc_mts[szc].	\
				    plc_mts_colors = colors;		\
				plcnt[mn].plc_mt[mt].plc_mts[szc].	\
				    plc_mtsc_pgcnt = (pgcnt_t *)base;	\
				base += (colors * sizeof (pgcnt_t));	\
			}						\
		}							\
	}								\
}

#define	PLCNT_DO(pp, mn, szc, cnt, flags) {				\
	int	mtype = PP_2_MTYPE(pp);					\
	int	bin = PP_2_BIN(pp);					\
	if (flags & (PG_LIST_ISINIT | PG_LIST_ISCAGE))			\
		atomic_add_long(&plcnt[mn].plc_mt[mtype].plc_mt_pgmax,	\
		    cnt);						\
	atomic_add_long(&mem_node_config[mn].cursize, cnt);		\
	if (flags & PG_CACHE_LIST)					\
		atomic_add_long(&plcnt[mn].plc_m_clpgcnt, cnt);		\
	atomic_add_long(&plcnt[mn].plc_m_pgcnt, cnt);			\
	atomic_add_long(&plcnt[mn].plc_mt[mtype].plc_mt_pgcnt, cnt);	\
	atomic_add_long(&plcnt[mn].plc_mt[mtype].plc_mts[szc].		\
	    plc_mts_pgcnt, cnt);					\
	atomic_add_long(&plcnt[mn].plc_mt[mtype].plc_mts[szc].		\
	    plc_mtsc_pgcnt[bin], cnt);					\
}

#define	PLCNT_INCR(pp, mn, szc, flags) {				\
	long	cnt = (1 << PAGE_BSZS_SHIFT(szc));			\
	if (flags & PG_LIST_ISINIT)					\
		plcnt[mn].plc_m_pgmax += cnt;				\
	PLCNT_DO(pp, mn, szc, cnt, flags);				\
}

#define	PLCNT_DECR(pp, mn, szc, flags) {				\
	long	cnt = ((-1) << PAGE_BSZS_SHIFT(szc));			\
	PLCNT_DO(pp, mn, szc, cnt, flags);				\
}

#else

#define	PLCNT_SZ(ctrs_sz)

#define	PLCNT_INIT(base)

#define	PLCNT_INCR(pp, mnode, szc, flags) {				\
	long	cnt = (1 << PAGE_BSZS_SHIFT(szc));			\
	atomic_add_long(&mem_node_config[mnode].cursize, cnt);		\
}

#define	PLCNT_DECR(pp, mnode, szc, flags) {				\
	long	cnt = ((-1) << PAGE_BSZS_SHIFT(szc));			\
	atomic_add_long(&mem_node_config[mnode].cursize, cnt);		\
}

#endif
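
/*
 * Illustrative usage sketch for the PLCNT macros (hypothetical caller,
 * not the actual list-add code): a routine placing a page on a free or
 * cache list would bump the counters for that page's memory node and
 * size class, e.g.
 *
 *	int mnode = PP_2_MEM_NODE(pp);
 *
 *	PLCNT_INCR(pp, mnode, pp->p_szc, flags);
 *	... link pp onto the chosen list ...
 *
 * with the matching removal path calling PLCNT_DECR() using the same
 * arguments.  On non-DEBUG kernels only mem_node_config[].cursize is
 * maintained.
 */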

/*
 * Get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)


#define	PAGE_BSZS_SHIFT(szc)	TTE_BSZS_SHIFT(szc)
/*
 * For sfmmu, each larger page size is 8 times the size of the previous
 * page size.
 */
#define	FULL_REGION_CNT(rg_szc)	(8)
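
/*
 * Worked example of the size progression (assuming TTE_BSZS_SHIFT(szc)
 * expands to (szc) * 3, as it does for sfmmu): the 8K, 64K, 512K and 4M
 * page sizes each hold FULL_REGION_CNT() = 8 pages of the next smaller
 * size, and a szc 2 (512K) page spans 1 << PAGE_BSZS_SHIFT(2) = 64 base
 * (8K) pages.
 */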

/*
 * The counter base must be per page_counter element to prevent
 * races when re-indexing, and the base page size element should
 * be aligned on a boundary of the given region size.
 *
 * We also round up the number of pages spanned by the counters
 * for a given region to PC_BASE_ALIGN in certain situations to simplify
 * the coding for some non-performance critical routines.
 */
#define	PC_BASE_ALIGN		((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1))
#define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)
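
/*
 * For example (same (szc) * 3 shift, and assuming mmu_page_sizes == 4),
 * PC_BASE_ALIGN is 1 << PAGE_BSZS_SHIFT(3) = 512 base pages, i.e. the
 * counters are aligned on a 4M boundary with an 8K base page size.
 */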

extern int ecache_alignsize;
#define	L2CACHE_ALIGN		ecache_alignsize

extern int consistent_coloring;
extern uint_t vac_colors_mask;
extern int vac_size;
extern int vac_shift;

/*
 * Auto large page selection support variables. Some CPU
 * implementations may differ from the defaults and will need
 * to change these.
 */
extern int auto_lpg_tlb_threshold;
extern int auto_lpg_minszc;
extern int auto_lpg_maxszc;
extern size_t auto_lpg_heap_default;
extern size_t auto_lpg_stack_default;
extern size_t auto_lpg_va_default;
extern size_t auto_lpg_remap_threshold;

/*
 * AS_2_BIN macro controls the page coloring policy.
 * 0 (default) uses various vaddr bits
 * 1 virtual=paddr
 * 2 bin hopping
 */
#define	AS_2_BIN(as, seg, vp, addr, bin)				\
switch (consistent_coloring) {						\
	default:							\
		cmn_err(CE_WARN,					\
			"AS_2_BIN: bad consistent coloring value");	\
		/* assume default algorithm -> continue */		\
	case 0: {							\
		uint32_t ndx, new;					\
		int slew = 0;						\
									\
		if (vp != NULL && IS_SWAPVP(vp) &&			\
			seg->s_ops == &segvn_ops)			\
			slew = as_color_bin(as);			\
									\
		bin = (((uintptr_t)addr >> MMU_PAGESHIFT) +		\
			(((uintptr_t)addr >> page_coloring_shift) <<	\
			(vac_shift - MMU_PAGESHIFT)) + slew) &		\
			page_colors_mask;				\
									\
		break;							\
	}								\
	case 1:								\
		bin = ((uintptr_t)addr >> MMU_PAGESHIFT) &		\
			page_colors_mask;				\
		break;							\
	case 2: {							\
		int cnt = as_color_bin(as);				\
		/* make sure physical color aligns with vac color */	\
		while ((cnt & vac_colors_mask) !=			\
		    addr_to_vcolor(addr)) {				\
			cnt++;						\
		}							\
		bin = cnt = cnt & page_colors_mask;			\
		/* update per as page coloring fields */		\
		cnt = (cnt + 1) & page_colors_mask;			\
		if (cnt == (as_color_start(as) & page_colors_mask)) {	\
			cnt = as_color_start(as) = as_color_start(as) + \
				PGCLR_LOOPFACTOR;			\
		}							\
		as_color_bin(as) = cnt & page_colors_mask;		\
		break;							\
	}								\
}									\
	ASSERT(bin <= page_colors_mask);
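
/*
 * Illustrative sketch of how allocation code might use AS_2_BIN (the
 * caller shown is hypothetical): the macro is evaluated into a local,
 * which then serves as the starting color for the free-list search, e.g.
 *
 *	uint_t bin;
 *
 *	AS_2_BIN(as, seg, vp, addr, bin);
 *	... search the per-color lists starting at bin ...
 *
 * Under consistent_coloring == 1 (virtual == paddr) with 512 page colors
 * and an 8K MMU_PAGESHIFT of 13, an address of 0x10002000 would yield
 * (0x10002000 >> 13) & 0x1ff = 0x8001 & 0x1ff = 1.
 */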

/*
 * Function to get an ecache color bin: F(as, cnt, vcolor).
 * The goal of this function is:
 * - to spread a process's physical pages across the entire ecache to
 *	maximize its use.
 * - to minimize vac flushes caused when we reuse a physical page on a
 *	different vac color than it was previously used.
 * - to prevent all processes from using the exact same colors and trashing
 *	each other.
 *
 * cnt is a bin ptr kept on a per as basis.  As we page_create we increment
 * the ptr so we spread out the physical pages to cover the entire ecache.
 * The virtual color is made a subset of the physical color in order to
 * minimize virtual cache flushing.
 * We add in the as to spread out different as.  This happens when we
 * initialize the start count value.
 * sizeof(struct as) is 60 so we shift by 3 to get into the bit range
 * that will tend to change.  For example, on spitfire based machines
 * (vcshft == 1) contiguous as are spread by ~6 bins.
 * vcshft provides for proper virtual color alignment.
 * In theory cnt should be updated using cas only, but if we are off by one
 * or two it is no big deal.
 * We also keep a start value which is used to randomize on what bin we
 * start counting when it is time to start another loop. This avoids
 * contiguous allocations of ecache size to point to the same bin.
 * Why 3? It seems to work ok.  Better than 7 or anything larger.
 */
#define	PGCLR_LOOPFACTOR 3

/*
 * When a bin is empty, and we can't satisfy a color request correctly,
 * we scan.  If we assume that the programs have reasonable spatial
 * behavior, then it will not be a good idea to use the adjacent color.
 * Using the adjacent color would result in virtually adjacent addresses
 * mapping into the same spot in the cache.  So, if we stumble across
 * an empty bin, skip a bunch before looking.  After the first skip,
 * then just look one bin at a time so we don't miss our cache on
 * every look.  Be sure to check every bin.  Page_create() will panic
 * if we miss a page.
 *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target bin
 * twice to make sure we get all of the bins.
 */
#define	BIN_STEP	20
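
/*
 * Rough sketch of the scan described above (not the actual
 * page_get_freelist() loop; try_bin() is a hypothetical helper): start
 * at the requested bin, jump BIN_STEP bins after the first miss, then
 * walk one bin at a time, using `<=' so the starting bin is revisited
 * at the end.
 *
 *	bin = origbin;
 *	for (i = 0; i <= page_colors; i++) {
 *		if ((pp = try_bin(mnode, bin, flags)) != NULL)
 *			break;
 *		bin = (bin + ((i == 0) ? BIN_STEP : 1)) & page_colors_mask;
 *	}
 */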

#ifdef VM_STATS
struct vmm_vmstats_str {
	ulong_t pc_list_add_pages[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages1[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages2[MMU_PAGE_SIZES];
	ulong_t pc_list_sub_pages3[MMU_PAGE_SIZES];
	ulong_t pgf_alloc[MMU_PAGE_SIZES];
	ulong_t pgf_allocok[MMU_PAGE_SIZES];
	ulong_t pgf_allocokrem[MMU_PAGE_SIZES];
	ulong_t pgf_allocfailed[MMU_PAGE_SIZES];
	ulong_t pgf_allocdeferred;
	ulong_t	pgf_allocretry[MMU_PAGE_SIZES];
	ulong_t pgc_alloc;
	ulong_t pgc_allocok;
	ulong_t pgc_allocokrem;
	ulong_t	pgc_allocokdeferred;
	ulong_t pgc_allocfailed;
	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];
	ulong_t	ptcp[MMU_PAGE_SIZES];
	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
	ulong_t	ptcpok[MMU_PAGE_SIZES];
	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];
	ulong_t	pgmc_alloc;
	ulong_t	pgmc_allocfailed;
	ulong_t	pgmc_allocempty;
	ulong_t	pgmc_allocok;
	ulong_t ppr_reloc[MMU_PAGE_SIZES];
	ulong_t ppr_relocok[MMU_PAGE_SIZES];
	ulong_t ppr_relocnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES];
	ulong_t ppr_relocnolock[MMU_PAGE_SIZES];
	ulong_t ppr_relocnomem[MMU_PAGE_SIZES];
	ulong_t ppr_krelocfail[MMU_PAGE_SIZES];
	ulong_t	page_ctrs_coalesce;	/* page coalesce counter */
	ulong_t	page_ctrs_cands_skip;	/* candidates useful */
	ulong_t	page_ctrs_changed;	/* ctrs changed after locking */
	ulong_t	page_ctrs_failed;	/* page_freelist_coalesce failed */
	ulong_t	page_ctrs_coalesce_all;	/* page coalesce all counter */
	ulong_t	page_ctrs_cands_skip_all; /* candidates useful for all func */
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif	/* VM_STATS */

/*
 * Used to hold off page relocations into the cage until OBP has completed
 * its boot-time handoff of its resources to the kernel.
 */
extern int page_relocate_ready;

/*
 * cpu/mmu-dependent vm variables may be reset at bootup.
 */
extern uint_t mmu_page_sizes;
extern uint_t max_mmu_page_sizes;
extern uint_t mmu_hashcnt;
extern uint_t max_mmu_hashcnt;
extern size_t mmu_ism_pagesize;
extern int mmu_exported_pagesize_mask;
extern uint_t mmu_exported_page_sizes;
extern uint_t szc_2_userszc[];
extern uint_t userszc_2_szc[];

#define	USERSZC_2_SZC(userszc)	(userszc_2_szc[userszc])
#define	SZC_2_USERSZC(szc)	(szc_2_userszc[szc])

/*
 * Platform specific map_pgsz large page hook routines.
 */
extern size_t map_pgszva(struct proc *p, caddr_t addr, size_t len);
extern size_t map_pgszheap(struct proc *p, caddr_t addr, size_t len);
extern size_t map_pgszstk(struct proc *p, caddr_t addr, size_t len);

/*
 * Platform specific page routines
 */
extern void mach_page_add(page_t **, page_t *);
extern void mach_page_sub(page_t **, page_t *);
extern uint_t page_get_pagecolors(uint_t);
extern void ppcopy_kernel__relocatable(page_t *, page_t *);
#define	ppcopy_kernel(p1, p2)	ppcopy_kernel__relocatable(p1, p2)

/*
 * Platform specific large pages for kernel heap support
 */
extern size_t get_segkmem_lpsize(size_t lpsize);
extern size_t mmu_get_kernel_lpsize(size_t lpsize);
extern void mmu_init_kernel_pgsz(struct hat *hat);
extern void mmu_init_kcontext(void);
extern uint64_t kcontextreg;

#ifdef	__cplusplus
}
#endif

#endif	/* _VM_DEP_H */