xref: /titanic_52/usr/src/uts/i86pc/vm/vm_dep.h (revision 1a7c1b724419d3cb5fa6eea75123c6b2060ba31b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * UNIX machine dependent virtual memory support.
29  */
30 
31 #ifndef	_VM_DEP_H
32 #define	_VM_DEP_H
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #ifdef	__cplusplus
37 extern "C" {
38 #endif
39 
40 #include <sys/clock.h>
41 #include <vm/hat_pte.h>
42 
43 /*
44  * WARNING: vm_dep.h is included by files in common. As such, macros
45  * dependent upon PTE36 such as LARGEPAGESIZE cannot be used in this file.
46  */
47 
48 #define	GETTICK()	tsc_read()
49 
50 /* memranges in descending order */
51 extern pfn_t		*memranges;
52 
/*
 * Bounds of memory range `mtype', taken from memranges[].
 *
 * MEMRANGELO(mtype) is memranges[mtype].  MEMRANGEHI(mtype) is one pfn
 * below the start of the preceding table entry; entry 0 has no
 * predecessor, so its upper bound is physmax.
 *
 * The argument is parenthesized at every use so that an expression built
 * from low-precedence operators (e.g. `a | b') expands as intended.
 * MEMRANGEHI evaluates its argument more than once, so do not pass an
 * expression with side effects.
 */
#define	MEMRANGEHI(mtype)						\
	(((mtype) > 0) ? memranges[(mtype) - 1] - 1 : physmax)
#define	MEMRANGELO(mtype)	(memranges[(mtype)])
56 
57 /*
58  * combined memory ranges from mnode and memranges[] to manage single
59  * mnode/mtype dimension in the page lists.
60  */
61 typedef struct {
62 	pfn_t	mnr_pfnlo;
63 	pfn_t	mnr_pfnhi;
64 	int	mnr_mnode;
65 	int	mnr_memrange;		/* index into memranges[] */
66 #ifdef DEBUG
67 	/* maintain page list stats */
68 	pgcnt_t	mnr_mt_pgmax;		/* mnode/mtype max page cnt */
69 	pgcnt_t	mnr_mt_pgcnt;		/* free cnt */
70 	pgcnt_t	mnr_mt_clpgcnt;		/* cache list free cnt */
71 	struct mnr_mts {		/* mnode/mtype szc stats */
72 		pgcnt_t	mnr_mts_pgcnt;
73 		int	mnr_mts_colors;
74 		pgcnt_t *mnr_mtsc_pgcnt;
75 	} 	*mnr_mts;
76 #endif
77 } mnoderange_t;
78 
79 #ifdef DEBUG
/*
 * PLCNT_SZ(ctrs_sz): add to ctrs_sz the number of bytes of DEBUG page
 * list statistics storage that PLCNT_INIT() hands out: for every
 * mnoderange, one struct mnr_mts per page size plus one pgcnt_t per page
 * color of each page size.
 *
 * ctrs_sz must be a modifiable lvalue.  The macro's own locals carry a
 * plcnt_ prefix so that they cannot shadow the caller's variable (a
 * caller variable named `szc' or `colors' used to be captured), and the
 * argument is parenthesized at each use.
 */
#define	PLCNT_SZ(ctrs_sz) {						\
	int	plcnt_szc, plcnt_colors;				\
	(ctrs_sz) += mnoderangecnt * sizeof (struct mnr_mts) *		\
	    mmu_page_sizes;						\
	for (plcnt_szc = 0; plcnt_szc < mmu_page_sizes; plcnt_szc++) {	\
		plcnt_colors = page_get_pagecolors(plcnt_szc);		\
		(ctrs_sz) += mnoderangecnt * sizeof (pgcnt_t) *		\
		    plcnt_colors;					\
	}								\
}
89 
/*
 * PLCNT_INIT(addr): carve the DEBUG page list statistics out of the
 * storage starting at addr.
 *
 * For each mnoderange, in order, the storage is laid out as an array of
 * mmu_page_sizes struct mnr_mts followed, for each page size, by an array
 * of page_get_pagecolors(szc) pgcnt_t counters.  addr must be a
 * modifiable byte pointer; on completion it has been advanced past
 * everything that was handed out (the amount PLCNT_SZ() accounts for).
 *
 * The argument is parenthesized so the casts apply to the whole
 * expression, and the macro's locals carry a plcnt_ prefix so they cannot
 * shadow a caller variable named `mt', `szc' or `colors'.
 */
#define	PLCNT_INIT(addr) {						\
	int	plcnt_mt, plcnt_szc, plcnt_colors;			\
	for (plcnt_mt = 0; plcnt_mt < mnoderangecnt; plcnt_mt++) {	\
		struct mnr_mts *plcnt_mts = (struct mnr_mts *)(addr);	\
		mnoderanges[plcnt_mt].mnr_mts = plcnt_mts;		\
		(addr) += (sizeof (struct mnr_mts) * mmu_page_sizes);	\
		for (plcnt_szc = 0; plcnt_szc < mmu_page_sizes;		\
		    plcnt_szc++) {					\
			plcnt_colors = page_get_pagecolors(plcnt_szc);	\
			plcnt_mts[plcnt_szc].mnr_mts_colors =		\
			    plcnt_colors;				\
			plcnt_mts[plcnt_szc].mnr_mtsc_pgcnt =		\
			    (pgcnt_t *)(addr);				\
			(addr) += (sizeof (pgcnt_t) * plcnt_colors);	\
		}							\
	}								\
}
/*
 * PLCNT_DO(pp, mtype, szc, cnt, flags): apply a page count delta of cnt
 * to the DEBUG statistics of mnoderanges[mtype].
 *
 *  - mnr_mt_pgmax is bumped (non-atomically) only when flags contains
 *    PG_LIST_ISINIT;
 *  - mnr_mt_pgcnt is always updated;
 *  - mnr_mt_clpgcnt is updated only when flags contains PG_CACHE_LIST;
 *  - the per-szc count and the per-szc, per-color count (color taken
 *    from PP_2_BIN(pp)) are always updated.
 *
 * flags and cnt are parenthesized so that compound expressions such as
 * `a | b' are tested as a whole; the local carries a plcnt_ prefix so it
 * cannot shadow a caller identifier named `bin'.
 */
#define	PLCNT_DO(pp, mtype, szc, cnt, flags) {				\
	int	plcnt_bin = PP_2_BIN(pp);				\
	if ((flags) & PG_LIST_ISINIT)					\
		mnoderanges[mtype].mnr_mt_pgmax += (cnt);		\
	atomic_add_long(&mnoderanges[mtype].mnr_mt_pgcnt, (cnt));	\
	if ((flags) & PG_CACHE_LIST)					\
		atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt,	\
		    (cnt));						\
	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].		\
	    mnr_mts_pgcnt, (cnt));					\
	atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].		\
	    mnr_mtsc_pgcnt[plcnt_bin], (cnt));				\
}
118 #else
/* No page list statistics in this configuration: expand to nothing. */
#define	PLCNT_SZ(ctrs_sz)
#define	PLCNT_INIT(addr)
#define	PLCNT_DO(pp, mtype, szc, cnt, flags)
122 #endif
123 
/*
 * PLCNT_INCR(pp, mnode, szc, flags): account for a page of size code szc
 * being added to a page list.
 *
 * The delta is the number of base pages in a szc page,
 * 1 << PAGE_BSZS_SHIFT(szc).  It is added to mem_node_config[mnode].cursize
 * and, when physmax4g is set and the page's mtype is at or below mtype4g,
 * to freemem4g; in that same case maxmem4g also grows if flags contains
 * PG_LIST_ISINIT.  PLCNT_DO() then updates the DEBUG statistics.
 *
 * flags is parenthesized so compound flag expressions are tested as a
 * whole, and the locals carry a plcnt_ prefix so they cannot capture
 * identifiers appearing in the caller's arguments.  pp and szc are
 * evaluated more than once.
 */
#define	PLCNT_INCR(pp, mnode, szc, flags) {				\
	long	plcnt_cnt = (1L << PAGE_BSZS_SHIFT(szc));		\
	int	plcnt_mtype = PP_2_MTYPE(pp);				\
	atomic_add_long(&mem_node_config[mnode].cursize, plcnt_cnt);	\
	if (physmax4g && plcnt_mtype <= mtype4g) {			\
		atomic_add_long(&freemem4g, plcnt_cnt);			\
		if ((flags) & PG_LIST_ISINIT)				\
			maxmem4g += plcnt_cnt;				\
	}								\
	PLCNT_DO(pp, plcnt_mtype, szc, plcnt_cnt, (flags));		\
}
136 
/*
 * PLCNT_DECR(pp, mnode, szc, flags): account for a page of size code szc
 * being removed from a page list.
 *
 * The (negative) delta is minus the number of base pages in a szc page.
 * It is computed as -(1L << shift): left-shifting the negative constant
 * -1, as this macro used to do, is undefined behavior in C.  The delta is
 * added to mem_node_config[mnode].cursize and, when physmax4g is set and
 * the page's mtype is at or below mtype4g, to freemem4g; PLCNT_DO() then
 * updates the DEBUG statistics.
 *
 * The locals carry a plcnt_ prefix so they cannot capture identifiers
 * appearing in the caller's arguments.  pp and szc are evaluated more
 * than once.
 */
#define	PLCNT_DECR(pp, mnode, szc, flags) {				\
	long	plcnt_cnt = -(1L << PAGE_BSZS_SHIFT(szc));		\
	int	plcnt_mtype = PP_2_MTYPE(pp);				\
	atomic_add_long(&mem_node_config[mnode].cursize, plcnt_cnt);	\
	if (physmax4g && plcnt_mtype <= mtype4g)			\
		atomic_add_long(&freemem4g, plcnt_cnt);			\
	PLCNT_DO(pp, plcnt_mtype, szc, plcnt_cnt, (flags));		\
}
145 
146 extern mnoderange_t	*mnoderanges;
147 extern int		mnoderangecnt;
148 extern int		mtype4g;
149 
150 /*
151  * 4g memory management variables for systems with more than 4g of memory:
152  *
153  * physical memory below 4g is required for 32bit dma devices and, currently,
154  * for kmem memory. On systems with more than 4g of memory, the pool of memory
155  * below 4g can be depleted without any paging activity given that there is
156  * likely to be sufficient memory above 4g.
157  *
158  * physmax4g is set true if the largest pfn is over 4g. The rest of the
159  * 4g memory management code is enabled only when physmax4g is true.
160  *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * the BIOS may reserve large chunks of space below 4g for hot plug pci
 * devices, agp aperture etc.
165  *
166  * freemem4g maintains the count of the number of available pages on the
167  * page lists with physical addresses below 4g.
168  *
169  * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
170  * 6% (desfree4gshift = 4) of maxmem4g.
171  *
 * RESTRICT4G_ALLOC is true if physmax4g is set, freemem4g has fallen below
 * DESFREE4G, and freemem4g is less than half of freemem (i.e. the amount
 * of free memory above 4g is greater than freemem4g). In this case,
 * page_get_* routines will restrict below 4g allocations for requests
 * that don't specifically require it.
176  */
177 
178 extern int		physmax4g;
179 extern pgcnt_t		maxmem4g;
180 extern pgcnt_t		freemem4g;
181 extern int		lotsfree4gshift;
182 extern int		desfree4gshift;
/* Below-4g thresholds, each a power-of-two fraction of maxmem4g. */
#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

/*
 * True when 4g management is enabled, freemem4g is under DESFREE4G and
 * twice freemem4g is still less than freemem.
 */
#define	RESTRICT4G_ALLOC						\
	(physmax4g &&							\
	(freemem4g < DESFREE4G) &&					\
	((freemem4g << 1) < freemem))
188 
189 extern int		restricted_kmemalloc;
190 extern int		memrange_num(pfn_t);
191 extern int		pfn_2_mtype(pfn_t);
192 extern int		mtype_func(int, int, uint_t);
193 
194 #define	NUM_MEM_RANGES	4		/* memory range types */
195 
196 /*
197  * Per page size free lists. Allocated dynamically.
198  * dimensions [mtype][mmu_page_sizes][colors]
199  *
200  * mtype specifies a physical memory range with a unique mnode.
201  */
202 
203 extern page_t ****page_freelists;
204 
/*
 * Head of the free list for (mtype, szc, color); usable as an lvalue.
 * The mnode argument is not used.
 */
#define	PAGE_FREELISTS(mnode, szc, color, mtype)		\
	(page_freelists[mtype][szc][(color)])
207 
208 /*
209  * For now there is only a single size cache list. Allocated dynamically.
210  * dimensions [mtype][colors]
211  *
212  * mtype specifies a physical memory range with a unique mnode.
213  */
214 extern page_t ***page_cachelists;
215 
/*
 * Head of the cache list for (mtype, color); usable as an lvalue.
 * The mnode argument is not used.
 */
#define	PAGE_CACHELISTS(mnode, color, mtype)		\
	(page_cachelists[mtype][(color)])
218 
219 /*
220  * There are mutexes for both the page freelist
221  * and the page cachelist.  We want enough locks to make contention
222  * reasonable, but not too many -- otherwise page_freelist_lock() gets
223  * so expensive that it becomes the bottleneck!
224  */
225 
226 #define	NPC_MUTEX	16
227 
228 extern kmutex_t	*fpc_mutex[NPC_MUTEX];
229 extern kmutex_t	*cpc_mutex[NPC_MUTEX];
230 
231 extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t);
232 extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
233 
/*
 * Page pointer helpers.  Every macro parenthesizes its page pointer
 * argument so that pointer expressions such as `pp + 1' or `&pages[i]'
 * can be passed; the argument may be evaluated more than once.
 */

/* Find the bin for the given page if it was of size szc */
#define	PP_2_BIN_SZC(pp, szc)						\
	((((pp)->p_pagenum) & page_colors_mask) >>			\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift))

/* Bin for the page at its current size code (p_szc) */
#define	PP_2_BIN(pp)		(PP_2_BIN_SZC((pp), (pp)->p_szc))

#define	PP_2_MEM_NODE(pp)	(PFN_2_MEM_NODE((pp)->p_pagenum))
#define	PP_2_MTYPE(pp)		(pfn_2_mtype((pp)->p_pagenum))
#define	PP_2_SZC(pp)		((pp)->p_szc)

/* Number of base pages in a page of size code szc */
#define	SZCPAGES(szc)		(1 << PAGE_BSZS_SHIFT(szc))
/* pfnum rounded down to a multiple of SZCPAGES(szc) */
#define	PFN_BASE(pfnum, szc)	((pfnum) & ~(SZCPAGES(szc) - 1))
247 
248 #if defined(__amd64)
249 
250 /*
251  * set the mtype range (called from page_get_{free,cache}list)
252  *   - set range to above 4g if the system has more than 4g of memory and the
253  *   amount of memory below 4g runs low otherwise set range to all of memory
254  *   starting from the hi pfns.
255  *
256  * page_get_anylist gets its mtype range from the specified ddi_dma_attr_t.
257  */
/*
 * Start at the last mnoderange.  Ask for the range down to 4g when
 * RESTRICT4G_ALLOC holds, otherwise for the range down to 0.
 * vp and vaddr are not used by this variant.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags) {				\
	mtype = mnoderangecnt - 1;					\
	if (!RESTRICT4G_ALLOC) {					\
		flags |= PGI_MT_RANGE0;					\
	} else {							\
		/* here only for > 4g systems */			\
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);			\
		flags |= PGI_MT_RANGE4G;				\
	}								\
}
268 
269 #elif defined(__i386)
270 
271 /*
272  * set the mtype range
273  *   - kmem requests needs to be below 4g if restricted_kmemalloc is set.
274  *   - for non kmem requests, set range to above 4g if the amount of memory
275  *   below 4g runs low.
276  */
277 
/*
 * A request for kvp at an address inside [kernelheap, ekernelheap) while
 * restricted_kmemalloc is set starts at mtype4g with the range down to 0.
 * Every other request starts at the last mnoderange and asks for the
 * range down to 4g when RESTRICT4G_ALLOC holds, otherwise down to 0.
 */
#define	MTYPE_INIT(mtype, vp, vaddr, flags) {				\
	if (!(restricted_kmemalloc && (vp) == &kvp &&			\
	    (caddr_t)(vaddr) >= kernelheap &&				\
	    (caddr_t)(vaddr) < ekernelheap)) {				\
		mtype = mnoderangecnt - 1;				\
		if (!RESTRICT4G_ALLOC) {				\
			flags |= PGI_MT_RANGE0;				\
		} else {						\
			/* here only for > 4g systems */		\
			VM_STAT_ADD(vmm_vmstats.restrict4gcnt);		\
			flags |= PGI_MT_RANGE4G;			\
		}							\
	} else {							\
		ASSERT(physmax4g);					\
		mtype = mtype4g;					\
		flags |= PGI_MT_RANGE0;					\
	}								\
}
296 
297 #endif	/* __i386 */
298 
299 /*
300  * macros to loop through the mtype range (page_get_mnode_{free,cache,any}list,
301  * and page_get_contig_pages)
302  *
303  * MTYPE_START sets the initial mtype. -1 if the mtype range specified does
304  * not contain mnode.
305  *
306  * MTYPE_NEXT sets the next mtype. -1 if there are no more valid
307  * mtype in the range.
308  */
309 
/*
 * MTYPE_START(mnode, mtype, flags) and MTYPE_NEXT(mnode, mtype, flags)
 * store the result of mtype_func() back into mtype and also yield it as
 * the value of the expression.  MTYPE_NEXT additionally sets PGI_MT_NEXT
 * in the flags passed to mtype_func().
 *
 * mtype must be a modifiable lvalue.  All arguments are parenthesized so
 * that, for example, a conditional expression passed as flags is OR-ed
 * with PGI_MT_NEXT as a whole.
 */
#define	MTYPE_START(mnode, mtype, flags)				\
	((mtype) = mtype_func((mnode), (mtype), (flags)))

#define	MTYPE_NEXT(mnode, mtype, flags)					\
	((mtype) = mtype_func((mnode), (mtype), (flags) | PGI_MT_NEXT))
315 
/*
 * mtype init for page_get_replacement_page: start at the last mnoderange
 * and ask for the range down to 0.  pp and mnode are not used.
 */
#define	MTYPE_PGR_INIT(mtype, flags, pp, mnode) {			\
	mtype = mnoderangecnt - 1;					\
	flags |= PGI_MT_RANGE0;						\
}
322 
/*
 * MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi): copy the pfn bounds of
 * mnoderanges[mtype] into pfnlo and pfnhi, after asserting that the range
 * belongs to mnode.
 *
 * The statements are wrapped in a brace block, like the other statement
 * macros in this file, so the macro behaves as a single statement: as
 * three bare statements, only the ASSERT was governed by a preceding
 * unbraced `if'.  pfnlo and pfnhi must be modifiable lvalues.
 */
#define	MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) {			\
	ASSERT(mnoderanges[mtype].mnr_mnode == (mnode));		\
	(pfnlo) = mnoderanges[mtype].mnr_pfnlo;				\
	(pfnhi) = mnoderanges[mtype].mnr_pfnhi;				\
}
327 
/*
 * PC_BIN_MUTEX(mnode, bin, flags): address of the mutex covering color
 * bin `bin' on memory node `mnode'; the freelist mutex when flags
 * contains PG_FREE_LIST, the cachelist mutex otherwise.  The bin is
 * folded onto the NPC_MUTEX mutexes with (bin) & (NPC_MUTEX - 1).
 *
 * flags is parenthesized so that a compound expression such as `a | b'
 * is tested against PG_FREE_LIST as a whole.
 */
#define	PC_BIN_MUTEX(mnode, bin, flags) (((flags) & PG_FREE_LIST) ?	\
	&fpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)] :			\
	&cpc_mutex[(bin) & (NPC_MUTEX - 1)][(mnode)])

/* Address of freelist/cachelist mutex number i for memory node mnode. */
#define	FPC_MUTEX(mnode, i)	(&fpc_mutex[(i)][(mnode)])
#define	CPC_MUTEX(mnode, i)	(&cpc_mutex[(i)][(mnode)])
334 
/* CHK_LPG() calls chk_lpg() on DEBUG kernels and is empty otherwise. */
#ifndef DEBUG
#define	CHK_LPG(pp, szc)
#else
extern void	chk_lpg(page_t *, uchar_t);
#define	CHK_LPG(pp, szc)	chk_lpg(pp, szc)
#endif
341 
/*
 * LEVEL_SIZE(rg_szc) expressed in units of the next smaller level,
 * rg_szc - 1.  The argument is parenthesized before the subtraction so
 * that expressions such as `a | b' are handled correctly.
 */
#define	FULL_REGION_CNT(rg_szc)	\
	(LEVEL_SIZE((rg_szc)) >> LEVEL_SHIFT((rg_szc) - 1))

/*
 * Return the leader for this mapping size: step back from pp by the
 * offset of its pfn within a SZCPAGES(szc)-aligned group.
 */
#define	PP_GROUPLEADER(pp, szc) \
	(&(pp)[-(int)((pp)->p_pagenum & (SZCPAGES(szc)-1))])

/* Return the root page for this page based on p_szc */
#define	PP_PAGEROOT(pp) ((pp)->p_szc == 0 ? (pp) : \
	PP_GROUPLEADER((pp), (pp)->p_szc))
352 
353 /*
354  * The counter base must be per page_counter element to prevent
355  * races when re-indexing, and the base page size element should
356  * be aligned on a boundary of the given region size.
357  *
358  * We also round up the number of pages spanned by the counters
359  * for a given region to PC_BASE_ALIGN in certain situations to simplify
360  * the coding for some non-performance critical routines.
361  */
362 
/* Number of base pages in the largest page size, and the matching mask. */
#define	PC_BASE_ALIGN							\
	((pfn_t)1 << PAGE_BSZS_SHIFT(MMU_PAGE_SIZES - 1))
#define	PC_BASE_ALIGN_MASK	(PC_BASE_ALIGN - 1)
365 
366 /*
367  * cpu/mmu-dependent vm variables
368  */
369 extern uint_t mmu_page_sizes;
370 extern uint_t mmu_exported_page_sizes;
371 
/*
 * On x86 the user-visible size code and the kernel's szc are the same
 * value, so both conversions are the identity.
 */
#define	USERSZC_2_SZC(userszc)	(userszc)
#define	SZC_2_USERSZC(szc)	(szc)
375 
/*
 * Counter type for hw_page_map_t.  It must be able to hold the ratio of
 * a large page size to the base page size (1024 max).
 */
typedef	short int	hpmctr_t;
381 
382 /*
383  * get the setsize of the current cpu - assume homogenous for x86
384  */
extern int	l2cache_sz;
extern int	l2cache_linesz;
extern int	l2cache_assoc;

#define	L2CACHE_ALIGN		l2cache_linesz

/*
 * l2cache_sz / l2cache_assoc, falling back to MMU_PAGESIZE when
 * l2cache_assoc is zero (which also avoids dividing by zero).
 */
#define	CPUSETSIZE()		\
	((l2cache_assoc == 0) ? MMU_PAGESIZE : (l2cache_sz / l2cache_assoc))
390 
/*
 * Shift count giving the number of base pages in a page of size code szc,
 * i.e. log2(pagesize(szc) / MMU_PAGESIZE).
 */
#define	PAGE_BSZS_SHIFT(szc)	(LEVEL_SHIFT(szc) - MMU_PAGESHIFT)
396 
397 /*
398  * Internal PG_ flags.
399  */
#define	PGI_RELOCONLY	0x00010000	/* opposite of PG_NORELOC */
#define	PGI_NOCAGE	0x00020000	/* cage is disabled */
#define	PGI_PGCPHIPRI	0x00040000	/* page_get_contig_page pri alloc */
#define	PGI_PGCPSZC0	0x00080000	/* relocate base pagesize page */

/*
 * PGI range flags - these bits must stay clear of the PGI flags above.
 */
#define	PGI_MT_RANGE0	0x01000000	/* mtype range to 0 */
#define	PGI_MT_RANGE4G	0x02000000	/* mtype range to 4g */
#define	PGI_MT_NEXT	0x04000000	/* get next mtype */
#define	PGI_MT_RANGE	(PGI_MT_RANGE0 | PGI_MT_RANGE4G)
412 
413 /*
414  * hash as and addr to get a bin.
415  */
416 
/*
 * Store in bin the sum of the page number of addr and the as pointer
 * shifted right by 4, masked with page_colors_mask.  seg and vp are not
 * used.
 */
#define	AS_2_BIN(as, seg, vp, addr, bin)				\
	bin = ((((uintptr_t)(as) >> 4) +				\
	    ((uintptr_t)(addr) >> PAGESHIFT)) & page_colors_mask)
420 
421 /*
422  * When a bin is empty, and we can't satisfy a color request correctly,
423  * we scan.  If we assume that the programs have reasonable spatial
424  * behavior, then it will not be a good idea to use the adjacent color.
425  * Using the adjacent color would result in virtually adjacent addresses
426  * mapping into the same spot in the cache.  So, if we stumble across
427  * an empty bin, skip a bunch before looking.  After the first skip,
428  * then just look one bin at a time so we don't miss our cache on
429  * every look. Be sure to check every bin.  Page_create() will panic
430  * if we miss a page.
431  *
 * This also explains the `<=' in the for loops in both page_get_freelist()
 * and page_get_cachelist().  Since we checked the target bin, skipped
 * a bunch, then continued one at a time, we wind up checking the target bin
 * twice to make sure we get all of the bins.
436  */
437 #define	BIN_STEP	19
438 
#ifdef VM_STATS
/*
 * VM statistics counters (VM_STATS builds only).  Array members hold one
 * counter per page size.  The grouping comments below go by the field
 * name prefixes only -- NOTE(review): confirm against the code that bumps
 * each counter.
 */
struct vmm_vmstats_str {
	/* pc_list_*: page list add/sub */
	ulong_t	pc_list_add_pages[MMU_PAGE_SIZES];
	ulong_t	pc_list_sub_pages1[MMU_PAGE_SIZES];
	ulong_t	pc_list_sub_pages2[MMU_PAGE_SIZES];
	ulong_t	pc_list_sub_pages3[MMU_PAGE_SIZES];

	/* pgf_* */
	ulong_t	pgf_alloc[MMU_PAGE_SIZES];
	ulong_t	pgf_allocok[MMU_PAGE_SIZES];
	ulong_t	pgf_allocokrem[MMU_PAGE_SIZES];
	ulong_t	pgf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgf_allocdeferred;
	ulong_t	pgf_allocretry[MMU_PAGE_SIZES];

	/* pgc_* */
	ulong_t	pgc_alloc;
	ulong_t	pgc_allocok;
	ulong_t	pgc_allocokrem;
	ulong_t	pgc_allocokdeferred;
	ulong_t	pgc_allocfailed;

	/* pgcp_* */
	ulong_t	pgcp_alloc[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgcp_allocok[MMU_PAGE_SIZES];

	/* ptcp* */
	ulong_t	ptcp[MMU_PAGE_SIZES];
	ulong_t	ptcpfreethresh[MMU_PAGE_SIZES];
	ulong_t	ptcpfailexcl[MMU_PAGE_SIZES];
	ulong_t	ptcpfailszc[MMU_PAGE_SIZES];
	ulong_t	ptcpfailcage[MMU_PAGE_SIZES];
	ulong_t	ptcpok[MMU_PAGE_SIZES];

	/* pgmf_* */
	ulong_t	pgmf_alloc[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocfailed[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocempty[MMU_PAGE_SIZES];
	ulong_t	pgmf_allocok[MMU_PAGE_SIZES];

	/* pgmc_* */
	ulong_t	pgmc_alloc;
	ulong_t	pgmc_allocfailed;
	ulong_t	pgmc_allocempty;
	ulong_t	pgmc_allocok;

	/* ppr_* */
	ulong_t	ppr_reloc[MMU_PAGE_SIZES];
	ulong_t	ppr_relocnoroot[MMU_PAGE_SIZES];
	ulong_t	ppr_reloc_replnoroot[MMU_PAGE_SIZES];
	ulong_t	ppr_relocnolock[MMU_PAGE_SIZES];
	ulong_t	ppr_relocnomem[MMU_PAGE_SIZES];
	ulong_t	ppr_relocok[MMU_PAGE_SIZES];

	/* page_ctrs_* */
	ulong_t	page_ctrs_coalesce;	/* page coalesce counter */
	ulong_t	page_ctrs_cands_skip;	/* candidates useful */
	ulong_t	page_ctrs_changed;	/* ctrs changed after locking */
	ulong_t	page_ctrs_failed;	/* page_freelist_coalesce failed */
	ulong_t	page_ctrs_coalesce_all;	/* page coalesce all counter */
	ulong_t	page_ctrs_cands_skip_all; /* candidates useful for all func */

	/* bumped by MTYPE_INIT() each time RESTRICT4G_ALLOC is true */
	ulong_t	restrict4gcnt;
};
extern struct vmm_vmstats_str vmm_vmstats;
#endif	/* VM_STATS */
490 
491 extern size_t page_ctrs_sz(void);
492 extern caddr_t page_ctrs_alloc(caddr_t);
493 extern void page_ctr_sub(page_t *, int);
494 extern page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
495 extern uint_t page_get_pagecolors(uint_t);
496 
497 #ifdef	__cplusplus
498 }
499 #endif
500 
501 #endif	/* _VM_DEP_H */
502