1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * UNIX machine dependent virtual memory support. 29 */ 30 31 #ifndef _VM_DEP_H 32 #define _VM_DEP_H 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 #ifdef __cplusplus 37 extern "C" { 38 #endif 39 40 #include <sys/clock.h> 41 #include <vm/hat_pte.h> 42 43 /* 44 * WARNING: vm_dep.h is included by files in common. As such, macros 45 * dependent upon PTE36 such as LARGEPAGESIZE cannot be used in this file. 46 */ 47 48 #define GETTICK() tsc_read() 49 50 /* memranges in descending order */ 51 extern pfn_t *memranges; 52 53 #define MEMRANGEHI(mtype) \ 54 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax) 55 #define MEMRANGELO(mtype) (memranges[mtype]) 56 57 /* 58 * combined memory ranges from mnode and memranges[] to manage single 59 * mnode/mtype dimension in the page lists. 60 */ 61 typedef struct { 62 pfn_t mnr_pfnlo; 63 pfn_t mnr_pfnhi; 64 int mnr_mnode; 65 int mnr_memrange; /* index into memranges[] */ 66 #ifdef DEBUG 67 /* maintain page list stats */ 68 pgcnt_t mnr_mt_pgmax; /* mnode/mtype max page cnt */ 69 pgcnt_t mnr_mt_pgcnt; /* free cnt */ 70 pgcnt_t mnr_mt_clpgcnt; /* cache list free cnt */ 71 struct mnr_mts { /* mnode/mtype szc stats */ 72 pgcnt_t mnr_mts_pgcnt; 73 int mnr_mts_colors; 74 pgcnt_t *mnr_mtsc_pgcnt; 75 } *mnr_mts; 76 #endif 77 } mnoderange_t; 78 79 #ifdef DEBUG 80 #define PLCNT_SZ(ctrs_sz) { \ 81 int szc, colors; \ 82 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * \ 83 mmu_page_sizes; \ 84 for (szc = 0; szc < mmu_page_sizes; szc++) { \ 85 colors = page_get_pagecolors(szc); \ 86 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; \ 87 } \ 88 } 89 90 #define PLCNT_INIT(addr) { \ 91 int mt, szc, colors; \ 92 for (mt = 0; mt < mnoderangecnt; mt++) { \ 93 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; \ 94 addr += (sizeof (struct mnr_mts) * mmu_page_sizes); \ 95 for (szc = 0; szc < mmu_page_sizes; szc++) { \ 96 colors = page_get_pagecolors(szc); \ 97 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = \ 98 colors; \ 99 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = \ 100 (pgcnt_t *)addr; \ 101 addr += (sizeof (pgcnt_t) * colors); \ 102 } \ 103 } \ 104 } 105 #define PLCNT_DO(pp, mtype, szc, cnt, flags) { \ 106 int bin = PP_2_BIN(pp); \ 107 if (flags & PG_LIST_ISINIT) \ 108 mnoderanges[mtype].mnr_mt_pgmax += cnt; \ 109 atomic_add_long(&mnoderanges[mtype].mnr_mt_pgcnt, cnt); \ 110 if (flags & PG_CACHE_LIST) \ 111 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, \ 112 cnt); \ 113 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc]. \ 114 mnr_mts_pgcnt, cnt); \ 115 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc]. \ 116 mnr_mtsc_pgcnt[bin], cnt); \ 117 } 118 #else 119 #define PLCNT_SZ(ctrs_sz) 120 #define PLCNT_INIT(base) 121 #define PLCNT_DO(pp, mtype, szc, cnt, flags) 122 #endif 123 124 #define PLCNT_INCR(pp, mnode, szc, flags) { \ 125 long cnt = (1 << PAGE_BSZS_SHIFT(szc)); \ 126 int mtype = PP_2_MTYPE(pp); \ 127 atomic_add_long(&mem_node_config[mnode].cursize, cnt); \ 128 if (physmax4g && mtype <= mtype4g) \ 129 atomic_add_long(&freemem4g, cnt); \ 130 if (flags & PG_LIST_ISINIT) { \ 131 if (physmax4g && mtype <= mtype4g) \ 132 maxmem4g += cnt; \ 133 } \ 134 PLCNT_DO(pp, mtype, szc, cnt, flags); \ 135 } 136 137 #define PLCNT_DECR(pp, mnode, szc, flags) { \ 138 long cnt = ((-1) << PAGE_BSZS_SHIFT(szc)); \ 139 int mtype = PP_2_MTYPE(pp); \ 140 atomic_add_long(&mem_node_config[mnode].cursize, cnt); \ 141 if (physmax4g && mtype <= mtype4g) \ 142 atomic_add_long(&freemem4g, cnt); \ 143 PLCNT_DO(pp, mtype, szc, cnt, flags); \ 144 } 145 146 extern mnoderange_t *mnoderanges; 147 extern int mnoderangecnt; 148 extern int mtype4g; 149 150 /* 151 * 4g memory management variables for systems with more than 4g of memory: 152 * 153 * physical memory below 4g is required for 32bit dma devices and, currently, 154 * for kmem memory. On systems with more than 4g of memory, the pool of memory 155 * below 4g can be depleted without any paging activity given that there is 156 * likely to be sufficient memory above 4g. 157 * 158 * physmax4g is set true if the largest pfn is over 4g. The rest of the 159 * 4g memory management code is enabled only when physmax4g is true. 160 * 161 * maxmem4g is the count of the maximum number of pages on the page lists 162 * with physical addresses below 4g. It can be a lot less then 4g given that 163 * BIOS may reserve large chunks of space below 4g for hot plug pci devices, 164 * agp aperture etc. 165 * 166 * freemem4g maintains the count of the number of available pages on the 167 * page lists with physical addresses below 4g. 168 * 169 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to 170 * 6% (desfree4gshift = 4) of maxmem4g. 171 * 172 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G 173 * and the amount of physical memory above 4g is greater than freemem4g. 174 * In this case, page_get_* routines will restrict below 4g allocations 175 * for requests that don't specifically require it. 176 */ 177 178 extern int physmax4g; 179 extern pgcnt_t maxmem4g; 180 extern pgcnt_t freemem4g; 181 extern int lotsfree4gshift; 182 extern int desfree4gshift; 183 #define LOTSFREE4G (maxmem4g >> lotsfree4gshift) 184 #define DESFREE4G (maxmem4g >> desfree4gshift) 185 186 #define RESTRICT4G_ALLOC \ 187 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem)) 188 189 extern int restricted_kmemalloc; 190 extern int memrange_num(pfn_t); 191 extern int pfn_2_mtype(pfn_t); 192 extern int mtype_func(int, int, uint_t); 193 194 #define NUM_MEM_RANGES 4 /* memory range types */ 195 196 /* 197 * Per page size free lists. Allocated dynamically. 198 * dimensions [mtype][mmu_page_sizes][colors] 199 * 200 * mtype specifies a physical memory range with a unique mnode. 201 */ 202 203 extern page_t ****page_freelists; 204 205 #define PAGE_FREELISTS(mnode, szc, color, mtype) \ 206 (*(page_freelists[mtype][szc] + (color))) 207 208 /* 209 * For now there is only a single size cache list. Allocated dynamically. 210 * dimensions [mtype][colors] 211 * 212 * mtype specifies a physical memory range with a unique mnode. 213 */ 214 extern page_t ***page_cachelists; 215 216 #define PAGE_CACHELISTS(mnode, color, mtype) \ 217 (*(page_cachelists[mtype] + (color))) 218 219 /* 220 * There are mutexes for both the page freelist 221 * and the page cachelist. We want enough locks to make contention 222 * reasonable, but not too many -- otherwise page_freelist_lock() gets 223 * so expensive that it becomes the bottleneck! 224 */ 225 226 #define NPC_MUTEX 16 227 228 extern kmutex_t *fpc_mutex[NPC_MUTEX]; 229 extern kmutex_t *cpc_mutex[NPC_MUTEX]; 230 231 extern page_t *page_get_mnode_freelist(int, uint_t, int, uchar_t, uint_t); 232 extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 233 234 /* Find the bin for the given page if it was of size szc */ 235 #define PP_2_BIN_SZC(pp, szc) \ 236 (((pp->p_pagenum) & page_colors_mask) >> \ 237 (hw_page_array[szc].hp_shift - hw_page_array[0].hp_shift)) 238 239 #define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc)) 240 241 #define PP_2_MEM_NODE(pp) (PFN_2_MEM_NODE(pp->p_pagenum)) 242 #define PP_2_MTYPE(pp) (pfn_2_mtype(pp->p_pagenum)) 243 #define PP_2_SZC(pp) (pp->p_szc) 244 245 #define SZCPAGES(szc) (1 << PAGE_BSZS_SHIFT(szc)) 246 #define PFN_BASE(pfnum, szc) (pfnum & ~(SZCPAGES(szc) - 1)) 247 248 #if defined(__amd64) 249 250 /* 251 * set the mtype range (called from page_get_{free,cache}list) 252 * - set range to above 4g if the system has more than 4g of memory and the 253 * amount of memory below 4g runs low otherwise set range to all of memory 254 * starting from the hi pfns. 255 * 256 * page_get_anylist gets its mtype range from the specified ddi_dma_attr_t. 257 */ 258 #define MTYPE_INIT(mtype, vp, vaddr, flags) { \ 259 mtype = mnoderangecnt - 1; \ 260 if (RESTRICT4G_ALLOC) { \ 261 VM_STAT_ADD(vmm_vmstats.restrict4gcnt); \ 262 /* here only for > 4g systems */ \ 263 flags |= PGI_MT_RANGE4G; \ 264 } else { \ 265 flags |= PGI_MT_RANGE0; \ 266 } \ 267 } 268 269 #elif defined(__i386) 270 271 /* 272 * set the mtype range 273 * - kmem requests needs to be below 4g if restricted_kmemalloc is set. 274 * - for non kmem requests, set range to above 4g if the amount of memory 275 * below 4g runs low. 276 */ 277 278 #define MTYPE_INIT(mtype, vp, vaddr, flags) { \ 279 if (restricted_kmemalloc && (vp) == &kvp && \ 280 (caddr_t)(vaddr) >= kernelheap && \ 281 (caddr_t)(vaddr) < ekernelheap) { \ 282 ASSERT(physmax4g); \ 283 mtype = mtype4g; \ 284 flags |= PGI_MT_RANGE0; \ 285 } else { \ 286 mtype = mnoderangecnt - 1; \ 287 if (RESTRICT4G_ALLOC) { \ 288 VM_STAT_ADD(vmm_vmstats.restrict4gcnt); \ 289 /* here only for > 4g systems */ \ 290 flags |= PGI_MT_RANGE4G; \ 291 } else { \ 292 flags |= PGI_MT_RANGE0; \ 293 } \ 294 } \ 295 } 296 297 #endif /* __i386 */ 298 299 /* 300 * macros to loop through the mtype range (page_get_mnode_{free,cache,any}list, 301 * and page_get_contig_pages) 302 * 303 * MTYPE_START sets the initial mtype. -1 if the mtype range specified does 304 * not contain mnode. 305 * 306 * MTYPE_NEXT sets the next mtype. -1 if there are no more valid 307 * mtype in the range. 308 */ 309 310 #define MTYPE_START(mnode, mtype, flags) \ 311 (mtype = mtype_func(mnode, mtype, flags)) 312 313 #define MTYPE_NEXT(mnode, mtype, flags) \ 314 (mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT)) 315 316 /* mtype init for page_get_replacement_page */ 317 318 #define MTYPE_PGR_INIT(mtype, flags, pp, mnode) { \ 319 mtype = mnoderangecnt - 1; \ 320 flags |= PGI_MT_RANGE0; \ 321 } 322 323 #define MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) \ 324 ASSERT(mnoderanges[mtype].mnr_mnode == mnode); \ 325 pfnlo = mnoderanges[mtype].mnr_pfnlo; \ 326 pfnhi = mnoderanges[mtype].mnr_pfnhi; 327 328 #define PC_BIN_MUTEX(mnode, bin, flags) ((flags & PG_FREE_LIST) ? \ 329 &fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] : \ 330 &cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode]) 331 332 #define FPC_MUTEX(mnode, i) (&fpc_mutex[i][mnode]) 333 #define CPC_MUTEX(mnode, i) (&cpc_mutex[i][mnode]) 334 335 #ifdef DEBUG 336 #define CHK_LPG(pp, szc) chk_lpg(pp, szc) 337 extern void chk_lpg(page_t *, uchar_t); 338 #else 339 #define CHK_LPG(pp, szc) 340 #endif 341 342 #define FULL_REGION_CNT(rg_szc) \ 343 (LEVEL_SIZE(rg_szc) >> LEVEL_SHIFT(rg_szc - 1)) 344 345 /* Return the leader for this mapping size */ 346 #define PP_GROUPLEADER(pp, szc) \ 347 (&(pp)[-(int)((pp)->p_pagenum & (SZCPAGES(szc)-1))]) 348 349 /* Return the root page for this page based on p_szc */ 350 #define PP_PAGEROOT(pp) ((pp)->p_szc == 0 ? (pp) : \ 351 PP_GROUPLEADER((pp), (pp)->p_szc)) 352 353 /* 354 * The counter base must be per page_counter element to prevent 355 * races when re-indexing, and the base page size element should 356 * be aligned on a boundary of the given region size. 357 * 358 * We also round up the number of pages spanned by the counters 359 * for a given region to PC_BASE_ALIGN in certain situations to simplify 360 * the coding for some non-performance critical routines. 361 */ 362 363 #define PC_BASE_ALIGN ((pfn_t)1 << PAGE_BSZS_SHIFT(MMU_PAGE_SIZES-1)) 364 #define PC_BASE_ALIGN_MASK (PC_BASE_ALIGN - 1) 365 366 /* 367 * cpu/mmu-dependent vm variables 368 */ 369 extern uint_t mmu_page_sizes; 370 extern uint_t mmu_exported_page_sizes; 371 372 /* For x86, userszc is the same as the kernel's szc */ 373 #define USERSZC_2_SZC(userszc) (userszc) 374 #define SZC_2_USERSZC(szc) (szc) 375 376 /* 377 * for hw_page_map_t, sized to hold the ratio of large page to base 378 * pagesize (1024 max) 379 */ 380 typedef short hpmctr_t; 381 382 /* 383 * get the setsize of the current cpu - assume homogenous for x86 384 */ 385 extern int l2cache_sz, l2cache_linesz, l2cache_assoc; 386 387 #define L2CACHE_ALIGN l2cache_linesz 388 #define CPUSETSIZE() \ 389 (l2cache_assoc ? (l2cache_sz / l2cache_assoc) : MMU_PAGESIZE) 390 391 /* 392 * Return the log2(pagesize(szc) / MMU_PAGESIZE) --- or the shift count 393 * for the number of base pages in this pagesize 394 */ 395 #define PAGE_BSZS_SHIFT(szc) (LEVEL_SHIFT(szc) - MMU_PAGESHIFT) 396 397 /* 398 * Internal PG_ flags. 399 */ 400 #define PGI_RELOCONLY 0x010000 /* opposite of PG_NORELOC */ 401 #define PGI_NOCAGE 0x020000 /* cage is disabled */ 402 #define PGI_PGCPHIPRI 0x040000 /* page_get_contig_page pri alloc */ 403 #define PGI_PGCPSZC0 0x080000 /* relocate base pagesize page */ 404 405 /* 406 * PGI range flags - should not overlap PGI flags 407 */ 408 #define PGI_MT_RANGE0 0x1000000 /* mtype range to 0 */ 409 #define PGI_MT_RANGE4G 0x2000000 /* mtype range to 4g */ 410 #define PGI_MT_NEXT 0x4000000 /* get next mtype */ 411 #define PGI_MT_RANGE (PGI_MT_RANGE0 | PGI_MT_RANGE4G) 412 413 /* 414 * hash as and addr to get a bin. 415 */ 416 417 #define AS_2_BIN(as, seg, vp, addr, bin) \ 418 bin = ((((uintptr_t)(addr) >> PAGESHIFT) + ((uintptr_t)(as) >> 4)) \ 419 & page_colors_mask) 420 421 /* 422 * When a bin is empty, and we can't satisfy a color request correctly, 423 * we scan. If we assume that the programs have reasonable spatial 424 * behavior, then it will not be a good idea to use the adjacent color. 425 * Using the adjacent color would result in virtually adjacent addresses 426 * mapping into the same spot in the cache. So, if we stumble across 427 * an empty bin, skip a bunch before looking. After the first skip, 428 * then just look one bin at a time so we don't miss our cache on 429 * every look. Be sure to check every bin. Page_create() will panic 430 * if we miss a page. 431 * 432 * This also explains the `<=' in the for loops in both page_get_freelist() 433 * and page_get_cachelist(). Since we checked the target bin, skipped 434 * a bunch, then continued one a time, we wind up checking the target bin 435 * twice to make sure we get all of them bins. 436 */ 437 #define BIN_STEP 19 438 439 #ifdef VM_STATS 440 struct vmm_vmstats_str { 441 ulong_t pc_list_add_pages[MMU_PAGE_SIZES]; 442 ulong_t pc_list_sub_pages1[MMU_PAGE_SIZES]; 443 ulong_t pc_list_sub_pages2[MMU_PAGE_SIZES]; 444 ulong_t pc_list_sub_pages3[MMU_PAGE_SIZES]; 445 ulong_t pgf_alloc[MMU_PAGE_SIZES]; 446 ulong_t pgf_allocok[MMU_PAGE_SIZES]; 447 ulong_t pgf_allocokrem[MMU_PAGE_SIZES]; 448 ulong_t pgf_allocfailed[MMU_PAGE_SIZES]; 449 ulong_t pgf_allocdeferred; 450 ulong_t pgf_allocretry[MMU_PAGE_SIZES]; 451 ulong_t pgc_alloc; 452 ulong_t pgc_allocok; 453 ulong_t pgc_allocokrem; 454 ulong_t pgc_allocokdeferred; 455 ulong_t pgc_allocfailed; 456 ulong_t pgcp_alloc[MMU_PAGE_SIZES]; 457 ulong_t pgcp_allocfailed[MMU_PAGE_SIZES]; 458 ulong_t pgcp_allocempty[MMU_PAGE_SIZES]; 459 ulong_t pgcp_allocok[MMU_PAGE_SIZES]; 460 ulong_t ptcp[MMU_PAGE_SIZES]; 461 ulong_t ptcpfreethresh[MMU_PAGE_SIZES]; 462 ulong_t ptcpfailexcl[MMU_PAGE_SIZES]; 463 ulong_t ptcpfailszc[MMU_PAGE_SIZES]; 464 ulong_t ptcpfailcage[MMU_PAGE_SIZES]; 465 ulong_t ptcpok[MMU_PAGE_SIZES]; 466 ulong_t pgmf_alloc[MMU_PAGE_SIZES]; 467 ulong_t pgmf_allocfailed[MMU_PAGE_SIZES]; 468 ulong_t pgmf_allocempty[MMU_PAGE_SIZES]; 469 ulong_t pgmf_allocok[MMU_PAGE_SIZES]; 470 ulong_t pgmc_alloc; 471 ulong_t pgmc_allocfailed; 472 ulong_t pgmc_allocempty; 473 ulong_t pgmc_allocok; 474 ulong_t ppr_reloc[MMU_PAGE_SIZES]; 475 ulong_t ppr_relocnoroot[MMU_PAGE_SIZES]; 476 ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES]; 477 ulong_t ppr_relocnolock[MMU_PAGE_SIZES]; 478 ulong_t ppr_relocnomem[MMU_PAGE_SIZES]; 479 ulong_t ppr_relocok[MMU_PAGE_SIZES]; 480 ulong_t page_ctrs_coalesce; /* page coalesce counter */ 481 ulong_t page_ctrs_cands_skip; /* candidates useful */ 482 ulong_t page_ctrs_changed; /* ctrs changed after locking */ 483 ulong_t page_ctrs_failed; /* page_freelist_coalesce failed */ 484 ulong_t page_ctrs_coalesce_all; /* page coalesce all counter */ 485 ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */ 486 ulong_t restrict4gcnt; 487 }; 488 extern struct vmm_vmstats_str vmm_vmstats; 489 #endif /* VM_STATS */ 490 491 extern size_t page_ctrs_sz(void); 492 extern caddr_t page_ctrs_alloc(caddr_t); 493 extern void page_ctr_sub(page_t *, int); 494 extern page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t); 495 extern uint_t page_get_pagecolors(uint_t); 496 497 #ifdef __cplusplus 498 } 499 #endif 500 501 #endif /* _VM_DEP_H */ 502