/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2016 Joyent, Inc.
 * Copyright 2022 Garrett D'Amore <garrett@damore.org>
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>
#include <sys/secflags.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <vm/page.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring.
 */
int do_pg_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/* cpu specific coloring initialization */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu

/*
 * get the ecache setsize for the current cpu.
 */
#define CPUSETSIZE()    (cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t plcnt;          /* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
    struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
    caddr_t, size_t, uint_t, struct lgrp *);

/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
        return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
        /* We must be IO space */
        if (pf > physmax)
                return (0);

        /* We must be memory space */
        return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
        struct as *as;
        struct proc *p;
        faultcode_t res;
        caddr_t base;
        size_t len;
        int err;

        if (INVALID_VADDR(addr))
                return (FC_NOMAP);

        if (iskernel) {
                as = &kas;
        } else {
                p = curproc;
                as = p->p_as;
#if defined(SF_ERRATA_57)
                /*
                 * Prevent infinite loops due to a segment driver
                 * setting the execute permissions and the sfmmu hat
                 * silently ignoring them.
                 */
                if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
                    addr < errata57_limit) {
                        res = FC_NOMAP;
                        goto out;
                }
#endif
        }

        /*
         * Dispatch pagefault.
         */
        res = as_fault(as->a_hat, as, addr, 1, type, rw);

        /*
         * If this isn't a potential unmapped hole in the user's
         * UNIX data or stack segments, just return status info.
         */
        if (!(res == FC_NOMAP && iskernel == 0))
                goto out;

        /*
         * Check to see if we happened to fault on a currently unmapped
         * part of the UNIX data or stack segments.  If so, create a zfod
         * mapping there and then try calling the fault routine again.
         */
        base = p->p_brkbase;
        len = p->p_brksize;

        if (addr < base || addr >= base + len) {        /* data seg? */
                base = (caddr_t)(p->p_usrstack - p->p_stksize);
                len = p->p_stksize;
                if (addr < base || addr >= p->p_usrstack) {     /* stack seg? */
                        /* not in either UNIX data or stack segments */
                        res = FC_NOMAP;
                        goto out;
                }
        }

        /* The rest of this function implements 3.X/4.X/5.X compatibility. */
        /* This code is probably not needed anymore. */

        /* expand the gap to the page boundaries on each side */
        len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
            ((uintptr_t)base & PAGEMASK);
        base = (caddr_t)((uintptr_t)base & PAGEMASK);

        as_rangelock(as);
        as_purge(as);
        if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
                err = as_map(as, base, len, segvn_create, zfod_argsp);
                as_rangeunlock(as);
                if (err) {
                        res = FC_MAKE_ERR(err);
                        goto out;
                }
        } else {
                /*
                 * Another thread mapped this page after we returned
                 * from as_fault() above.  We just fall through to
                 * as_fault() below.
                 */
                as_rangeunlock(as);
        }

        res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

        return (res);
}

/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
        struct proc *p = curproc;
        caddr_t userlimit = flags & _MAP_LOW32 ?
            (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
        map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
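 * For example, UltraSPARC I/II implement 44-bit virtual addresses, so the
 * span between roughly 2^43 and 2^64 - 2^43 cannot be mapped; hole_start
 * and hole_end below bracket whatever hole the running platform actually
 * has (the bounds quoted here are illustrative, not authoritative).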
 */
caddr_t hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

int valid_va_range_aligned_wraparound;
/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long, where the base of the range is at "off"
 * phase from an "align" boundary and there is space for a "redzone"-sized
 * redzone on either side of the range.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range (including
 * the redzone).  On failure, 0 is returned.
 */
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    size_t align, size_t redzone, size_t off)
{
        caddr_t hi, lo;
        size_t tot_len;

        ASSERT(align == 0 ? off == 0 : off < align);
        ASSERT(ISP2(align));
        ASSERT(align == 0 || align >= PAGESIZE);

        lo = *basep;
        hi = lo + *lenp;
        tot_len = minlen + 2 * redzone; /* need at least this much space */

        /* If hi rolled over the top try cutting back. */
        if (hi < lo) {
                *lenp = 0UL - (uintptr_t)lo - 1UL;
                /* Trying to see if this really happens, and then if so, why */
                valid_va_range_aligned_wraparound++;
                hi = lo + *lenp;
        }
        if (*lenp < tot_len) {
                return (0);
        }

        /*
         * Deal with a possible hole in the address range between
         * hole_start and hole_end that should never be mapped by the MMU.
         */

        if (lo < hole_start) {
                if (hi > hole_start)
                        if (hi < hole_end)
                                hi = hole_start;
                        else
                                /* lo < hole_start && hi >= hole_end */
                                if (dir == AH_LO) {
                                        /*
                                         * prefer lowest range
                                         */
                                        if (hole_start - lo >= tot_len)
                                                hi = hole_start;
                                        else if (hi - hole_end >= tot_len)
                                                lo = hole_end;
                                        else
                                                return (0);
                                } else {
                                        /*
                                         * prefer highest range
                                         */
                                        if (hi - hole_end >= tot_len)
                                                lo = hole_end;
                                        else if (hole_start - lo >= tot_len)
                                                hi = hole_start;
                                        else
                                                return (0);
                                }
        } else {
                /* lo >= hole_start */
                if (hi < hole_end)
                        return (0);
                if (lo < hole_end)
                        lo = hole_end;
        }

        /* Check if remaining length is too small */
        if (hi - lo < tot_len) {
                return (0);
        }
        if (align > 1) {
                caddr_t tlo = lo + redzone;
                caddr_t thi = hi - redzone;
                tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
                if (tlo < lo + redzone) {
                        return (0);
                }
                if (thi < tlo || thi - tlo < minlen) {
                        return (0);
                }
        }
        *basep = lo;
        *lenp = hi - lo;
        return (1);
}

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
        return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}

/*
 * Default to forbidding the first 64k of address space.  This protects most
 * reasonably sized structures from dereferences through NULL:
 *     ((foo_t *)0)->bar
 */
uintptr_t forbidden_null_mapping_sz = 0x10000;

/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
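 * Returns RANGE_OKAY on success, RANGE_BADADDR if the range lies outside the
 * user address space or overlaps the VA hole, and RANGE_BADPROT if the
 * requested protections are not allowed (see the SF_ERRATA_57 case below).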
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
        caddr_t eaddr = addr + len;

        if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
                return (RANGE_BADADDR);

        if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
            as->a_proc != NULL &&
            secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
                return (RANGE_BADADDR);

        /*
         * Determine if the address range falls within an illegal
         * range of the MMU.
         */
        if (eaddr > hole_start && addr < hole_end)
                return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
        /*
         * Make sure USERLIMIT isn't raised too high
         */
        ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
            errata57_limit == 0);

        if (AS_TYPE_64BIT(as) &&
            (addr < errata57_limit) &&
            (prot & PROT_EXEC))
                return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
        return (RANGE_OKAY);
}

/*
 * Routine used to check whether an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
        if (exp->ux_mach == M_SPARC)
                return (0);
        else
                return (ENOEXEC);
}

/*
 * Return a non-zero value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address such that its offset is equal modulo
 * shm_alignment and assumes it can't be in VAC conflict with any larger
 * than PAGESIZE mapping.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
        if (vac) {
                return (((uintptr_t)addr ^ off) & (shm_alignment - 1));
        } else {
                return (0);
        }
}

/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than privm_lpg_min_physmem or
 * shm_lpg_min_physmem memory installed.
 * The units for these variables are 8K pages.
 */
pgcnt_t shm_lpg_min_physmem = 131072;   /* 1GB */
pgcnt_t privm_lpg_min_physmem = 131072; /* 1GB */

static size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
        size_t pgsz = MMU_PAGESIZE;
        int szc;

        /*
         * If len is zero, retrieve from proc and don't demote the page size.
         * Use at least the default page size.
         */
        if (len == 0) {
                len = p->p_brkbase + p->p_brksize - p->p_bssbase;
        }
        len = MAX(len, default_uheap_lpsize);

        for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
                pgsz = hw_page_array[szc].hp_size;
                if ((disable_auto_data_large_pages & (1 << szc)) ||
                    pgsz > max_uheap_lpsize)
                        continue;
                if (len >= pgsz) {
                        break;
                }
        }

        /*
         * If addr == 0 we were called by memcntl() when the
         * size code is 0.  Don't set pgsz less than current size.
         */
        if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
                pgsz = hw_page_array[p->p_brkpageszc].hp_size;
        }

        return (pgsz);
}

static size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
        size_t pgsz = MMU_PAGESIZE;
        int szc;

        /*
         * If len is zero, retrieve from proc and don't demote the page size.
         * Use at least the default page size.
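         * (This mirrors map_pgszheap() above: the candidate length is seeded
         * from p_stksize instead of the heap bounds and is capped by
         * max_ustack_lpsize rather than max_uheap_lpsize.)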
         */
        if (len == 0) {
                len = p->p_stksize;
        }
        len = MAX(len, default_ustack_lpsize);

        for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
                pgsz = hw_page_array[szc].hp_size;
                if ((disable_auto_data_large_pages & (1 << szc)) ||
                    pgsz > max_ustack_lpsize)
                        continue;
                if (len >= pgsz) {
                        break;
                }
        }

        /*
         * If addr == 0 we were called by memcntl() or exec_args() when the
         * size code is 0.  Don't set pgsz less than current size.
         */
        if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
                pgsz = hw_page_array[p->p_stkpageszc].hp_size;
        }

        return (pgsz);
}

static size_t
map_pgszism(caddr_t addr, size_t len)
{
        uint_t szc;
        size_t pgsz;

        for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
                if (disable_ism_large_pages & (1 << szc))
                        continue;

                pgsz = hw_page_array[szc].hp_size;
                if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
                        return (pgsz);
        }

        return (DEFAULT_ISM_PAGESIZE);
}

/*
 * Suggest a page size to be used to map a segment of type maptype and length
 * len.  Returns a page size (not a size code).
 */
/* ARGSUSED */
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
        size_t pgsz = MMU_PAGESIZE;

        ASSERT(maptype != MAPPGSZ_VA);

        if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
                return (MMU_PAGESIZE);
        }

        switch (maptype) {
        case MAPPGSZ_ISM:
                pgsz = map_pgszism(addr, len);
                break;

        case MAPPGSZ_STK:
                if (max_ustack_lpsize > MMU_PAGESIZE) {
                        pgsz = map_pgszstk(p, addr, len);
                }
                break;

        case MAPPGSZ_HEAP:
                if (max_uheap_lpsize > MMU_PAGESIZE) {
                        pgsz = map_pgszheap(p, addr, len);
                }
                break;
        }
        return (pgsz);
}


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
    size_t max_lpsize, size_t min_physmem)
{
        caddr_t eaddr = addr + size;
        uint_t szcvec = 0;
        caddr_t raddr;
        caddr_t readdr;
        size_t pgsz;
        int i;

        if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
                return (0);
        }
        for (i = mmu_page_sizes - 1; i > 0; i--) {
                if (disable_lpgs & (1 << i)) {
                        continue;
                }
                pgsz = page_get_pagesize(i);
                if (pgsz > max_lpsize) {
                        continue;
                }
                raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
                readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
                if (raddr < addr || raddr >= readdr) {
                        continue;
                }
                if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
                        continue;
                }
                szcvec |= (1 << i);
                /*
                 * And OR in the remaining enabled page sizes.
                 */
                szcvec |= P2PHASE(~disable_lpgs, (1 << i));
                szcvec &= ~1;   /* no need to return 8K pagesize */
                break;
        }
        return (szcvec);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
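 * For example, on sun4u a returned vector of 0x8 (bit TTE4M, size code 3)
 * would mean the region can be mapped with 4M pages but not 64K or 512K
 * pages; bit 0 (8K) is never set since the base page size is always usable.
 * (The size-code numbering quoted here follows the usual TTE8K..TTE4M
 * convention and is meant only as an illustration.)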
 */
/* ARGSUSED */
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
        if (flags & MAP_TEXT) {
                return (map_szcvec(addr, size, off,
                    disable_auto_text_large_pages,
                    max_utext_lpsize, shm_lpg_min_physmem));

        } else if (flags & MAP_INITDATA) {
                return (map_szcvec(addr, size, off,
                    disable_auto_data_large_pages,
                    max_uidata_lpsize, privm_lpg_min_physmem));

        } else if (type == MAPPGSZC_SHM) {
                return (map_szcvec(addr, size, off,
                    disable_auto_data_large_pages,
                    max_shm_lpsize, shm_lpg_min_physmem));

        } else if (type == MAPPGSZC_HEAP) {
                return (map_szcvec(addr, size, off,
                    disable_auto_data_large_pages,
                    max_uheap_lpsize, privm_lpg_min_physmem));

        } else if (type == MAPPGSZC_STACK) {
                return (map_szcvec(addr, size, off,
                    disable_auto_data_large_pages,
                    max_ustack_lpsize, privm_lpg_min_physmem));

        } else {
                return (map_szcvec(addr, size, off,
                    disable_auto_data_large_pages,
                    max_privmap_lpsize, privm_lpg_min_physmem));
        }
}

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax), and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *      page_counters[page_size][region_size]
 *
 *      page_size:      TTE size code of pages on page_size freelist.
 *
 *      region_size:    TTE size code of a candidate larger page made up
 *                      of contiguous free page_size pages.
 *
 * As you go across a page_size row, increasing region_size, each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *      page_counters[1][3] is the table element used for identifying
 *      candidate 4M pages from contiguous pages off the 64K free list.
 *      Each index in the page_counters[1][3].array spans 4M.  It's the
 *      number of free 512K size (region_size - 1) groups of contiguous
 *      64K free pages.  So when page_counters[1][3].counters[n] == 8
 *      we know we have a candidate 4M page made up of 512K size groups
 *      of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
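 * The lists are indexed as page_cachelists[mtype][mnode][color], matching
 * the layout carved out in alloc_page_freelists() below; the freelists add
 * a leading size-code dimension: page_freelists[szc][mtype][mnode][color].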
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

/*
 * Calculate space needed for page freelists and counters
 */
size_t
calc_free_pagelist_sz(void)
{
        int szc;
        size_t alloc_sz, cache_sz, free_sz;

        /*
         * one cachelist per color, node, and type
         */
        cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
            sizeof (page_t **);
        cache_sz *= max_mem_nodes * MAX_MEM_TYPES;

        /*
         * one freelist per size, color, node, and type
         */
        free_sz = sizeof (page_t **);
        for (szc = 0; szc < mmu_page_sizes; szc++)
                free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
        free_sz *= max_mem_nodes * MAX_MEM_TYPES;

        alloc_sz = cache_sz + free_sz + page_ctrs_sz();
        return (alloc_sz);
}

caddr_t
alloc_page_freelists(caddr_t alloc_base)
{
        int mnode, mtype;
        int szc, clrs;

        /*
         * We only support small pages in the cachelist.
         */
        for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
                page_cachelists[mtype] = (page_t ***)alloc_base;
                alloc_base += (max_mem_nodes * sizeof (page_t **));
                for (mnode = 0; mnode < max_mem_nodes; mnode++) {
                        page_cachelists[mtype][mnode] = (page_t **)alloc_base;
                        alloc_base +=
                            (page_get_pagecolors(0) * sizeof (page_t *));
                }
        }

        /*
         * Allocate freelist bins for all
         * supported page sizes.
         */
        for (szc = 0; szc < mmu_page_sizes; szc++) {
                clrs = page_get_pagecolors(szc);
                for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
                        page_freelists[szc][mtype] = (page_t ***)alloc_base;
                        alloc_base += (max_mem_nodes * sizeof (page_t **));
                        for (mnode = 0; mnode < max_mem_nodes; mnode++) {
                                page_freelists[szc][mtype][mnode] =
                                    (page_t **)alloc_base;
                                alloc_base += (clrs * (sizeof (page_t *)));
                        }
                }
        }

        alloc_base = page_ctrs_alloc(alloc_base);
        return (alloc_base);
}

/*
 * Allocate page_freelists locks for a memnode from the nucleus data
 * area.  This is the first time that mmu_page_sizes is used during
 * bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_mutexs(struct memlist *ndata)
{
        size_t alloc_sz;
        caddr_t alloc_base;
        int i;
        void page_coloring_init();

        page_coloring_init();
        if (&mmu_init_mmu_page_sizes) {
                if (!mmu_init_mmu_page_sizes(0)) {
                        cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
                            mmu_page_sizes);
                }
        }
        ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

        /* fpc_mutex and cpc_mutex */
        alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

        alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
        if (alloc_base == NULL)
                return (-1);

        ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

        for (i = 0; i < NPC_MUTEX; i++) {
                fpc_mutex[i] = (kmutex_t *)alloc_base;
                alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
                cpc_mutex[i] = (kmutex_t *)alloc_base;
                alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
        }
        return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
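 * Because 337 is odd it is coprime with any power-of-two number of bins,
 * so successive address spaces walk through every bin before the starting
 * color repeats.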
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
        uint32_t old, new;

        if (consistent_coloring == 2 || color_start_random) {
                return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
                    (hw_page_array[0].hp_colors - 1)));
        }

        do {
                old = color_start_current;
                new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
        } while (atomic_cas_32(&color_start_current, old, new) != old);

        return ((uint_t)(new));
}

/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * Initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
        int a, i;
        uint_t colors;

        if (do_pg_coloring == 0) {
                page_colors = 1;
                for (i = 0; i < mmu_page_sizes; i++) {
                        colorequivszc[i] = 0;
                        hw_page_array[i].hp_colors = 1;
                }
                return;
        }

        /*
         * Calculate page_colors from ecache_setsize.  ecache_setsize contains
         * the max ecache setsize of all cpus configured in the system or, for
         * cheetah+ systems, the max possible ecache setsize for all possible
         * cheetah+ cpus.
         */
        page_colors = ecache_setsize / MMU_PAGESIZE;
        page_colors_mask = page_colors - 1;

        vac_colors = vac_size / MMU_PAGESIZE;
        vac_colors_mask = vac_colors - 1;

        page_coloring_shift = 0;
        a = ecache_setsize;
        while (a >>= 1) {
                page_coloring_shift++;
        }

        /* initialize number of colors per page size */
        for (i = 0; i < mmu_page_sizes; i++) {
                hw_page_array[i].hp_colors = (page_colors_mask >>
                    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
                    + 1;
                colorequivszc[i] = 0;
        }

        /*
         * Initialize cpu_page_colors if ecache setsizes are homogeneous.
         * cpu_page_colors is set to -1 during DR operation or during startup
         * if setsizes are heterogeneous.
         *
         * The value of cpu_page_colors determines if additional color bins
         * need to be checked for a particular color in the page_get routines.
         */
        if (cpu_setsize > 0 && cpu_page_colors == 0 &&
            cpu_setsize < ecache_setsize) {
                cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
                a = lowbit(page_colors) - lowbit(cpu_page_colors);
                ASSERT(a > 0);
                ASSERT(a < 16);

                for (i = 0; i < mmu_page_sizes; i++) {
                        if ((colors = hw_page_array[i].hp_colors) <= 1) {
                                continue;
                        }
                        while ((colors >> a) == 0)
                                a--;
                        ASSERT(a >= 0);

                        /* higher 4 bits encode the color equiv mask */
                        colorequivszc[i] = (a << 4);
                }
        }

        /* do cpu specific color initialization */
        if (&page_coloring_init_cpu) {
                page_coloring_init_cpu();
        }
}

int
bp_color(struct buf *bp)
{
        int color = -1;

        if (vac) {
                if ((bp->b_flags & B_PAGEIO) != 0) {
                        color = sfmmu_get_ppvcolor(bp->b_pages);
                } else if (bp->b_un.b_addr != NULL) {
                        color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
                }
        }
        return (color < 0 ? 0 : ptob(color));
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
        sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
        if (va1 < va2 && va1 + sz1 <= va2)
                return (0);

        if (va2 < va1 && va2 + sz2 <= va1)
                return (0);

        return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
        /* OBP reads are harmless, but we don't want people writing there */
        if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
            OFW_START_ADDR + 1))
                return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

        if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
                return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

        return (sz);    /* no overlap */
}

/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;       /* 1GB */

/*
 * This function chooses the large page size for the kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
        size_t memtotal = physmem * PAGESIZE;
        size_t mmusz;
        uint_t szc;

        if (memtotal < segkmem_lpminphysmem)
                return (PAGESIZE);

        if (plat_lpkmem_is_supported != NULL &&
            plat_lpkmem_is_supported() == 0)
                return (PAGESIZE);

        mmusz = mmu_get_kernel_lpsize(lpsize);
        szc = page_szc(mmusz);

        while (szc) {
                if (!(disable_large_pages & (1 << szc)))
                        return (page_get_pagesize(szc));
                szc--;
        }
        return (PAGESIZE);
}