/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <vm/page.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring
 * and/or virtual cache page coloring.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring or do_virtual_coloring from being enabled by
 * module specific config routines.
 */
int use_page_coloring = 1;
int use_virtual_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/* cpu specific coloring initialization */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t	plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);

/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical addresses above physmax are in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/* the rest of this function implements 3.X/4.X/5.X compatibility */
	/* This code is probably not needed anymore */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	as_purge(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page is already mapped by another thread after we
		 * returned from as_fault() above.  We just fall through to
		 * as_fault() below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

	return (res);
}

/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

/*
 * Determine whether [base, base+len] contains a mappable range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a mappable range.
 */
/* ARGSUSED */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	caddr_t hi, lo;

	lo = *basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		size_t newlen = 0 - (uintptr_t)lo - 1l;

		if (newlen + (uintptr_t)hi < minlen)
			return (0);
		if (newlen < minlen)
			return (0);
		*lenp = newlen;
	} else if (hi - lo < minlen)
		return (0);

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */
	hi = lo + *lenp;

	if (lo < hole_start) {
		if (hi > hole_start)
			if (hi < hole_end)
				hi = hole_start;
			else
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = lo;
	*lenp = hi - lo;

	return (1);
}

/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}

/*
 * Routine used to check whether an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}

void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}

#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)		\
	for ((n) = (upper); (n) > (lower); (n)--) {		\
		if (disable_auto_large_pages & (1 << (n)))	\
			continue;				\
		if (hw_page_array[(n)].hp_size <= (len)) {	\
			(pgsz) = hw_page_array[(n)].hp_size;	\
			break;					\
		}						\
	}


/*ARGSUSED*/
static size_t
map_pgszva(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz = MMU_PAGESIZE;
	int n, upper;

	/*
	 * Select the best fit page size within the constraints of
	 * auto_lpg_{min,max}szc.
	 *
	 * Note that we also take the heap size into account when
	 * deciding if we've crossed the threshold at which we should
	 * increase the page size.  This isn't perfect since the heap
	 * may not have reached its full size yet, but it's better than
	 * not considering it at all.
	 */
	len += p->p_brksize;
	if (ptob(auto_lpg_tlb_threshold) <= len) {

		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);

		/*
		 * Use auto_lpg_minszc - 1 as the limit so we never drop
		 * below auto_lpg_minszc.  We don't have a size code to refer
		 * to like we have for bss and stack, so we assume 0.
		 * auto_lpg_minszc should always be >= 0.  Passing
		 * auto_lpg_minszc itself as the lower bound would cut the
		 * loop off before that size code is considered.
		 */
		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
	}

	return (pgsz);
}
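
/*
 * Note on MAP_PGSZ_COMMON above: it walks the hardware page size codes from
 * `upper' down to (but not including) `lower', skips any size disabled in
 * disable_auto_large_pages, and picks the first page size that is no larger
 * than `len'.  As a purely illustrative example, assuming the usual sun4u
 * sizes of 8K, 64K, 512K and 4M with nothing disabled, a len of 5 MB selects
 * 4M, while a len of 600 KB selects 512K.  The actual candidate set comes
 * from hw_page_array and the disable_* tunables on the running system.
 */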

static size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_brksize;
	}

	/*
	 * Still zero? Then we don't have a heap yet, so pick the default
	 * heap size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_heap_default;
	} else {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}

static size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}

	/*
	 * Still zero? Then we don't have a stack yet, so pick the default
	 * stack size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_stack_default;
	} else {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}

static size_t
map_pgszism(caddr_t addr, size_t len)
{
	uint_t szc;
	size_t pgsz;
	extern int disable_ism_large_pages;

	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
		if (disable_ism_large_pages & (1 << szc))
			continue;

		pgsz = hw_page_array[szc].hp_size;
		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
			return (pgsz);
	}
	return (DEFAULT_ISM_PAGESIZE);
}

/*
 * Suggest a page size to be used to map a segment of type maptype and length
 * len.  Returns a page size (not a size code).
 * If remap is non-NULL, fill in a value suggesting whether or not to remap
 * this segment.
 */
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	size_t pgsz = 0;

	if (remap != NULL)
		*remap = (len > auto_lpg_remap_threshold);

	switch (maptype) {
	case MAPPGSZ_ISM:
		pgsz = map_pgszism(addr, len);
		break;

	case MAPPGSZ_VA:
		pgsz = map_pgszva(p, addr, len);
		break;

	case MAPPGSZ_STK:
		pgsz = map_pgszstk(p, addr, len);
		break;

	case MAPPGSZ_HEAP:
		pgsz = map_pgszheap(p, addr, len);
		break;
	}
	return (pgsz);
}

/*
 * Return a non-zero value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address such that its offset is equal modulo
 * shm_alignment, and assumes it can't be in VAC conflict with any mapping
 * larger than PAGESIZE.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	if (vac) {
		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
	} else {
		return (0);
	}
}
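
/*
 * A worked example of the check above (the values are hypothetical): with a
 * shm_alignment of 64K, addr 0x28000 and off 0x8000 give
 * (0x28000 ^ 0x8000) & 0xffff == 0, so no alias is possible.  With the same
 * off, addr 0x20000 gives (0x20000 ^ 0x8000) & 0xffff == 0x8000, a non-zero
 * result, meaning that address may cause a VAC alias with the kpm mapping
 * of that offset and the caller should pick a different address.
 */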

/*
 * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
 * can be set in platform or CPU specific code but the user can change the
 * default values via /etc/system.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int use_text_pgsz64k;
extern int use_text_pgsz4m;
extern int use_initdata_pgsz64k;

/*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set in
 * platform or CPU specific code to disable page sizes that should not be
 * used.  These variables normally shouldn't be changed via /etc/system.  A
 * particular page size for text or initialized data will be used by default
 * only if the corresponding use_* variable is set to 1 AND that page size is
 * not disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int disable_text_largepages;
extern int disable_initdata_largepages;

/*
 * Minimum segment size thresholds that must be reached before 64K or 4M
 * large pages are used to map a segment.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern size_t text_pgsz64k_minsize;
extern size_t text_pgsz4m_minsize;
extern size_t initdata_pgsz64k_minsize;

/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than execseg_lpg_min_physmem memory installed.
 * The units for this variable are 8K pages.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */

extern int disable_shm_large_pages;
pgcnt_t shm_lpg_min_physmem = 131072;			/* 1GB */
extern size_t max_shm_lpsize;


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_text_pgsz4m(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < text_pgsz4m_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (0);
	}

	return (1 << TTE4M);
}

static uint_t
map_text_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;
	size_t svlen = len;

	if (len < text_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	if (!use_text_pgsz4m ||
	    disable_text_largepages & (1 << TTE4M)) {
		return (1 << TTE64K);
	}
	if (svlen < text_pgsz4m_minsize) {
		return (1 << TTE64K);
	}
	addr = a;
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (1 << TTE64K);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (1 << TTE64K);
	}
	return ((1 << TTE4M) | (1 << TTE64K));
}

static uint_t
map_initdata_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < initdata_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	return (1 << TTE64K);
}
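
/*
 * The helpers above return a bit vector of TTE size codes rather than a
 * single size: map_text_pgsz64k() can return 0, (1 << TTE64K), or
 * (1 << TTE64K) | (1 << TTE4M), depending on how much of [addr, addr + len)
 * remains after rounding addr up to the 64K and 4M boundaries.
 */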

/*
 * Return a bit vector of large page size codes that
 * can be used to map the [addr, addr + len) region.
 */
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	uint_t ret = 0;

	if (physmem < execseg_lpg_min_physmem) {
		return (0);
	}

	if (text) {
		if (use_text_pgsz64k &&
		    !(disable_text_largepages & (1 << TTE64K))) {
			ret = map_text_pgsz64k(addr, len);
		} else if (use_text_pgsz4m &&
		    !(disable_text_largepages & (1 << TTE4M))) {
			ret = map_text_pgsz4m(addr, len);
		}
	} else if (use_initdata_pgsz64k &&
	    !(disable_initdata_largepages & (1 << TTE64K))) {
		ret = map_initdata_pgsz64k(addr, len);
	}

	return (ret);
}

uint_t
map_shm_pgszcvec(caddr_t addr, size_t size, uintptr_t off)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	int i;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;

	if (physmem < shm_lpg_min_physmem || mmu_page_sizes <= 1 ||
	    max_shm_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_page_sizes - 1; i > 0; i--) {
		if (disable_shm_large_pages & (1 << i)) {
			continue;
		}
		pgsz = page_get_pagesize(i);
		if (pgsz > max_shm_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		szcvec |= (1 << i);
		/*
		 * Also OR in the remaining enabled page sizes.
		 */
		szcvec |= P2PHASE(~disable_shm_large_pages, (1 << i));
		szcvec &= ~1;	/* no need to return 8K pagesize */
		break;
	}
	return (szcvec);
}

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.  So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
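
/*
 * A full lookup therefore takes the form
 * page_freelists[szc][mtype][mnode][color_bin]: the first two dimensions are
 * static, while the per-mnode and per-color arrays are laid out at startup
 * by alloc_page_freelists() below.
 */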

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

caddr_t
alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
{
	int	mtype;
	uint_t	szc;

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
		alloc_base += (sizeof (page_t *) * page_get_pagecolors(0));
		/*
		 * Allocate freelist bins for all
		 * supported page sizes.
		 */
		for (szc = 0; szc < mmu_page_sizes; szc++) {
			page_freelists[szc][mtype][mnode] =
			    (page_t **)alloc_base;
			alloc_base += ((sizeof (page_t *) *
			    page_get_pagecolors(szc)));
		}
	}

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	return (alloc_base);
}

/*
 * Allocate page_freelists bin headers for a memnode from the
 * nucleus data area.  This is the first time that mmu_page_sizes is
 * used during sun4u bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
{
	size_t alloc_sz;
	caddr_t alloc_base;
	caddr_t end;
	int	mtype;
	uint_t	szc;
	int32_t allp = 0;

	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(allp)) {
			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
			    mmu_page_sizes);
		}
	}
	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

	/* first time called - allocate max_mem_nodes dimension */
	if (mnode == 0) {
		int	i;

		/* page_cachelists */
		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
		    sizeof (page_t **);

		/* page_freelists */
		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
		    sizeof (page_t **);

		/* fpc_mutex and cpc_mutex */
		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
		if (alloc_base == NULL)
			return (-1);

		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_cachelists[mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (szc = 0; szc < mmu_page_sizes; szc++) {
				page_freelists[szc][mtype] =
				    (page_t ***)alloc_base;
				alloc_base += (max_mem_nodes *
				    sizeof (page_t **));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			fpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
			cpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
		}
		alloc_sz = 0;
	}
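
	/*
	 * The per-mnode size computed below must mirror the layout that
	 * alloc_page_freelists() lays down: one cachelist bin array per
	 * memory type plus one freelist bin array per (page size, memory
	 * type) pair.  The ASSERT at the end of this function verifies
	 * that the two stay in sync.
	 */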

	/*
	 * Calculate the size needed by alloc_page_freelists().
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		alloc_sz += sizeof (page_t *) * page_get_pagecolors(0);

		for (szc = 0; szc < mmu_page_sizes; szc++)
			alloc_sz += sizeof (page_t *) *
			    page_get_pagecolors(szc);
	}

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
	    ecache_alignsize));

	return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint32_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    (hw_page_array[0].hp_colors - 1)));
	}

	do {
		old = color_start_current;
		new = old +
		    (color_start_stride << (vac_shift - MMU_PAGESHIFT));
	} while (cas32(&color_start_current, old, new) != old);

	return ((uint_t)(new));
}

/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int	a, i;
	uint_t colors;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		for (i = 0; i < mmu_page_sizes; i++)
			hw_page_array[i].hp_colors = 1;
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize.  ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}

	/* initialize number of colors per page size */
	for (i = 0; i < mmu_page_sizes; i++) {
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
	}
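
	/*
	 * For example (hypothetical values): an 8 MB direct-mapped ecache
	 * gives an ecache_setsize of 8 MB, so page_colors is
	 * 8 MB / 8 KB = 1024 and page_colors_mask is 1023.  A 64K page
	 * (hp_shift 16 vs. 13 for 8K) then gets (1023 >> 3) + 1 = 128
	 * colors.
	 */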

	/*
	 * Initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors is set to -1 during DR operation or during startup
	 * if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize)) {

		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
		a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		for (i = 0; i < mmu_page_sizes; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				colorequivszc[i] = 0;
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encode the color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* factor in colorequiv to check additional 'equivalent' bins. */
	if (colorequiv > 1 && &page_coloring_init_cpu == NULL) {

		a = lowbit(colorequiv) - 1;

		if (a > 15)
			a = 15;

		for (i = 0; i < mmu_page_sizes; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}

	/* do cpu specific color initialization */
	if (&page_coloring_init_cpu) {
		page_coloring_init_cpu();
	}
}

int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if ((bp->b_flags & B_PAGEIO) != 0) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}
	return (color < 0 ? 0 : ptob(color));
}

/*
 * Create and initialize the pageout scanner thread.  The thread runs
 * routine `procedure' in process pp at priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);

	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);

	return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

	return (sz);	/* no overlap */
}

/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable large pages for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * This function chooses the large page size for the kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;
	size_t mmusz;
	uint_t szc;
	extern int disable_large_pages;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	mmusz = mmu_get_kernel_lpsize(lpsize);
	szc = page_szc(mmusz);

	while (szc) {
		if (!(disable_large_pages & (1 << szc)))
			return (page_get_pagesize(szc));
		szc--;
	}
	return (PAGESIZE);
}