/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring
 * and/or virtual cache page coloring.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring or do_virtual_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;
int use_virtual_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
    struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
    caddr_t, size_t, uint_t, struct lgrp *);

/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
        return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
        /* We must be IO space */
        if (pf > physmax)
                return (0);

        /* We must be memory space */
        return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
        struct as *as;
        struct proc *p;
        faultcode_t res;
        caddr_t base;
        size_t len;
        int err;

        if (INVALID_VADDR(addr))
                return (FC_NOMAP);

        if (iskernel) {
                as = &kas;
        } else {
                p = curproc;
                as = p->p_as;
#if defined(SF_ERRATA_57)
                /*
                 * Prevent infinite loops due to a segment driver
                 * setting the execute permissions and the sfmmu hat
                 * silently ignoring them.
                 */
                if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
                    addr < errata57_limit) {
                        res = FC_NOMAP;
                        goto out;
                }
#endif
        }

        /*
         * Dispatch pagefault.
         */
        res = as_fault(as->a_hat, as, addr, 1, type, rw);

        /*
         * If this isn't a potential unmapped hole in the user's
         * UNIX data or stack segments, just return status info.
         */
        if (!(res == FC_NOMAP && iskernel == 0))
                goto out;

        /*
         * Check to see if we happened to fault on a currently unmapped
         * part of the UNIX data or stack segments.  If so, create a zfod
         * mapping there and then try calling the fault routine again.
         */
        base = p->p_brkbase;
        len = p->p_brksize;

        if (addr < base || addr >= base + len) {		/* data seg? */
                base = (caddr_t)(p->p_usrstack - p->p_stksize);
                len = p->p_stksize;
                if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
                        /* not in either UNIX data or stack segments */
                        res = FC_NOMAP;
                        goto out;
                }
        }

        /* the rest of this function implements 3.X 4.X 5.X compatibility */
        /* This code is probably not needed anymore */

        /* expand the gap to the page boundaries on each side */
        len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
            ((uintptr_t)base & PAGEMASK);
        base = (caddr_t)((uintptr_t)base & PAGEMASK);

        as_rangelock(as);
        as_purge(as);
        if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
                err = as_map(as, base, len, segvn_create, zfod_argsp);
                as_rangeunlock(as);
                if (err) {
                        res = FC_MAKE_ERR(err);
                        goto out;
                }
        } else {
                /*
                 * This page is already mapped by another thread after we
                 * returned from as_fault() above.  We just fall through to
                 * as_fault() below.
                 */
                as_rangeunlock(as);
        }

        res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

        return (res);
}

/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
        struct proc *p = curproc;
        caddr_t userlimit = flags & _MAP_LOW32 ?
            (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
        map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

/*
 * Determine whether [base, base+len] contains a mappable range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a mappable range.
 */
/* ARGSUSED */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
        caddr_t hi, lo;

        lo = *basep;
        hi = lo + *lenp;

        /*
         * If hi rolled over the top, try cutting back.
         */
        if (hi < lo) {
                size_t newlen = 0 - (uintptr_t)lo - 1l;

                if (newlen + (uintptr_t)hi < minlen)
                        return (0);
                if (newlen < minlen)
                        return (0);
                *lenp = newlen;
        } else if (hi - lo < minlen)
                return (0);

        /*
         * Deal with a possible hole in the address range between
         * hole_start and hole_end that should never be mapped by the MMU.
         */
        hi = lo + *lenp;

        if (lo < hole_start) {
                if (hi > hole_start)
                        if (hi < hole_end)
                                hi = hole_start;
                        else
                                /* lo < hole_start && hi >= hole_end */
                                if (dir == AH_LO) {
                                        /*
                                         * prefer lowest range
                                         */
                                        if (hole_start - lo >= minlen)
                                                hi = hole_start;
                                        else if (hi - hole_end >= minlen)
                                                lo = hole_end;
                                        else
                                                return (0);
                                } else {
                                        /*
                                         * prefer highest range
                                         */
                                        if (hi - hole_end >= minlen)
                                                lo = hole_end;
                                        else if (hole_start - lo >= minlen)
                                                hi = hole_start;
                                        else
                                                return (0);
                                }
        } else {
                /* lo >= hole_start */
                if (hi < hole_end)
                        return (0);
                if (lo < hole_end)
                        lo = hole_end;
        }

        if (hi - lo < minlen)
                return (0);

        *basep = lo;
        *lenp = hi - lo;

        return (1);
}

/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
        caddr_t eaddr = addr + len;

        if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
                return (RANGE_BADADDR);

        /*
         * Determine if the address range falls within an illegal
         * range of the MMU.
         */
        if (eaddr > hole_start && addr < hole_end)
                return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
        /*
         * Make sure USERLIMIT isn't raised too high
         */
        ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
            errata57_limit == 0);

        if (AS_TYPE_64BIT(as) &&
            (addr < errata57_limit) &&
            (prot & PROT_EXEC))
                return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
        return (RANGE_OKAY);
}
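
/*
 * Illustrative sketch (not part of the original source): how the VA-hole
 * clipping in valid_va_range() behaves for a request that straddles the
 * hole.  The hole_start/hole_end values below are hypothetical; the real
 * ones are set by platform/hat code.
 *
 *	hole_start = 0x0000080000000000, hole_end = 0xfffff80000000000
 *	*basep = 0x000007ffff000000, *lenp = 0x4000000, minlen = 0x1000000
 *
 * Here lo < hole_start and hi (0x0000080003000000) falls inside the hole,
 * so hi is clipped back to hole_start.  The surviving range
 * [0x000007ffff000000, hole_start) is 0x1000000 bytes, which still
 * satisfies minlen, so valid_va_range() returns 1 with *lenp = 0x1000000.
 * Had minlen been larger than 0x1000000, it would return 0 instead.
 */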

/*
 * Routine used to check to see if an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
        if (exp->ux_mach == M_SPARC)
                return (0);
        else
                return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
        /*
         * XXX - Sparc Reference Hack approaching
         * Remember that we are loading
         * 8k executables into a 4k machine
         * DATA_ALIGN == 2 * PAGESIZE
         */
        if (exp->a_text)
                return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
        else
                return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
        if (exp->a_magic == ZMAGIC)
                return (exp->a_text);
        else
                return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
        return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
        if (exp->a_magic == ZMAGIC)
                return (0);
        else
                return (sizeof (struct exec));
}

void
getexinfo(
        struct exdata *edp_in,
        struct exdata *edp_out,
        int *pagetext,
        int *pagedata)
{
        *edp_out = *edp_in;	/* structure copy */

        if ((edp_in->ux_mag == ZMAGIC) &&
            ((edp_in->vp->v_flag & VNOMAP) == 0)) {
                *pagetext = 1;
                *pagedata = 1;
        } else {
                *pagetext = 0;
                *pagedata = 0;
        }
}

#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)		\
        for ((n) = (upper); (n) > (lower); (n)--) {		\
                if (disable_auto_large_pages & (1 << (n)))	\
                        continue;				\
                if (hw_page_array[(n)].hp_size <= (len)) {	\
                        (pgsz) = hw_page_array[(n)].hp_size;	\
                        break;					\
                }						\
        }


/*ARGSUSED*/
static size_t
map_pgszva(struct proc *p, caddr_t addr, size_t len)
{
        size_t pgsz = MMU_PAGESIZE;
        int n, upper;

        /*
         * Select the best fit page size within the constraints of
         * auto_lpg_{min,max}szc.
         *
         * Note that we also take the heap size into account when
         * deciding if we've crossed the threshold at which we should
         * increase the page size.  This isn't perfect since the heap
         * may not have reached its full size yet, but it's better than
         * not considering it at all.
         */
        len += p->p_brksize;
        if (ptob(auto_lpg_tlb_threshold) <= len) {

                upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);

                /*
                 * Use auto_lpg_minszc - 1 as the limit so we never drop
                 * below auto_lpg_minszc.  We don't have a size code to refer
                 * to like we have for bss and stack, so we assume 0.
                 * auto_lpg_minszc should always be >= 0.  Using
                 * auto_lpg_minszc cuts off the loop.
                 */
                MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
        }

        return (pgsz);
}
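
/*
 * Illustrative sketch (not part of the original source): a walk through
 * MAP_PGSZ_COMMON() with hypothetical sun4u values, where size codes 0..3
 * map to 8K/64K/512K/4M pages in hw_page_array[]:
 *
 *	upper = 3, lower = -1, len = 5M, disable_auto_large_pages = 0
 *
 *	n = 3: hp_size = 4M <= 5M, so pgsz = 4M and the loop breaks.
 *
 * With len = 200K instead, n = 3 (4M) and n = 2 (512K) are both too big
 * and n = 1 gives pgsz = 64K; a size disabled in disable_auto_large_pages
 * (e.g. bit 1 << 2) is simply skipped.
 */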

static size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
        size_t pgsz;
        int n, upper, lower;

        /*
         * If len is zero, retrieve from proc and don't demote the page size.
         */
        if (len == 0) {
                len = p->p_brksize;
        }

        /*
         * Still zero?  Then we don't have a heap yet, so pick the default
         * heap size.
         */
        if (len == 0) {
                pgsz = auto_lpg_heap_default;
        } else {
                pgsz = hw_page_array[p->p_brkpageszc].hp_size;
        }

        if ((pgsz * auto_lpg_tlb_threshold) <= len) {
                /*
                 * We're past the threshold, so select the best fit
                 * page size within the constraints of
                 * auto_lpg_{min,max}szc and the minimum required
                 * alignment.
                 */
                upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
                lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
                MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
        }

        /*
         * If addr == 0 we were called by memcntl() or exec_args() when the
         * size code is 0.  Don't set pgsz less than current size.
         */
        if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
                pgsz = hw_page_array[p->p_brkpageszc].hp_size;
        }

        return (pgsz);
}

static size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
        size_t pgsz;
        int n, upper, lower;

        /*
         * If len is zero, retrieve from proc and don't demote the page size.
         */
        if (len == 0) {
                len = p->p_stksize;
        }

        /*
         * Still zero?  Then we don't have a stack yet, so pick the default
         * stack size.
         */
        if (len == 0) {
                pgsz = auto_lpg_stack_default;
        } else {
                pgsz = hw_page_array[p->p_stkpageszc].hp_size;
        }

        if ((pgsz * auto_lpg_tlb_threshold) <= len) {
                /*
                 * We're past the threshold, so select the best fit
                 * page size within the constraints of
                 * auto_lpg_{min,max}szc and the minimum required
                 * alignment.
                 */
                upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
                lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
                MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
        }

        /*
         * If addr == 0 we were called by memcntl() or exec_args() when the
         * size code is 0.  Don't set pgsz less than current size.
         */
        if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
                pgsz = hw_page_array[p->p_stkpageszc].hp_size;
        }

        return (pgsz);
}

static size_t
map_pgszism(caddr_t addr, size_t len)
{
        uint_t szc;
        size_t pgsz;
        extern int disable_ism_large_pages;

        for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
                if (disable_ism_large_pages & (1 << szc))
                        continue;

                pgsz = hw_page_array[szc].hp_size;
                if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
                        return (pgsz);
        }
        return (DEFAULT_ISM_PAGESIZE);
}

/*
 * Suggest a page size to be used to map a segment of type maptype and length
 * len.  Returns a page size (not a size code).
 * If remap is non-NULL, fill in a value suggesting whether or not to remap
 * this segment.
 */
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
        size_t pgsz = 0;

        if (remap != NULL)
                *remap = (len > auto_lpg_remap_threshold);

        switch (maptype) {
        case MAPPGSZ_ISM:
                pgsz = map_pgszism(addr, len);
                break;

        case MAPPGSZ_VA:
                pgsz = map_pgszva(p, addr, len);
                break;

        case MAPPGSZ_STK:
                pgsz = map_pgszstk(p, addr, len);
                break;

        case MAPPGSZ_HEAP:
                pgsz = map_pgszheap(p, addr, len);
                break;
        }
        return (pgsz);
}

/*
 * Return a non-zero value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address such that it's equal offset modulo
 * shm_alignment and assumes it can't be in VAC conflict with any larger
 * than PAGESIZE mapping.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
        if (vac) {
                return (((uintptr_t)addr ^ off) & shm_alignment - 1);
        } else {
                return (0);
        }
}
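
/*
 * Illustrative sketch (not part of the original source): the alias check
 * in map_addr_vacalign_check() with a hypothetical shm_alignment of
 * 0x10000:
 *
 *	addr = 0x00012000, off = 0x4000
 *	(addr ^ off) & (shm_alignment - 1) = 0x16000 & 0xffff = 0x6000 != 0
 *
 * so the mapping could land in a different virtual color than the KPM
 * mapping of the same offset and the caller must realign.  If addr were
 * 0x00014000 instead, the XOR would be 0x10000, the masked result 0, and
 * no VAC alias is possible.
 */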

/*
 * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
 * can be set in platform or CPU specific code but user can change the
 * default values via /etc/system.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int use_text_pgsz64k;
extern int use_text_pgsz4m;
extern int use_initdata_pgsz64k;

/*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set
 * in platform or CPU specific code to disable page sizes that should not be
 * used.  These variables normally shouldn't be changed via /etc/system.  A
 * particular page size for text or initialized data will be used by default
 * only if the corresponding use_* variable is set to 1 AND this page size
 * is not disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int disable_text_largepages;
extern int disable_initdata_largepages;

/*
 * Minimum segment size tunables before 64K or 4M large pages
 * should be used to map it.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern size_t text_pgsz64k_minsize;
extern size_t text_pgsz4m_minsize;
extern size_t initdata_pgsz64k_minsize;

/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than execseg_lpg_min_physmem memory installed.
 * The unit for this variable is 8K pages.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;	/* 1GB */

extern int disable_shm_large_pages;
pgcnt_t shm_lpg_min_physmem = 131072;	/* 1GB */
extern size_t max_shm_lpsize;


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_text_pgsz4m(caddr_t addr, size_t len)
{
        caddr_t a;

        if (len < text_pgsz4m_minsize) {
                return (0);
        }

        a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
        if (a < addr || a >= addr + len) {
                return (0);
        }
        len -= (a - addr);
        if (len < MMU_PAGESIZE4M) {
                return (0);
        }

        return (1 << TTE4M);
}

static uint_t
map_text_pgsz64k(caddr_t addr, size_t len)
{
        caddr_t a;
        size_t svlen = len;

        if (len < text_pgsz64k_minsize) {
                return (0);
        }

        a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
        if (a < addr || a >= addr + len) {
                return (0);
        }
        len -= (a - addr);
        if (len < MMU_PAGESIZE64K) {
                return (0);
        }
        if (!use_text_pgsz4m ||
            disable_text_largepages & (1 << TTE4M)) {
                return (1 << TTE64K);
        }
        if (svlen < text_pgsz4m_minsize) {
                return (1 << TTE64K);
        }
        addr = a;
        a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
        if (a < addr || a >= addr + len) {
                return (1 << TTE64K);
        }
        len -= (a - addr);
        if (len < MMU_PAGESIZE4M) {
                return (1 << TTE64K);
        }
        return ((1 << TTE4M) | (1 << TTE64K));
}

static uint_t
map_initdata_pgsz64k(caddr_t addr, size_t len)
{
        caddr_t a;

        if (len < initdata_pgsz64k_minsize) {
                return (0);
        }

        a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
        if (a < addr || a >= addr + len) {
                return (0);
        }
        len -= (a - addr);
        if (len < MMU_PAGESIZE64K) {
                return (0);
        }
        return (1 << TTE64K);
}
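
/*
 * Illustrative sketch (not part of the original source): the size-code bit
 * vectors returned above, assuming the usual sun4u codes TTE8K=0, TTE64K=1,
 * TTE512K=2 and TTE4M=3:
 *
 *	map_initdata_pgsz64k() success:	(1 << TTE64K)			= 0x02
 *	map_text_pgsz4m() success:	(1 << TTE4M)			= 0x08
 *	map_text_pgsz64k(), large text:	(1 << TTE4M) | (1 << TTE64K)	= 0x0a
 *
 * A text mapping that is 64K-alignable but whose 4M-aligned portion is
 * smaller than MMU_PAGESIZE4M falls back to 0x02, and a segment below the
 * *_minsize tunables gets 0 (8K pages only).
 */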

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
        uint_t ret = 0;

        if (physmem < execseg_lpg_min_physmem) {
                return (0);
        }

        if (text) {
                if (use_text_pgsz64k &&
                    !(disable_text_largepages & (1 << TTE64K))) {
                        ret = map_text_pgsz64k(addr, len);
                } else if (use_text_pgsz4m &&
                    !(disable_text_largepages & (1 << TTE4M))) {
                        ret = map_text_pgsz4m(addr, len);
                }
        } else if (use_initdata_pgsz64k &&
            !(disable_initdata_largepages & (1 << TTE64K))) {
                ret = map_initdata_pgsz64k(addr, len);
        }

        return (ret);
}

uint_t
map_shm_pgszcvec(caddr_t addr, size_t size, uintptr_t off)
{
        caddr_t eaddr = addr + size;
        uint_t szcvec = 0;
        int i;
        caddr_t raddr;
        caddr_t readdr;
        size_t pgsz;

        if (physmem < shm_lpg_min_physmem || mmu_page_sizes <= 1 ||
            max_shm_lpsize <= MMU_PAGESIZE) {
                return (0);
        }

        for (i = mmu_page_sizes - 1; i > 0; i--) {
                if (disable_shm_large_pages & (1 << i)) {
                        continue;
                }
                pgsz = page_get_pagesize(i);
                if (pgsz > max_shm_lpsize) {
                        continue;
                }
                raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
                readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
                if (raddr < addr || raddr >= readdr) {
                        continue;
                }
                if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
                        continue;
                }
                szcvec |= (1 << i);
                /*
                 * And OR in the remaining enabled page sizes.
                 */
                szcvec |= P2PHASE(~disable_shm_large_pages, (1 << i));
                szcvec &= ~1;	/* no need to return 8K pagesize */
                break;
        }
        return (szcvec);
}

#define	PNUM_SIZE(size_code)						\
        (hw_page_array[size_code].hp_size >> hw_page_array[0].hp_shift)

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array which is allocated during
 * startup based on physmax and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.  So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
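
/*
 * Illustrative sketch (not part of the original source): how the four
 * dimensions of page_freelists are indexed once the 3rd and 4th levels
 * have been allocated by alloc_page_freelists().  The szc/mtype/mnode/color
 * values are hypothetical indices, with mtype < MAX_MEM_TYPES:
 *
 *	page_t *pp;
 *
 *	pp = page_freelists[szc][mtype][mnode][color];
 *
 * i.e. the head of the free list of szc-sized (e.g. TTE64K) pages of memory
 * type mtype on memory node mnode in coloring bin color.  page_cachelists
 * below omits the size-code dimension because only 8K pages live on the
 * cachelist.
 */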

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

caddr_t
alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
{
        int	mtype;
        uint_t	szc;

        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

        /*
         * We only support small pages in the cachelist.
         */
        for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
                page_cachelists[mtype][mnode] = (page_t **)alloc_base;
                alloc_base += (sizeof (page_t *) * page_colors);
                /*
                 * Allocate freelists bins for all
                 * supported page sizes.
                 */
                for (szc = 0; szc < mmu_page_sizes; szc++) {
                        page_freelists[szc][mtype][mnode] =
                            (page_t **)alloc_base;
                        alloc_base += ((sizeof (page_t *) *
                            page_get_pagecolors(szc)));
                }
        }

        alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

        return (alloc_base);
}

/*
 * Allocate page_freelists bin headers for a memnode from the
 * nucleus data area.  This is the first time that mmu_page_sizes is
 * used during sun4u bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
{
        size_t alloc_sz;
        caddr_t alloc_base;
        caddr_t end;
        int	mtype;
        uint_t	szc;
        int32_t allp = 0;

        if (&mmu_init_mmu_page_sizes) {
                if (!mmu_init_mmu_page_sizes(allp)) {
                        cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
                            mmu_page_sizes);
                }
        }
        ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

        /* first time called - allocate max_mem_nodes dimension */
        if (mnode == 0) {
                int	i;

                /* page_cachelists */
                alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
                    sizeof (page_t **);

                /* page_freelists */
                alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
                    sizeof (page_t **);

                /* fpc_mutex and cpc_mutex */
                alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

                alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
                if (alloc_base == NULL)
                        return (-1);

                ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

                for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
                        page_cachelists[mtype] = (page_t ***)alloc_base;
                        alloc_base += (max_mem_nodes * sizeof (page_t **));
                        for (szc = 0; szc < mmu_page_sizes; szc++) {
                                page_freelists[szc][mtype] =
                                    (page_t ***)alloc_base;
                                alloc_base += (max_mem_nodes *
                                    sizeof (page_t **));
                        }
                }
                for (i = 0; i < NPC_MUTEX; i++) {
                        fpc_mutex[i] = (kmutex_t *)alloc_base;
                        alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
                        cpc_mutex[i] = (kmutex_t *)alloc_base;
                        alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
                }
                alloc_sz = 0;
        }

        /*
         * Calculate the size needed by alloc_page_freelists().
         */
        for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
                alloc_sz += sizeof (page_t *) * page_colors;

                for (szc = 0; szc < mmu_page_sizes; szc++)
                        alloc_sz += sizeof (page_t *) *
                            page_get_pagecolors(szc);
        }

        alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
        if (alloc_base == NULL)
                return (-1);

        end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
        ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
            ecache_alignsize));

        return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
        uint32_t old, new;

        if (consistent_coloring == 2 || color_start_random) {
                return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
                    page_colors_mask));
        }

        do {
                old = color_start_current;
                new = old +
                    (color_start_stride << (vac_shift - MMU_PAGESHIFT));
        } while (cas32(&color_start_current, old, new) != old);

        return ((uint_t)(new));
}

/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * Initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
        int	a;

        if (do_pg_coloring == 0) {
                page_colors = 1;
                return;
        }

        /*
         * Calculate page_colors from ecache_setsize.  ecache_setsize contains
         * the max ecache setsize of all cpus configured in the system or, for
         * cheetah+ systems, the max possible ecache setsize for all possible
         * cheetah+ cpus.
         */
        page_colors = ecache_setsize / MMU_PAGESIZE;
        page_colors_mask = page_colors - 1;

        /*
         * initialize cpu_page_colors if ecache setsizes are homogeneous.
         * cpu_page_colors is set to -1 during DR operations or during
         * startup if setsizes are heterogeneous.
         *
         * The value of cpu_page_colors determines if additional color bins
         * need to be checked for a particular color in the page_get routines.
         */
        if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
                cpu_page_colors = cpu_setsize / MMU_PAGESIZE;

        vac_colors = vac_size / MMU_PAGESIZE;
        vac_colors_mask = vac_colors - 1;

        page_coloring_shift = 0;
        a = ecache_setsize;
        while (a >>= 1) {
                page_coloring_shift++;
        }
}

int
bp_color(struct buf *bp)
{
        int color = -1;

        if (vac) {
                if ((bp->b_flags & B_PAGEIO) != 0) {
                        color = sfmmu_get_ppvcolor(bp->b_pages);
                } else if (bp->b_un.b_addr != NULL) {
                        color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
                }
        }
        return (color < 0 ? 0 : ptob(color));
}
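
/*
 * Illustrative sketch (not part of the original source): the arithmetic in
 * page_coloring_init() for a hypothetical configuration with a 1MB ecache
 * setsize, 8K MMU pages and a 16K VAC:
 *
 *	page_colors		= 0x100000 / 0x2000 = 128
 *	page_colors_mask	= 127
 *	vac_colors		= 0x4000 / 0x2000 = 2
 *	vac_colors_mask		= 1
 *	page_coloring_shift	= 20	(log2 of the 1MB setsize)
 *
 * A cpu whose own setsize were only 512K would additionally get
 * cpu_page_colors = 64, telling the page_get routines to also search
 * neighboring color bins.
 */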

/*
 * Create and initialise the pageout scanner thread.  The thread starts
 * at procedure `procedure', with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
        (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
        sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
        if (va1 < va2 && va1 + sz1 <= va2)
                return (0);

        if (va2 < va1 && va2 + sz2 <= va1)
                return (0);

        return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
        /* OBP reads are harmless, but we don't want people writing there */
        if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
            OFW_START_ADDR + 1))
                return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

        if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
                return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

        return (sz); /* no overlap */
}

/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * This function chooses the large page size for kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
        size_t memtotal = physmem * PAGESIZE;
        size_t mmusz;
        uint_t szc;
        extern int disable_large_pages;

        if (memtotal < segkmem_lpminphysmem)
                return (PAGESIZE);

        if (plat_lpkmem_is_supported != NULL &&
            plat_lpkmem_is_supported() == 0)
                return (PAGESIZE);

        mmusz = mmu_get_kernel_lpsize(lpsize);
        szc = page_szc(mmusz);

        while (szc) {
                if (!(disable_large_pages & (1 << szc)))
                        return (page_get_pagesize(szc));
                szc--;
        }
        return (PAGESIZE);
}
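
/*
 * Illustrative sketch (not part of the original source): how the loop in
 * get_segkmem_lpsize() degrades the kernel heap page size when some sizes
 * are disabled.  Assume mmu_get_kernel_lpsize() returned 4M (szc 3) and
 * disable_large_pages has bit (1 << 3) set but not (1 << 2):
 *
 *	szc = 3:  disabled, try smaller
 *	szc = 2:  enabled, return page_get_pagesize(2) = 512K
 *
 * If every non-zero size code were disabled, or if the system has less than
 * segkmem_lpminphysmem of memory, the kernel heap falls back to PAGESIZE
 * (8K).
 */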