/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring
 * and/or virtual cache page coloring.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring or do_virtual_coloring from being enabled by
 * module specific config routines.
 */
int use_page_coloring = 1;
int use_virtual_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);

/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}
/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	as_purge(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page is already mapped by another thread after we
		 * returned from as_fault() above.  We just fall through to
		 * as_fault() below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

	return (res);
}
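/*
 * Worked example of the zfod gap expansion above (illustrative addresses;
 * 8K pages assumed, so PAGEOFFSET is 0x1fff): a gap of base = 0x21003100,
 * len = 0x1800 is widened to base = 0x21002000, len = 0x4000, i.e. the two
 * whole pages that cover the original range, before as_gap()/as_map() are
 * attempted.
 */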
/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

/*
 * Determine whether [base, base+len] contains a mappable range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a mappable range.
 */
/* ARGSUSED */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	caddr_t hi, lo;

	lo = *basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		size_t newlen = 0 - (uintptr_t)lo - 1l;

		if (newlen + (uintptr_t)hi < minlen)
			return (0);
		if (newlen < minlen)
			return (0);
		*lenp = newlen;
	} else if (hi - lo < minlen)
		return (0);

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */
	hi = lo + *lenp;

	if (lo < hole_start) {
		if (hi > hole_start)
			if (hi < hole_end)
				hi = hole_start;
			else
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = lo;
	*lenp = hi - lo;

	return (1);
}
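/*
 * Worked example (illustrative): if the caller passes a range that straddles
 * the VA hole, i.e. *basep just below hole_start and *basep + *lenp well
 * above hole_end, then with dir == AH_LO the routine first tries to clip the
 * range to [*basep, hole_start); only if that slice is smaller than minlen
 * does it fall back to [hole_end, *basep + *lenp).  With dir == AH_HI the
 * preference is reversed.  A range that lies entirely inside the hole
 * fails with 0.
 */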
/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}

/*
 * Routine used to check whether an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}

void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}

#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)		\
	for ((n) = (upper); (n) > (lower); (n)--) {		\
		if (disable_auto_large_pages & (1 << (n)))	\
			continue;				\
		if (hw_page_array[(n)].hp_size <= (len)) {	\
			(pgsz) = hw_page_array[(n)].hp_size;	\
			break;					\
		}						\
	}


/*ARGSUSED*/
size_t
map_pgszva(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz = MMU_PAGESIZE;
	int n, upper;

	/*
	 * Select the best fit page size within the constraints of
	 * auto_lpg_{min,max}szc.
	 *
	 * Note that we also take the heap size into account when
	 * deciding if we've crossed the threshold at which we should
	 * increase the page size.  This isn't perfect since the heap
	 * may not have reached its full size yet, but it's better than
	 * not considering it at all.
	 */
	len += p->p_brksize;
	if (ptob(auto_lpg_tlb_threshold) <= len) {

		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);

		/*
		 * Use auto_lpg_minszc - 1 as the limit so we never drop
		 * below auto_lpg_minszc.  We don't have a size code to refer
		 * to like we have for bss and stack, so we assume 0.
		 * auto_lpg_minszc should always be >= 0.  Using
		 * auto_lpg_minszc as the limit would cut it out of the loop.
		 */
		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
	}

	return (pgsz);
}
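/*
 * Worked example (illustrative; assumes the usual sun4u page sizes of
 * 8K, 64K, 512K and 4M, the 4M size code within auto_lpg_maxszc, and no
 * sizes disabled): when the requested length plus the current heap size
 * is 5 MB and the TLB threshold has been crossed, MAP_PGSZ_COMMON() above
 * walks from the largest permitted size code downward and settles on 4M,
 * the largest page size that still fits in 5 MB; a 300 KB request under
 * the same assumptions would settle on 64K.
 */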
size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_brksize;
	}

	/*
	 * Still zero?  Then we don't have a heap yet, so pick the default
	 * heap size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_heap_default;
	} else {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}

size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}

	/*
	 * Still zero?  Then we don't have a stack yet, so pick the default
	 * stack size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_stack_default;
	} else {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_stkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}


/*
 * Return a nonzero value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address that is equal to the offset modulo
 * shm_alignment and assumes it can't be in VAC conflict with any
 * larger-than-PAGESIZE mapping.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	if (vac) {
		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
	} else {
		return (0);
	}
}
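/*
 * Worked example (illustrative; assumes shm_alignment of 0x10000 for the
 * arithmetic): addr = 0x30002000 with off = 0x2000 gives
 * (0x30002000 ^ 0x2000) & 0xffff == 0, so no alias is possible, while
 * off = 0x4000 gives (0x30002000 ^ 0x4000) & 0xffff == 0x6000, i.e. the
 * mapping could alias in a virtually indexed cache and the caller should
 * pick a different address.
 */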
/*
 * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
 * can be set in platform or CPU specific code but user can change the
 * default values via /etc/system.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int use_text_pgsz64k;
extern int use_text_pgsz4m;
extern int use_initdata_pgsz64k;

/*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set in
 * platform or CPU specific code to disable page sizes that should not be
 * used.  These variables normally shouldn't be changed via /etc/system.  A
 * particular page size for text or initialized data will be used by default
 * only if the corresponding use_* variable is set to 1 AND this page size is
 * not disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int disable_text_largepages;
extern int disable_initdata_largepages;

/*
 * Minimum segment size tunables: a segment must be at least this large
 * before 64K or 4M large pages are used to map it.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern size_t text_pgsz64k_minsize;
extern size_t text_pgsz4m_minsize;
extern size_t initdata_pgsz64k_minsize;

/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than execseg_lpg_min_physmem memory installed.
 * The units for this variable are 8K pages.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;	/* 1GB */


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_text_pgsz4m(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < text_pgsz4m_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (0);
	}

	return (1 << TTE4M);
}

static uint_t
map_text_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;
	size_t svlen = len;

	if (len < text_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	if (!use_text_pgsz4m ||
	    disable_text_largepages & (1 << TTE4M)) {
		return (1 << TTE64K);
	}
	if (svlen < text_pgsz4m_minsize) {
		return (1 << TTE64K);
	}
	addr = a;
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (1 << TTE64K);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (1 << TTE64K);
	}
	return ((1 << TTE4M) | (1 << TTE64K));
}

static uint_t
map_initdata_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < initdata_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	return (1 << TTE64K);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	uint_t ret = 0;

	if (physmem < execseg_lpg_min_physmem) {
		return (0);
	}

	if (text) {
		if (use_text_pgsz64k &&
		    !(disable_text_largepages & (1 << TTE64K))) {
			ret = map_text_pgsz64k(addr, len);
		} else if (use_text_pgsz4m &&
		    !(disable_text_largepages & (1 << TTE4M))) {
			ret = map_text_pgsz4m(addr, len);
		}
	} else if (use_initdata_pgsz64k &&
	    !(disable_initdata_largepages & (1 << TTE64K))) {
		ret = map_initdata_pgsz64k(addr, len);
	}

	return (ret);
}
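/*
 * Worked example (illustrative; assumes enough physmem, 64K and 4M text
 * pages enabled, and both minsize thresholds satisfied): for a 5 MB text
 * segment starting on a 4 MB boundary, map_execseg_pgszcvec(1, addr, len)
 * returns (1 << TTE4M) | (1 << TTE64K), telling the caller it may map the
 * region with a mix of 4M and 64K pages; an unaligned or smaller segment
 * falls back to (1 << TTE64K) or 0.
 */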
#define	PNUM_SIZE(size_code)						\
	(hw_page_array[size_code].hp_size >> hw_page_array[0].hp_shift)

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It is the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.  So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

caddr_t
alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
{
	int	mtype;
	uint_t	szc;

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
		alloc_base += (sizeof (page_t *) * page_colors);
		/*
		 * Allocate freelists bins for all
		 * supported page sizes.
		 */
		for (szc = 0; szc < mmu_page_sizes; szc++) {
			page_freelists[szc][mtype][mnode] =
			    (page_t **)alloc_base;
			alloc_base += ((sizeof (page_t *) *
			    page_get_pagecolors(szc)));
		}
	}

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	return (alloc_base);
}

/*
 * Allocate page_freelists bin headers for a memnode from the
 * nucleus data area.  This is the first time that mmu_page_sizes is
 * used during sun4u bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
{
	size_t alloc_sz;
	caddr_t alloc_base;
	caddr_t end;
	int	mtype;
	uint_t	szc;
	int32_t allp = 0;

	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(allp)) {
			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
			    mmu_page_sizes);
		}
	}
	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

	/* first time called - allocate max_mem_nodes dimension */
	if (mnode == 0) {
		int	i;

		/* page_cachelists */
		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
		    sizeof (page_t **);

		/* page_freelists */
		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
		    sizeof (page_t **);

		/* fpc_mutex and cpc_mutex */
		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
		if (alloc_base == NULL)
			return (-1);

		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_cachelists[mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (szc = 0; szc < mmu_page_sizes; szc++) {
				page_freelists[szc][mtype] =
				    (page_t ***)alloc_base;
				alloc_base += (max_mem_nodes *
				    sizeof (page_t **));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			fpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
			cpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
		}
		alloc_sz = 0;
	}

	/*
	 * Calculate the size needed by alloc_page_freelists().
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		alloc_sz += sizeof (page_t *) * page_colors;

		for (szc = 0; szc < mmu_page_sizes; szc++)
			alloc_sz += sizeof (page_t *) *
			    page_get_pagecolors(szc);
	}

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
	    ecache_alignsize));

	return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint32_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    page_colors_mask));
	}

	do {
		old = color_start_current;
		new = old +
		    (color_start_stride << (vac_shift - MMU_PAGESHIFT));
	} while (cas32(&color_start_current, old, new) != old);

	return ((uint_t)(new));
}

/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * Initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int	a;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize.  ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	/*
	 * Initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors is set to -1 during DR operation or during startup
	 * if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}
}
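/*
 * Worked example (illustrative): with page coloring enabled, an
 * ecache_setsize of 4 MB and 8K pages, page_coloring_init() computes
 * page_colors = 512, page_colors_mask = 0x1ff and page_coloring_shift = 22
 * (log2 of the setsize).  With, say, a 16K virtually indexed cache,
 * vac_colors would be 2 and vac_colors_mask 1.
 */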
int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if ((bp->b_flags & B_PAGEIO) != 0) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}
	return (color < 0 ? 0 : ptob(color));
}

/*
 * Create & Initialise pageout scanner thread.  The thread has to
 * start at procedure `procedure', with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);

	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);

	return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

	return (sz); /* no overlap */
}
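/*
 * Worked example (illustrative): a write probe that starts 0x100 bytes
 * below OFW_START_ADDR and extends into the OBP range is reported as having
 * only those 0x100 leading bytes non-toxic, while a read probe over the
 * same range (and not overlapping the PIO window) is allowed in full; any
 * probe overlapping [PIOMAPBASE, PIOMAPBASE + PIOMAPSIZE) is clipped the
 * same way.
 */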
/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * This function chooses the large page size for the kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	return (mmu_get_kernel_lpsize(lpsize));
}
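/*
 * Worked example (illustrative): on a machine with 512 MB of physical
 * memory, memtotal falls below segkmem_lpminphysmem (1 GB) and PAGESIZE is
 * returned, keeping the kernel heap on small pages.  On larger machines,
 * unless the platform vetoes it via plat_lpkmem_is_supported(), the final
 * choice is delegated to mmu_get_kernel_lpsize(lpsize).
 */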