/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring
 * and/or virtual cache page coloring.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring or do_virtual_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;
int use_virtual_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

#ifdef DEBUG
plcnt_t		plcnt;		/* page list count */
#endif

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);

/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/* the rest of this function implements 3.X, 4.X and 5.X compatibility */
	/* This code is probably not needed anymore */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	as_purge(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page is already mapped by another thread after we
		 * returned from as_fault() above.  We just fall through
		 * to as_fault() below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

	return (res);
}

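/*
 * Illustrative walk-through of the compatibility path above (a sketch of
 * the existing logic, not additional behavior): suppose brk() has just
 * grown p_brksize but no page in the new region has been touched yet.
 * The first store into that region comes back from as_fault() as
 * FC_NOMAP; since the address falls inside [p_brkbase, p_brkbase +
 * p_brksize), the gap is rounded out to page boundaries, as_map()'ed
 * with segvn_create/zfod_argsp, and the fault is retried with F_INVAL.
 */
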
/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

/*
 * Determine whether [base, base+len] contains a mappable range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a mappable range.
 */
/* ARGSUSED */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	caddr_t hi, lo;

	lo = *basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		size_t newlen = 0 - (uintptr_t)lo - 1l;

		if (newlen + (uintptr_t)hi < minlen)
			return (0);
		if (newlen < minlen)
			return (0);
		*lenp = newlen;
	} else if (hi - lo < minlen)
		return (0);

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */
	hi = lo + *lenp;

	if (lo < hole_start) {
		if (hi > hole_start)
			if (hi < hole_end)
				hi = hole_start;
			else
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = lo;
	*lenp = hi - lo;

	return (1);
}

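/*
 * Worked example for valid_va_range() above (the hole_start/hole_end
 * values here are hypothetical, purely for illustration): with
 * hole_start at 0x1000000 and hole_end at 0x2000000, a caller passing
 * *basep = 0xf00000 and *lenp = 0x300000 has its range clipped so that
 * hi == hole_start; if the clipped length (0x100000) is still >= minlen
 * the routine returns 1 with *lenp = 0x100000, otherwise it returns 0.
 */
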
/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}

/*
 * Routine used to check whether an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}

void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}

#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)		\
	for ((n) = (upper); (n) > (lower); (n)--) {		\
		if (disable_auto_large_pages & (1 << (n)))	\
			continue;				\
		if (hw_page_array[(n)].hp_size <= (len)) {	\
			(pgsz) = hw_page_array[(n)].hp_size;	\
			break;					\
		}						\
	}


/*ARGSUSED*/
size_t
map_pgszva(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz = MMU_PAGESIZE;
	int n, upper;

	/*
	 * Select the best fit page size within the constraints of
	 * auto_lpg_{min,max}szc.
	 *
	 * Note that we also take the heap size into account when
	 * deciding if we've crossed the threshold at which we should
	 * increase the page size.  This isn't perfect since the heap
	 * may not have reached its full size yet, but it's better than
	 * not considering it at all.
	 */
	len += p->p_brksize;
	if (ptob(auto_lpg_tlb_threshold) <= len) {

		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);

		/*
		 * Use auto_lpg_minszc - 1 as the limit so we never drop
		 * below auto_lpg_minszc.  We don't have a size code to refer
		 * to like we have for bss and stack, so we assume 0.
		 * auto_lpg_minszc should always be >= 0.  Using
		 * auto_lpg_minszc cuts off the loop.
		 */
		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
	}

	return (pgsz);
}

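/*
 * Illustrative example of the MAP_PGSZ_COMMON() selection used above
 * (the page sizes and tunable values here are assumptions, not fixed by
 * this file): with hw_page_array sizes of 8K/64K/512K/4M, auto_lpg_minszc
 * of 0, auto_lpg_maxszc of 3 and no sizes disabled, a request whose
 * len + p_brksize works out to 6MB (and which clears the
 * ptob(auto_lpg_tlb_threshold) check) walks n downward from the 4M size
 * code and stops at the first hp_size <= 6MB, so pgsz becomes 4M.  If
 * the combined length still clears the threshold but is only, say,
 * 100K, the walk stops at 64K instead.
 */
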
size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_brksize;
	}

	/*
	 * Still zero?  Then we don't have a heap yet, so pick the default
	 * heap size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_heap_default;
	} else {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}

size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}

	/*
	 * Still zero?  Then we don't have a stack yet, so pick the default
	 * stack size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_stack_default;
	} else {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}


/*
 * Return a non-zero value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address with the same offset modulo
 * shm_alignment and assumes it can't be in VAC conflict with any
 * larger-than-PAGESIZE mapping.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	if (vac) {
		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
	} else {
		return (0);
	}
}

/*
 * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
 * can be set in platform or CPU specific code but the user can change the
 * default values via /etc/system.
 *
 * Initial values are defined in the architecture specific mach_vm_dep.c file.
 */
extern int use_text_pgsz64k;
extern int use_text_pgsz4m;
extern int use_initdata_pgsz64k;

/*
 * The disable_text_largepages and disable_initdata_largepages bitmasks are
 * set in platform or CPU specific code to disable page sizes that should not
 * be used.  These variables normally shouldn't be changed via /etc/system.
 * A particular page size for text or initialized data will be used by default
 * if the corresponding use_* variable is set to 1 AND this page size is not
 * disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in the architecture specific mach_vm_dep.c file.
 */
extern int disable_text_largepages;
extern int disable_initdata_largepages;

/*
 * Minimum segment size tunables; a segment must be at least this large
 * before 64K or 4M large pages are used to map it.
 *
 * Initial values are defined in the architecture specific mach_vm_dep.c file.
 */
extern size_t text_pgsz64k_minsize;
extern size_t text_pgsz4m_minsize;
extern size_t initdata_pgsz64k_minsize;

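/*
 * Example of how the use_*, disable_* and *_minsize tunables combine
 * (an illustration of the checks in the routines below, not extra
 * policy): text is mapped with 4M pages only when use_text_pgsz4m is 1,
 * the TTE4M bit is clear in disable_text_largepages, the segment is at
 * least text_pgsz4m_minsize bytes long, and a 4M-aligned 4M chunk fits
 * inside it -- see map_text_pgsz4m() and map_execseg_pgszcvec() below.
 */
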
/*
 * Sanity control.  Don't use large pages, regardless of user
 * settings, if there's less than execseg_lpg_min_physmem memory installed.
 * The unit for this variable is 8K pages.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_text_pgsz4m(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < text_pgsz4m_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (0);
	}

	return (1 << TTE4M);
}

static uint_t
map_text_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;
	size_t svlen = len;

	if (len < text_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	if (!use_text_pgsz4m ||
	    disable_text_largepages & (1 << TTE4M)) {
		return (1 << TTE64K);
	}
	if (svlen < text_pgsz4m_minsize) {
		return (1 << TTE64K);
	}
	addr = a;
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (1 << TTE64K);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (1 << TTE64K);
	}
	return ((1 << TTE4M) | (1 << TTE64K));
}

static uint_t
map_initdata_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < initdata_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	return (1 << TTE64K);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map the [addr, addr + len) region.
 */
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	uint_t ret = 0;

	if (physmem < execseg_lpg_min_physmem) {
		return (0);
	}

	if (text) {
		if (use_text_pgsz64k &&
		    !(disable_text_largepages & (1 << TTE64K))) {
			ret = map_text_pgsz64k(addr, len);
		} else if (use_text_pgsz4m &&
		    !(disable_text_largepages & (1 << TTE4M))) {
			ret = map_text_pgsz4m(addr, len);
		}
	} else if (use_initdata_pgsz64k &&
	    !(disable_initdata_largepages & (1 << TTE64K))) {
		ret = map_initdata_pgsz64k(addr, len);
	}

	return (ret);
}

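/*
 * Illustrative return values for map_execseg_pgszcvec() (the segment
 * sizes and tunable settings here are hypothetical): with
 * use_text_pgsz64k and use_text_pgsz4m both enabled and nothing
 * disabled, a 5MB text segment containing a 4M-aligned 4M chunk yields
 * (1 << TTE4M) | (1 << TTE64K); a text segment that only meets the 64K
 * minimum size and alignment yields (1 << TTE64K); and any call on a
 * system with physmem below execseg_lpg_min_physmem yields 0.
 */
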
#define	PNUM_SIZE(size_code)						\
	(hw_page_array[size_code].hp_size >> hw_page_array[0].hp_shift)

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.  So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

caddr_t
alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
{
	int	mtype;
	uint_t	szc;

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
		alloc_base += (sizeof (page_t *) * page_colors);
		/*
		 * Allocate freelists bins for all
		 * supported page sizes.
		 */
		for (szc = 0; szc < mmu_page_sizes; szc++) {
			page_freelists[szc][mtype][mnode] =
			    (page_t **)alloc_base;
			alloc_base += ((sizeof (page_t *) *
			    page_get_pagecolors(szc)));
		}
	}

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	return (alloc_base);
}

/*
 * Allocate page_freelists bin headers for a memnode from the
 * nucleus data area.  This is the first time that mmu_page_sizes is
 * used during sun4u bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
{
	size_t alloc_sz;
	caddr_t alloc_base;
	caddr_t end;
	int	mtype;
	uint_t	szc;
	int32_t allp = 0;

	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(allp)) {
			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
			    mmu_page_sizes);
		}
	}
	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

	/* first time called - allocate max_mem_nodes dimension */
	if (mnode == 0) {
		int	i;

		/* page_cachelists */
		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
		    sizeof (page_t **);

		/* page_freelists */
		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
		    sizeof (page_t **);

		/* fpc_mutex and cpc_mutex */
		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
		if (alloc_base == NULL)
			return (-1);

		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_cachelists[mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (szc = 0; szc < mmu_page_sizes; szc++) {
				page_freelists[szc][mtype] =
				    (page_t ***)alloc_base;
				alloc_base += (max_mem_nodes *
				    sizeof (page_t **));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			fpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
			cpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
		}
		alloc_sz = 0;
	}

	/*
	 * Calculate the size needed by alloc_page_freelists().
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		alloc_sz += sizeof (page_t *) * page_colors;

		for (szc = 0; szc < mmu_page_sizes; szc++)
			alloc_sz += sizeof (page_t *) *
			    page_get_pagecolors(szc);
	}

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
	    ecache_alignsize));

	return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint32_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    page_colors_mask));
	}

	do {
		old = color_start_current;
		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
	} while (cas32(&color_start_current, old, new) != old);

	return ((uint_t)(new));
}

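/*
 * Worked example for get_color_start() (the vac_shift value is assumed
 * for illustration): with vac_shift == 16 and 8K base pages, each call
 * advances color_start_current by 337 << 3 == 2696, so successive
 * address spaces get starting bins spaced 2696 apart (modulo the number
 * of colors once the caller applies page_colors_mask).
 */
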
/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int	a;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize.  ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	/*
	 * initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors is set to -1 during DR operations or during startup
	 * if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}
}

int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if ((bp->b_flags & B_PAGEIO) != 0) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}
	return (color < 0 ? 0 : ptob(color));
}

/*
 * Create and initialise the pageout scanner thread.  The thread has to
 * start at `procedure', with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);

	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);

	return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

	return (sz);	/* no overlap */
}

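/*
 * Illustrative example for kdi_range_is_nontoxic() (addresses are
 * hypothetical): for a write whose range starts 0x100 bytes below
 * OFW_START_ADDR and extends into the OBP region, the routine returns
 * 0x100, so the debugger only touches the bytes in front of OBP; a
 * range overlapping neither OBP nor the PIO window comes back unchanged
 * as sz.
 */
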
/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * This function chooses the large page size for the kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	return (mmu_get_kernel_lpsize(lpsize));
}