/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2016 Joyent, Inc.
 */

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/vm.h>
#include <sys/exec.h>

#include <sys/exechdr.h>
#include <vm/seg_kmem.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kdi.h>
#include <sys/cpu_module.h>
#include <sys/secflags.h>

#include <vm/hat_sfmmu.h>

#include <sys/memnode.h>

#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <vm/vm_dep.h>
#include <vm/page.h>
#include <sys/platform_module.h>

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring.
 */
int do_pg_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring from being enabled by
 * module specific config routines.
 */

int use_page_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/* cpu specific coloring initialization */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t	plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
    struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
    caddr_t, size_t, uint_t, struct lgrp *);
/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
	/* This code is probably not needed anymore */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	as_purge(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page is already mapped by another thread after we
		 * returned from as_fault() above.  We just fall through to
		 * as_fault() below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:

	return (res);
}

/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}
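
/*
 * Usage sketch for map_addr() above (hypothetical caller, not part of this
 * file): a request carrying _MAP_LOW32 has its search limit clamped to
 * USERLIMIT32, so any address chosen by map_addr_proc() stays 32-bit
 * reachable even in a 64-bit address space:
 *
 *	caddr_t addr = NULL;
 *	map_addr(&addr, len, (offset_t)0, 1, flags | _MAP_LOW32);
 */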

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t	kpm_vbase;
size_t	kpm_size;
uchar_t	kpm_size_shift;

int valid_va_range_aligned_wraparound;
/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long, where the base of the range is at "off"
 * phase from an "align" boundary and there is space for a "redzone"-sized
 * redzone on either side of the range.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range (including
 * the redzone).  On failure, 0 is returned.
 */
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    size_t align, size_t redzone, size_t off)
{
	caddr_t hi, lo;
	size_t tot_len;

	ASSERT(align == 0 ? off == 0 : off < align);
	ASSERT(ISP2(align));
	ASSERT(align == 0 || align >= PAGESIZE);

	lo = *basep;
	hi = lo + *lenp;
	tot_len = minlen + 2 * redzone;	/* need at least this much space */

	/* If hi rolled over the top try cutting back. */
	if (hi < lo) {
		*lenp = 0UL - (uintptr_t)lo - 1UL;
		/* Trying to see if this really happens, and then if so, why */
		valid_va_range_aligned_wraparound++;
		hi = lo + *lenp;
	}
	if (*lenp < tot_len) {
		return (0);
	}

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */

	if (lo < hole_start) {
		if (hi > hole_start)
			if (hi < hole_end)
				hi = hole_start;
			else
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= tot_len)
						hi = hole_start;
					else if (hi - hole_end >= tot_len)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= tot_len)
						lo = hole_end;
					else if (hole_start - lo >= tot_len)
						hi = hole_start;
					else
						return (0);
				}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	/* Check if remaining length is too small */
	if (hi - lo < tot_len) {
		return (0);
	}
	if (align > 1) {
		caddr_t tlo = lo + redzone;
		caddr_t thi = hi - redzone;
		tlo = (caddr_t)P2PHASEUP((uintptr_t)tlo, align, off);
		if (tlo < lo + redzone) {
			return (0);
		}
		if (thi < tlo || thi - tlo < minlen) {
			return (0);
		}
	}
	*basep = lo;
	*lenp = hi - lo;
	return (1);
}

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}
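
/*
 * Usage sketch for valid_va_range_aligned() above (hypothetical values, not
 * code from this file): ask whether [base, base + len) can hold a 4M-long,
 * 4M-aligned piece with a one-page redzone on either side, preferring the
 * low end of the range:
 *
 *	if (valid_va_range_aligned(&base, &len, MMU_PAGESIZE4M, AH_LO,
 *	    MMU_PAGESIZE4M, PAGESIZE, 0) == 0)
 *		return (0);
 *
 * On success, base and len are updated to describe the surviving range,
 * redzones included.
 */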

/*
 * Default to forbidding the first 64k of address space.  This protects most
 * reasonably sized structures from dereferences through NULL:
 *     ((foo_t *)0)->bar
 */
uintptr_t forbidden_null_mapping_sz = 0x10000;

/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	if ((addr <= (caddr_t)forbidden_null_mapping_sz) &&
	    as->a_proc != NULL &&
	    secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}

/*
 * Routine used to check whether an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */

/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}

void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}

/*
 * Return a non-zero value if the address may cause a VAC alias with KPM
 * mappings.  KPM selects an address that is equal, modulo shm_alignment, to
 * the offset, and assumes it can't be in VAC conflict with any larger than
 * PAGESIZE mapping.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	if (vac) {
		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
	} else {
		return (0);
	}
}
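
/*
 * Worked example for map_addr_vacalign_check() above (a sketch, assuming
 * vac is set and shm_alignment is 0x10000): addr = 0x30000000 and
 * off = 0x2000 give ((0x30000000 ^ 0x2000) & 0xffff) == 0x2000, a non-zero
 * result, meaning the proposed address could alias in the VAC with the KPM
 * mapping of the same pages; the caller should instead pick an address
 * whose low shm_alignment bits match those of off.
 */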

/*
 * Sanity control.  Don't use large pages regardless of user settings if
 * there's less than privm_lpg_min_physmem or shm_lpg_min_physmem memory
 * installed.  The unit for these variables is 8K pages.
 */
pgcnt_t shm_lpg_min_physmem = 131072;		/* 1GB */
pgcnt_t privm_lpg_min_physmem = 131072;		/* 1GB */

static size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz = MMU_PAGESIZE;
	int szc;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 * Use at least the default pagesize.
	 */
	if (len == 0) {
		len = p->p_brkbase + p->p_brksize - p->p_bssbase;
	}
	len = MAX(len, default_uheap_lpsize);

	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
		pgsz = hw_page_array[szc].hp_size;
		if ((disable_auto_data_large_pages & (1 << szc)) ||
		    pgsz > max_uheap_lpsize)
			continue;
		if (len >= pgsz) {
			break;
		}
	}

	/*
	 * If addr == 0 we were called by memcntl() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}

static size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz = MMU_PAGESIZE;
	int szc;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 * Use at least the default pagesize.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}
	len = MAX(len, default_ustack_lpsize);

	for (szc = mmu_page_sizes - 1; szc >= 0; szc--) {
		pgsz = hw_page_array[szc].hp_size;
		if ((disable_auto_data_large_pages & (1 << szc)) ||
		    pgsz > max_ustack_lpsize)
			continue;
		if (len >= pgsz) {
			break;
		}
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}

static size_t
map_pgszism(caddr_t addr, size_t len)
{
	uint_t szc;
	size_t pgsz;

	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
		if (disable_ism_large_pages & (1 << szc))
			continue;

		pgsz = hw_page_array[szc].hp_size;
		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
			return (pgsz);
	}

	return (DEFAULT_ISM_PAGESIZE);
}
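
/*
 * Example for map_pgszism() above (hypothetical values): a 64M ISM segment
 * whose base address is 4M aligned but not 32M aligned returns 4M, assuming
 * TTE4M is not set in disable_ism_large_pages; a segment smaller than 4M
 * never matches in the loop and falls back to DEFAULT_ISM_PAGESIZE.
 */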

/*
 * Suggest a page size to be used to map a segment of type maptype and length
 * len.  Returns a page size (not a size code).
 */
/* ARGSUSED */
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	size_t pgsz = MMU_PAGESIZE;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_ISM:
		pgsz = map_pgszism(addr, len);
		break;

	case MAPPGSZ_STK:
		if (max_ustack_lpsize > MMU_PAGESIZE) {
			pgsz = map_pgszstk(p, addr, len);
		}
		break;

	case MAPPGSZ_HEAP:
		if (max_uheap_lpsize > MMU_PAGESIZE) {
			pgsz = map_pgszheap(p, addr, len);
		}
		break;
	}
	return (pgsz);
}


/* assumes TTE8K...TTE4M == szc */

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, int disable_lpgs,
    size_t max_lpsize, size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}
	for (i = mmu_page_sizes - 1; i > 0; i--) {
		if (disable_lpgs & (1 << i)) {
			continue;
		}
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		szcvec |= (1 << i);
		/*
		 * OR in the remaining enabled page sizes.
		 */
		szcvec |= P2PHASE(~disable_lpgs, (1 << i));
		szcvec &= ~1;	/* no need to return 8K pagesize */
		break;
	}
	return (szcvec);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map the [addr, addr + len) region.
 */
/* ARGSUSED */
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	if (flags & MAP_TEXT) {
		return (map_szcvec(addr, size, off,
		    disable_auto_text_large_pages,
		    max_utext_lpsize, shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_uidata_lpsize, privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_shm_lpsize, shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_uheap_lpsize, privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_ustack_lpsize, privm_lpg_min_physmem));

	} else {
		return (map_szcvec(addr, size, off,
		    disable_auto_data_large_pages,
		    max_privmap_lpsize, privm_lpg_min_physmem));
	}
}
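
/*
 * Example for map_szcvec()/map_pgszcvec() above (hypothetical values,
 * assuming the 8K/64K/512K/4M sizes are all enabled and physmem is above
 * the threshold): a 16M private anon mapping whose start address is 4M
 * aligned relative to off yields a vector with the TTE4M, TTE512K and
 * TTE64K bits set and the TTE8K bit cleared, i.e. the caller may use any
 * of the large sizes for the parts of the range where they fit.
 */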

/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of the array (which is allocated during
 * startup based on physmax) and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.  So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

/*
 * Calculate space needed for page freelists and counters
 */
size_t
calc_free_pagelist_sz(void)
{
	int szc;
	size_t alloc_sz, cache_sz, free_sz;

	/*
	 * one cachelist per color, node, and type
	 */
	cache_sz = (page_get_pagecolors(0) * sizeof (page_t *)) +
	    sizeof (page_t **);
	cache_sz *= max_mem_nodes * MAX_MEM_TYPES;

	/*
	 * one freelist per size, color, node, and type
	 */
	free_sz = sizeof (page_t **);
	for (szc = 0; szc < mmu_page_sizes; szc++)
		free_sz += sizeof (page_t *) * page_get_pagecolors(szc);
	free_sz *= max_mem_nodes * MAX_MEM_TYPES;

	alloc_sz = cache_sz + free_sz + page_ctrs_sz();
	return (alloc_sz);
}

caddr_t
alloc_page_freelists(caddr_t alloc_base)
{
	int mnode, mtype;
	int szc, clrs;

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype] = (page_t ***)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (page_t **));
		for (mnode = 0; mnode < max_mem_nodes; mnode++) {
			page_cachelists[mtype][mnode] = (page_t **)alloc_base;
			alloc_base +=
			    (page_get_pagecolors(0) * sizeof (page_t *));
		}
	}

	/*
	 * Allocate freelists bins for all
	 * supported page sizes.
	 */
	for (szc = 0; szc < mmu_page_sizes; szc++) {
		clrs = page_get_pagecolors(szc);
		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_freelists[szc][mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				page_freelists[szc][mtype][mnode] =
				    (page_t **)alloc_base;
				alloc_base += (clrs * (sizeof (page_t *)));
			}
		}
	}

	alloc_base = page_ctrs_alloc(alloc_base);
	return (alloc_base);
}
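
/*
 * Layout sketch for the block carved up by alloc_page_freelists() above
 * (illustrative; the exact sizes depend on max_mem_nodes, MAX_MEM_TYPES and
 * the color counts): calc_free_pagelist_sz() sizes one contiguous region
 * that alloc_page_freelists() then partitions, in order, into (1) the
 * per-type/per-node cachelist bins, page_get_pagecolors(0) pointers per
 * node, (2) the per-size/per-type/per-node freelist bins,
 * page_get_pagecolors(szc) pointers per node, and (3) the page counter
 * space sized by page_ctrs_sz() and handed to page_ctrs_alloc().
 */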

/*
 * Allocate page_freelists locks for a memnode from the nucleus data
 * area.  This is the first time that mmu_page_sizes is used during
 * bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_mutexs(struct memlist *ndata)
{
	size_t alloc_sz;
	caddr_t alloc_base;
	int i;
	void page_coloring_init();

	page_coloring_init();
	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(0)) {
			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
			    mmu_page_sizes);
		}
	}
	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

	/* fpc_mutex and cpc_mutex */
	alloc_sz = 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

	for (i = 0; i < NPC_MUTEX; i++) {
		fpc_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
		cpc_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
	}
	return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint32_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    (hw_page_array[0].hp_colors - 1)));
	}

	do {
		old = color_start_current;
		new = old +
		    (color_start_stride << (vac_shift - MMU_PAGESHIFT));
	} while (atomic_cas_32(&color_start_current, old, new) != old);

	return ((uint_t)(new));
}
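
/*
 * Illustration for get_color_start() above (a sketch, assuming vac_shift ==
 * MMU_PAGESHIFT so the stride is not scaled): successive calls return 337,
 * 674, 1011, ... as the shared counter advances.  Because 337 is odd, and
 * therefore coprime with any power-of-two number of page colors, reducing
 * these values modulo the color count cycles through all of the bins rather
 * than clustering on a few of them.
 */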

/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * Initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int a, i;
	uint_t colors;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		for (i = 0; i < mmu_page_sizes; i++) {
			colorequivszc[i] = 0;
			hw_page_array[i].hp_colors = 1;
		}
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize.  ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}

	/* initialize number of colors per page size */
	for (i = 0; i < mmu_page_sizes; i++) {
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
		colorequivszc[i] = 0;
	}

	/*
	 * initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors is set to -1 during DR operation or during startup
	 * if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_setsize > 0 && cpu_page_colors == 0 &&
	    cpu_setsize < ecache_setsize) {
		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
		a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		for (i = 0; i < mmu_page_sizes; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encodes color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* do cpu specific color initialization */
	if (&page_coloring_init_cpu) {
		page_coloring_init_cpu();
	}
}

int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if ((bp->b_flags & B_PAGEIO) != 0) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}
	return (color < 0 ? 0 : ptob(color));
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);

	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);

	return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

	return (sz);	/* no overlap */
}

/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * This function chooses the large page size for the kernel heap.
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;
	size_t mmusz;
	uint_t szc;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	mmusz = mmu_get_kernel_lpsize(lpsize);
	szc = page_szc(mmusz);

	while (szc) {
		if (!(disable_large_pages & (1 << szc)))
			return (page_get_pagesize(szc));
		szc--;
	}
	return (PAGESIZE);
}
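
/*
 * Example for get_segkmem_lpsize() above (illustrative): on a machine with
 * at least segkmem_lpminphysmem of memory and no platform objection from
 * plat_lpkmem_is_supported(), if mmu_get_kernel_lpsize() settles on 4M but
 * TTE4M happens to be set in disable_large_pages, the loop walks down the
 * remaining size codes and returns the largest enabled page size, falling
 * back to PAGESIZE when every large size is disabled.
 */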