/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

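/*
 * Note: desfree4gshift and lotsfree4gshift are right-shift amounts applied
 * to maxmem4g to derive the DESFREE4G and (presumably) LOTSFREE4G free-page
 * thresholds consulted by the below-4G allocation paths when physmax4g is
 * set.
 */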
/* 16m memory management: desired number of free pages below 16m. */
pgcnt_t desfree16m = 0x380;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and library text segments.
 */
int	use_text_largepages = 0;
int	use_shm_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t	a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

uint_t
map_shm_pgszcvec(caddr_t addr, size_t len, uintptr_t off)
{
	size_t	pgsz;
	caddr_t	a;

	if (!use_shm_largepages || mmu.max_page_level == 0) {
		return (0);
	}

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len ||
	    P2PHASE((uintptr_t)addr ^ off, pgsz)) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X, 4.X and 5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request.  This is done to leave
	 * one page unmapped between segments.  This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

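/*
 * Each hw_page_array[] entry records the size, shift, constituent base-page
 * count and number of page colors for one hardware page size; the entries
 * are filled in by page_coloring_init() below.
 */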
/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 0;
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
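	/*
	 * Second pass: the first pass began at the cached startpfn, so wrap
	 * around and retry the [lo, startpfn) portion that was skipped.
	 */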
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt(int mnode)
{
	int	mri;
	int	mnrcnt = 0;

	if (mem_node_config[mnode].exists != 0) {
		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim;

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE0)
			mtlim = 0;
		else if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;	/* exclude 0-4g range */
		else if (flags & PGI_MT_RANGE16M)
			mtlim = 1;		/* exclude 0-16m range */
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters.
 * Called from add_physmem() when physical memory with page_t's is
 * initially added to the page lists.
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	int	mtype = 0;
	pfn_t	endpfn = startpfn + cnt, pfn;
	pgcnt_t	inc;

	ASSERT(cnt > 0);

	for (pfn = startpfn; pfn < endpfn; ) {
		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
				inc = endpfn - pfn;
			} else {
				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
			}
			mnoderanges[mtype].mnr_mt_pgmax += inc;
			if (physmax4g && mtype <= mtype4g)
				maxmem4g += inc;
			pfn += inc;
		}
		mtype++;
		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
	}
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += MTYPE_FREEMEM(mtype);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* initialize number of colors per page size */
	for (i = 0; i <= mmu.max_page_level; i++) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
	}

	/*
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_page_colors != 0) {

		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				colorequivszc[i] = 0;
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encodes color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* factor in colorequiv to check additional 'equivalent' bins. */
	if (colorequiv > 1) {

		int a = lowbit(colorequiv) - 1;
		if (a > 15)
			a = 15;

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}

	/* size for mnoderanges */
	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
		mnoderangecnt += mnode_range_cnt(i);
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

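/*
 * The buffer sized by page_coloring_init() above is carved up by
 * page_coloring_setup() below in this order: the mnoderanges array, the
 * fpc_mutex and cpc_mutex arrays (max_mem_nodes mutexes per NPC_MUTEX
 * entry), the per-mnoderange page_freelists (per page size, per color)
 * and finally the per-mnoderange page_cachelists.
 */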
/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t		*pcm;
	int			i;
	page_t			*pp;
	page_t			*first_pp;
	uint64_t		pgaddr;
	ulong_t			bin;
	int			mtypestart;
	int			plw_initialized;
	page_list_walker_t	plw;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		plw_initialized = 0;

		for (plw.plw_count = 0;
		    plw.plw_count < page_colors; plw.plw_count++) {

			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			if (plw_initialized == 0) {
				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
				ASSERT(plw.plw_ceq_dif == page_colors);
				plw_initialized = 1;
			}

			if (plw.plw_do_split) {
				pp = page_freelist_split(szc, bin, mnode,
				    mtype,
				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
				    &plw);
				if (pp != NULL)
					return (pp);
			}

			bin = page_list_walk_next_bin(szc, bin, &plw);
		}

		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes.  The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only for page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions.  This function is used by the segkmem
 * allocator so it is only used to create new pages (i.e., PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 * not fully specified so the commitment level is only for
 * private interface specific to x86.  This interface uses PSM
 * specific page_get_anylist() interface.
 */
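/*
 * PAGE_HASH_SEARCH walks the p_hash chain of the given page hash bucket
 * looking for the page identified by (vp, off); page_create_io() below
 * invokes it with the corresponding page hash mutex held.
 */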

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found!  This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use them.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}


/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * We do this using the CPU private page address #2; see ppcopy() for more
 * info.  pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * set up two private addresses on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}


/*
 * Create the pageout scanner thread. The thread has to
 * start at procedure with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}