/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the user can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}
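/*
 * Worked example for map_pgsz() above, with illustrative sizes (4 KB base
 * pages and a 2 MB level-1 page size): a MAPPGSZ_HEAP request with
 * len = 3 MB stops at the first level whose size fits, so LEVEL_SIZE(1)
 * (2 MB) is returned; a 1 MB request walks all the way down to level 0 and
 * gets MMU_PAGESIZE back.
 */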
/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and library text segments.
 */
int	use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}
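/*
 * Illustrative example (assuming 4 KB base pages and a 2 MB level-1 page
 * size): with use_text_largepages set, a text region that still contains at
 * least one full 2 MB page after its start is rounded up to a 2 MB boundary
 * returns 0x2, i.e. only page size code 1 is usable; a shorter or unusably
 * placed region returns 0 and is mapped with base pages only.
 */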
/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * the rest of this function implements 3.X/4.X/5.X compatibility
	 * This code is probably not needed anymore
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request.  This is done to leave
	 * one page unmapped between segments.  This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will fault immediately, making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
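/*
 * Worked example of the final alignment fix-up in map_addr_proc(), with
 * illustrative values only: suppose align_amount is 0x10000, the offset's
 * low bits (off & (align_amount - 1)) are 0x3000, and as_gap() yields a
 * candidate addr of 0xB7FD5123.  Rounding down gives 0xB7FD0000, adding the
 * offset bits gives 0xB7FD3000, and since that is below the candidate,
 * align_amount is added to produce 0xB7FE3000.  The result preserves the
 * requested page-offset coloring, and the earlier "len += align_amount"
 * padding is what guarantees the upward adjustment still fits in the hole.
 */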
/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}
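/*
 * Note on the amd64 case above (descriptive, not from the original source
 * comments): [hole_start, hole_end) is the processor's non-canonical
 * address hole, the gap between the top of the lower canonical half and the
 * bottom of the upper canonical half of the 64-bit virtual address space.
 * Nothing may ever be mapped there, which is why a request that straddles
 * the hole must be trimmed to whichever side still satisfies minlen, with
 * the dir argument breaking the tie.
 */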
/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}
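/*
 * Illustrative example of the two lookups above, using the default
 * arch_memranges[] table: a pfn of 0x120000 (memory above 4G) satisfies
 * pfn >= memranges[0], so memrange_num() returns 0, while a pfn of 0x00800
 * (below 16M) falls through to the last index, NUM_MEM_RANGES - 1.
 * pfn_2_mtype() performs the analogous search over mnoderanges[], which
 * mnode_range_setup() builds in ascending mnr_pfnlo order, returning the
 * highest mtype whose low bound does not exceed the pfn.
 */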
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * to satisfy the request, we must acquire at least
		 * minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where we last searched if minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* we did not start at lo previously; retry from the start of the range */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
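/*
 * Illustrative example of the minctg/sgllen interplay above: a request for
 * *pgcnt = 32 pages against a DMA engine with dma_attr_sgllen = 4 only needs
 * runs of minctg = howmany(32, 4) = 8 contiguous pages.  Each successful run
 * consumes one scatter-gather entry, *pgcnt is reduced by the run length
 * inside is_contigpage_free(), and minctg is recomputed for the entries that
 * remain, so the allocator never demands more contiguity than the caller's
 * scatter-gather list actually requires.
 */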
/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memrange that contains the mnode's physbase */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype is the
 * first index of a range of indices to be searched downward, bounded below
 * by index 0 (PGI_MT_RANGE0) or by the first mtype above 4g (PGI_MT_RANGE4G).
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim = 0;	/* default to PGI_MT_RANGE0 */

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters.  Called from add_physmem() when physical memory with
 * page_t's is initially added to the page lists.
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	int	mtype = 0;
	pfn_t	endpfn = startpfn + cnt, pfn;
	pgcnt_t	inc;

	ASSERT(cnt > 0);

	for (pfn = startpfn; pfn < endpfn; ) {
		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
				inc = endpfn - pfn;
			} else {
				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
			}
			mnoderanges[mtype].mnr_mt_pgmax += inc;
			if (physmax4g && mtype <= mtype4g)
				maxmem4g += inc;
			pfn += inc;
		}
		mtype++;
		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
	}
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += (mnoderanges[mtype].mnr_mt_flpgcnt +
		    mnoderanges[mtype].mnr_mt_lgpgcnt +
		    mnoderanges[mtype].mnr_mt_clpgcnt);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}
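/*
 * Worked example for the color sizing above, with illustrative hardware
 * values: a 512 KB, 8-way set-associative L2 with 4 KB pages gives
 * l2_colors = 512K / (8 * 4K) = 16, so page_colors = MAX(16, PAGE_COLORS_MIN)
 * = 16 and cpu_page_colors stays 0.  A 256 KB, 16-way L2 gives l2_colors = 4,
 * so page_colors is raised to PAGE_COLORS_MIN (16) and cpu_page_colors is set
 * to 4, meaning each hardware color is spread across several freelist bins.
 */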
/*
 * Called once at startup to configure page_coloring data structures and
 * do the first page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);


	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		i = 0;
		while (i <= page_colors) {
			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only to a page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype is handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}
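/*
 * Illustrative example of the mtype range computed above: for a device whose
 * dma_attr limits it to addresses below 4G on a machine with memory above 4G,
 * pfn_2_mtype(pfnlo) and pfn_2_mtype(pfnhi) select only the mnode ranges at
 * or below the 4G boundary, and the loop then walks those mtypes from the
 * highest down to the lowest, so the low memory that other devices may also
 * need is consumed last.  fullrange is set only when the DMA limits fully
 * cover the selected mnode ranges, which allows the simpler
 * page_get_mnode_freelist()/page_get_mnode_cachelist() paths to be used.
 */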
/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions.  This function is used by the segkmem
 * allocator, so it only creates new pages (i.e., PG_EXCL is set).
 *
 * Note: This interface is currently used only by the x86 PSM and is
 *	 not fully specified, so the commitment level is only that of a
 *	 private interface specific to x86.  This interface uses the
 *	 PSM-specific page_get_anylist() interface.
 */

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use them.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}


/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of the page.
 *
 * This uses CPU private page address #2; see ppcopy() for more info.
 * pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * set up two private addresses for use on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}


/*
 * Create the pageout scanner thread. The thread starts at `procedure',
 * in process pp, with priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}