/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

/* 16m memory management: desired number of free pages below 16m. */
pgcnt_t desfree16m = 0x380;
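
/*
 * Illustrative note (a sketch, not taken from this file): the below-4g
 * watermarks are presumably derived from maxmem4g with the shifts above,
 * roughly
 *
 *	DESFREE4G  == maxmem4g >> desfree4gshift	(1/16 of maxmem4g)
 *	LOTSFREE4G == maxmem4g >> lotsfree4gshift	(1/8 of maxmem4g)
 *
 * while desfree16m is an absolute page count (0x380 pages, about 3.5MB
 * with 4K pages).  Raising the shifts via /etc/system shrinks the
 * corresponding reserve.
 */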

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and libraries text segments.
 */
int	use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t	a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
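		/*
		 * Not within [p_brkbase, p_brkbase + p_brksize), so switch
		 * the check below to the stack: the valid stack range is
		 * [p_usrstack - p_stksize, p_usrstack).
		 */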
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
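	/*
	 * A minimal sketch of the clipping below (illustrative only; the
	 * picture is not to scale and the hole bounds are whatever the
	 * platform defines for hole_start/hole_end):
	 *
	 *	lo ======|.......... hole ..........|====== hi
	 *	     hole_start                 hole_end
	 *
	 * A range that straddles the hole keeps whichever side still
	 * holds at least minlen bytes; dir (AH_LO vs. anything else)
	 * decides which side is preferred when both qualify.
	 */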
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
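
/*
 * Illustrative sketch of how the arch_memranges[] table above is consumed
 * by memrange_num() below.  The pfn values are hypothetical and assume the
 * full four-entry table is still in use (it is trimmed at startup on small
 * machines):
 *
 *	memrange_num(0x00800)  == 3	below 16M
 *	memrange_num(0x40000)  == 2	in the 16M-2G range
 *	memrange_num(0xc0000)  == 1	in the 2G-4G range
 *	memrange_num(0x200000) == 0	at or above 4G
 */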

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim;

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE0)
			mtlim = 0;
		else if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;	/* exclude 0-4g range */
		else if (flags & PGI_MT_RANGE16M)
			mtlim = 1;		/* exclude 0-16m range */
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}
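
/*
 * A minimal sketch of the intended iteration pattern (mnode_pgcnt() below
 * uses exactly this); nothing here is new API, it just restates the calls
 * above:
 *
 *	int mtype = mtype_func(mnode, mnoderangecnt - 1, PGI_MT_RANGE0);
 *	while (mtype != -1) {
 *		... look at mnoderanges[mtype] ...
 *		mtype = mtype_func(mnode, mtype,
 *		    PGI_MT_RANGE0 | PGI_MT_NEXT);
 *	}
 */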

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters.  Called from add_physmem() when physical memory with
 * page_t's are initially added to the page lists.
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	int	mtype = 0;
	pfn_t	endpfn = startpfn + cnt, pfn;
	pgcnt_t	inc;

	ASSERT(cnt > 0);

	for (pfn = startpfn; pfn < endpfn; ) {
		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
				inc = endpfn - pfn;
			} else {
				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
			}
			mnoderanges[mtype].mnr_mt_pgmax += inc;
			if (physmax4g && mtype <= mtype4g)
				maxmem4g += inc;
			pfn += inc;
		}
		mtype++;
		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
	}
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += MTYPE_FREEMEM(mtype);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
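	/*
	 * Worked example (the cache parameters are hypothetical, not from
	 * any probed hardware): a 2MB, 8-way L2 with 4K pages gives
	 * l2_colors = 2MB / (8 * 4K) = 64, so page_colors = 64 and
	 * cpu_page_colors stays 0.  A 512K, 16-way L2 gives l2_colors = 8,
	 * which is below PAGE_COLORS_MIN, so page_colors = 16 and
	 * cpu_page_colors is set to 8 below.
	 */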
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);


	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		i = 0;
		while (i <= page_colors) {
			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only to a page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling thru mtype handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only used to create new pages (i.e., PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 *	 not fully specified so the commitment level is only for
 *	 private interface specific to x86. This interface uses PSM
 *	 specific page_get_anylist() interface.
 */

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range.  For now, we wait for any
				 * pages and see if we can use them.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free.  So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
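
/*
 * A minimal usage sketch of page_create_io() (hypothetical caller; the
 * attribute values below are illustrative, not taken from any PSM code):
 *
 *	ddi_dma_attr_t attr = ...;	with dma_attr_addr_hi = 0xffffff
 *	page_t *pp = page_create_io(&kvp, off, ptob(4),
 *	    PG_EXCL | PG_WAIT | PG_PHYSCONTIG, &kas, NULL, &attr);
 *
 * would ask for four exclusively locked, physically contiguous pages
 * below 16MB, blocking in the freemem/pcf accounting if necessary.
 */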

/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * This uses CPU private page address #2; see ppcopy() for more info.
 * pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * set up two private addresses on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}
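
/*
 * Sketch of the per-CPU plumbing above (the exact call sites are an
 * assumption; the private mappings are set up as each CPU is brought up):
 *
 *	setup_vaddr_for_ppcopy(cp);	once per struct cpu
 *	...
 *	ppcopy(frompp, topp);		may borrow cp->cpu_caddr1/2
 *	pagezero(pp, 0, PAGESIZE);	uses cp->cpu_caddr2 only
 *
 * When kpm_enable is set, both routines use the kpm mapping instead and
 * the private addresses are left alone.
 */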

/*
 * Create the pageout scanner thread.  The thread starts at 'procedure',
 * with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}