/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

/* 16m memory management: desired number of free pages below 16m. */
pgcnt_t desfree16m = 0x380;

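/*
 * Allocation statistics, compiled in only when VM_STATS is defined; the
 * pga_* counters are updated by page_get_anylist() and the pgma_* counters
 * by page_get_mnode_anylist() below.
 */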
#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
/*
 * Number of pages in 1 GB. Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and libraries text segments.
 */
int	use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t	a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments. If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above. We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

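/*
 * Pick a virtual address for a user mapping: a thin wrapper that selects
 * the 32-bit user limit when _MAP_LOW32 is requested and defers the real
 * work to map_addr_proc() below.
 */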
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user. We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion. We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space. For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately, making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part. Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size. Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 0;
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

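/*
 * Locks protecting the page freelist (fpc_mutex) and cachelist (cpc_mutex)
 * bins; the backing storage for these arrays is carved out of the page
 * coloring memory in page_coloring_setup() below.
 */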
kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

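/*
 * Gather a list of physically contiguous pages satisfying the DMA
 * attributes in mattr (address range, segment boundary, alignment and
 * scatter-gather length). *pgcnt is the number of pages requested and is
 * updated as pages are collected; pages are returned exclusively locked,
 * and io-locked as well when iolock is set. Called by page_create_io()
 * below for PG_PHYSCONTIG requests.
 */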
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memranges index containing the start of mnode */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim;

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE0)
			mtlim = 0;
		else if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;	/* exclude 0-4g range */
		else if (flags & PGI_MT_RANGE16M)
			mtlim = 1;		/* exclude 0-16m range */
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters. Called from add_physmem() when physical memory with
 * page_t's are initially added to the page lists.
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	int	mtype = 0;
	pfn_t	endpfn = startpfn + cnt, pfn;
	pgcnt_t	inc;

	ASSERT(cnt > 0);

	for (pfn = startpfn; pfn < endpfn; ) {
		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
				inc = endpfn - pfn;
			} else {
				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
			}
			mnoderanges[mtype].mnr_mt_pgmax += inc;
			if (physmax4g && mtype <= mtype4g)
				maxmem4g += inc;
			pfn += inc;
		}
		mtype++;
		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
	}
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += MTYPE_FREEMEM(mtype);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

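/*
 * Page color to use for a buffer: no buffer-address-based coloring is done
 * on x86, so color 0 is always used.
 */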
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);


	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		i = 0;
		while (i <= page_colors) {
			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it. Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only for page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only to create new pages (i.e., PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 * not fully specified so the commitment level is only for
 * private interface specific to x86. This interface uses PSM
 * specific page_get_anylist() interface.
 */

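/*
 * Walk the page hash chain looking for a page with identity (vp, off).
 * The caller is expected to hold the corresponding page hash mutex; this
 * appears to mirror the macro of the same name used by page_create_va().
 */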
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back. The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock. This will minimize the hash
	 * lock hold time, nesting, and the like. If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use it.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail. If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules. Panic now and
				 * get it over with. As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page! It is locked. Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}


/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * We do this using CPU private page address #2, see ppcopy() for more info.
 * pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * set up two private addresses for use on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}


/*
 * Create the pageout scanner thread. The thread starts running procedure
 * in process pp at priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping. Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}