/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t		maxmem4g;
pgcnt_t		freemem4g;
int		physmax4g;
int		desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int		lotsfree4gshift = 3;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE;	/* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}
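
/*
 * Illustrative example (values assumed, not from the original code): on a
 * configuration where the largest page level is 1 and LEVEL_SIZE(1) is
 * 2 MB, a 5 MB MAPPGSZ_HEAP request satisfies len >= LEVEL_SIZE(1) and
 * map_pgsz() returns 2 MB, while a 1 MB request falls through to level 0
 * and returns MMU_PAGESIZE (4 KB).  A MAPPGSZ_ISM request on the same
 * configuration always returns LEVEL_SIZE(1).
 */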
/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and libraries text segments.
 */
int	use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements SunOS 3.x/4.x/5.x
	 * compatibility.  This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
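
/*
 * Worked example for the alignment logic above (illustrative, addresses
 * assumed): with align_amount == 4 MB (0x400000) and off == 0x3000, an
 * as_gap() candidate of as_addr == 0x23456000 is first rounded down to
 * 0x23400000, the offset is added to give 0x23403000, and since that is
 * below as_addr the alignment amount is added back, yielding 0x23803000.
 * The result is within align_amount of as_addr and shares off's low bits,
 * exactly as the ASSERTs verify.
 */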
/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
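
/*
 * Illustrative note (4 KB pages assumed): the arch_memranges[] boundaries
 * above correspond to byte addresses of 0x100000 * 4 KB = 4 GB,
 * 0x80000 * 4 KB = 2 GB and 0x1000 * 4 KB = 16 MB, so memrange_num()
 * below maps, for example, a pfn of 0x90000 (2.25 GB) to the 2G-4G entry
 * and a pfn of 0x800 (8 MB) to the 0-16M entry.
 */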
/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}
	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
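	/*
	 * The loop below rescans [lo, startpfn) so that a search which
	 * began partway through the range (at the remembered startpfn, or
	 * above PFN_16M) still visits the pfns it skipped before giving up.
	 */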
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}
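
/*
 * Illustrative example (assumed configuration): a single memory node
 * spanning pfns 0 through 0x17ffff (0-6 GB with 4 KB pages) overlaps the
 * 0-16M, 16M-2G, 2G-4G and 4G+ entries of memranges[], so mnode_range_cnt()
 * reports four ranges and mnode_range_setup() emits one mnoderange_t per
 * overlap: {0, 0xfff}, {0x1000, 0x7ffff}, {0x80000, 0xfffff} and
 * {0x100000, 0x17ffff}.
 */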
/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype is the
 * upper bound of a range of indices that is searched down to index 0
 * (PGI_MT_RANGE0) or down to the first index above the 4g boundary
 * (PGI_MT_RANGE4G).
 *
 * Return the first mnode range type index found; otherwise return -1 if
 * none is found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim = 0;	/* default to PGI_MT_RANGE0 */

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += (mnoderanges[mtype].mnr_mt_flpgcnt +
		    mnoderanges[mtype].mnr_mt_lgpgcnt +
		    mnoderanges[mtype].mnr_mt_clpgcnt);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
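	/*
	 * For example (illustrative values): a 512 KB, 8-way set
	 * associative l2 cache with 4 KB pages gives
	 * l2_colors = 512K / (8 * 4K) = 16, so page_colors is 16 as well.
	 */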
	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);


	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		i = 0;
		while (i <= page_colors) {
			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can only guarantee alignment to a page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only used to create new pages (i.e PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 * not fully specified so the commitment level is only for
 * private interface specific to x86. This interface uses PSM
 * specific page_get_anylist() interface.
 */

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use them.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);
		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
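
/*
 * Usage sketch (hypothetical, for illustration only): a PSM component that
 * needs physically contiguous pages below 16 MB might describe the range
 * with a ddi_dma_attr_t whose dma_attr_addr_lo is 0, dma_attr_addr_hi is
 * 0xffffff and dma_attr_sgllen is 1, and then call
 *
 *	pp = page_create_io(&kvp, off, ptob(npages),
 *	    PG_EXCL | PG_WAIT | PG_PHYSCONTIG, &kas, vaddr, &attr);
 *
 * The pages come back EXCL- and io-locked and linked through p_next, and
 * are released through the normal page layer when no longer needed.
 */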

/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that the CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * We do this using CPU private page address #2, see ppcopy() for more info.
 * pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * set up two private addresses on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}
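
/*
 * Note: setup_vaddr_for_ppcopy() is expected to be called once per CPU
 * during CPU startup, before that CPU uses ppcopy() or pagezero() on a
 * non-kpm system; the two VA windows and PTE pointers reserved here are
 * the cpu_caddr1/cpu_caddr2 slots that those routines remap with
 * hat_mempte_remap() under cpu_ppaddr_mutex.
 */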

/*
 * Create the pageout scanner thread.  The thread will start at the given
 * procedure, under process pp, with priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}