/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE;	/* used by zmap() */

/*
 * Return the optimum page size for a given mapping.
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and library text segments.
 */
int	use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t	a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * the rest of this function implements 3.X, 4.X, 5.X compatibility
	 * This code is probably not needed anymore
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
383 */ 384 slen = p->p_usrstack - base - 385 (((size_t)rctl_enforced_value( 386 rctlproc_legacy[RLIMIT_STACK], 387 p->p_rctls, p) + PAGEOFFSET) & PAGEMASK); 388 } 389 } else 390 #endif 391 slen = userlimit - base; 392 393 len = (len + PAGEOFFSET) & PAGEMASK; 394 395 /* 396 * Redzone for each side of the request. This is done to leave 397 * one page unmapped between segments. This is not required, but 398 * it's useful for the user because if their program strays across 399 * a segment boundary, it will catch a fault immediately making 400 * debugging a little easier. 401 */ 402 len += 2 * MMU_PAGESIZE; 403 404 /* 405 * figure out what the alignment should be 406 * 407 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same???? 408 */ 409 if (len <= ELF_386_MAXPGSZ) { 410 /* 411 * Align virtual addresses to ensure that ELF shared libraries 412 * are mapped with the appropriate alignment constraints by 413 * the run-time linker. 414 */ 415 align_amount = ELF_386_MAXPGSZ; 416 } else { 417 int l = mmu.max_page_level; 418 419 while (l && len < LEVEL_SIZE(l)) 420 --l; 421 422 align_amount = LEVEL_SIZE(l); 423 } 424 425 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount)) 426 align_amount = (uintptr_t)*addrp; 427 428 len += align_amount; 429 430 /* 431 * Look for a large enough hole starting below userlimit. 432 * After finding it, use the upper part. Addition of PAGESIZE 433 * is for the redzone as described above. 434 */ 435 if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) { 436 caddr_t as_addr; 437 438 addr = base + slen - len + MMU_PAGESIZE; 439 as_addr = addr; 440 /* 441 * Round address DOWN to the alignment amount, 442 * add the offset, and if this address is less 443 * than the original address, add alignment amount. 444 */ 445 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 446 addr += (uintptr_t)(off & (align_amount - 1)); 447 if (addr < as_addr) 448 addr += align_amount; 449 450 ASSERT(addr <= (as_addr + align_amount)); 451 ASSERT(((uintptr_t)addr & (align_amount - 1)) == 452 ((uintptr_t)(off & (align_amount - 1)))); 453 *addrp = addr; 454 } else { 455 *addrp = NULL; /* no more virtual space */ 456 } 457 } 458 459 /* 460 * Determine whether [base, base+len] contains a valid range of 461 * addresses at least minlen long. base and len are adjusted if 462 * required to provide a valid range. 463 */ 464 /*ARGSUSED3*/ 465 int 466 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 467 { 468 uintptr_t hi, lo; 469 470 lo = (uintptr_t)*basep; 471 hi = lo + *lenp; 472 473 /* 474 * If hi rolled over the top, try cutting back. 475 */ 476 if (hi < lo) { 477 if (0 - lo + hi < minlen) 478 return (0); 479 if (0 - lo < minlen) 480 return (0); 481 *lenp = 0 - lo; 482 } else if (hi - lo < minlen) { 483 return (0); 484 } 485 #if defined(__amd64) 486 /* 487 * Deal with a possible hole in the address range between 488 * hole_start and hole_end that should never be mapped. 
489 */ 490 if (lo < hole_start) { 491 if (hi > hole_start) { 492 if (hi < hole_end) { 493 hi = hole_start; 494 } else { 495 /* lo < hole_start && hi >= hole_end */ 496 if (dir == AH_LO) { 497 /* 498 * prefer lowest range 499 */ 500 if (hole_start - lo >= minlen) 501 hi = hole_start; 502 else if (hi - hole_end >= minlen) 503 lo = hole_end; 504 else 505 return (0); 506 } else { 507 /* 508 * prefer highest range 509 */ 510 if (hi - hole_end >= minlen) 511 lo = hole_end; 512 else if (hole_start - lo >= minlen) 513 hi = hole_start; 514 else 515 return (0); 516 } 517 } 518 } 519 } else { 520 /* lo >= hole_start */ 521 if (hi < hole_end) 522 return (0); 523 if (lo < hole_end) 524 lo = hole_end; 525 } 526 527 if (hi - lo < minlen) 528 return (0); 529 530 *basep = (caddr_t)lo; 531 *lenp = hi - lo; 532 #endif 533 return (1); 534 } 535 536 /* 537 * Determine whether [addr, addr+len] are valid user addresses. 538 */ 539 /*ARGSUSED*/ 540 int 541 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 542 caddr_t userlimit) 543 { 544 caddr_t eaddr = addr + len; 545 546 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 547 return (RANGE_BADADDR); 548 549 #if defined(__amd64) 550 /* 551 * Check for the VA hole 552 */ 553 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 554 return (RANGE_BADADDR); 555 #endif 556 557 return (RANGE_OKAY); 558 } 559 560 /* 561 * Return 1 if the page frame is onboard memory, else 0. 562 */ 563 int 564 pf_is_memory(pfn_t pf) 565 { 566 return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1)); 567 } 568 569 570 /* 571 * initialized by page_coloring_init(). 572 */ 573 uint_t page_colors; 574 uint_t page_colors_mask; 575 uint_t page_coloring_shift; 576 int cpu_page_colors; 577 static uint_t l2_colors; 578 579 /* 580 * Page freelists and cachelists are dynamically allocated once mnoderangecnt 581 * and page_colors are calculated from the l2 cache n-way set size. Within a 582 * mnode range, the page freelist and cachelist are hashed into bins based on 583 * color. This makes it easier to search for a page within a specific memory 584 * range. 585 */ 586 #define PAGE_COLORS_MIN 16 587 588 page_t ****page_freelists; 589 page_t ***page_cachelists; 590 591 /* 592 * As the PC architecture evolved memory up was clumped into several 593 * ranges for various historical I/O devices to do DMA. 594 * < 16Meg - ISA bus 595 * < 2Gig - ??? 596 * < 4Gig - PCI bus or drivers that don't understand PAE mode 597 */ 598 static pfn_t arch_memranges[NUM_MEM_RANGES] = { 599 0x100000, /* pfn range for 4G and above */ 600 0x80000, /* pfn range for 2G-4G */ 601 0x01000, /* pfn range for 16M-2G */ 602 0x00000, /* pfn range for 0-16M */ 603 }; 604 605 /* 606 * These are changed during startup if the machine has limited memory. 607 */ 608 pfn_t *memranges = &arch_memranges[0]; 609 int nranges = NUM_MEM_RANGES; 610 611 /* 612 * Used by page layer to know about page sizes 613 */ 614 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; 615 616 /* 617 * This can be patched via /etc/system to allow old non-PAE aware device 618 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM. 
619 */ 620 #if defined(__i386) 621 int restricted_kmemalloc = 1; /* XX64 re-examine with PSARC 2004/405 */ 622 #elif defined(__amd64) 623 int restricted_kmemalloc = 0; 624 #endif 625 626 kmutex_t *fpc_mutex[NPC_MUTEX]; 627 kmutex_t *cpc_mutex[NPC_MUTEX]; 628 629 630 /* 631 * return the memrange containing pfn 632 */ 633 int 634 memrange_num(pfn_t pfn) 635 { 636 int n; 637 638 for (n = 0; n < nranges - 1; ++n) { 639 if (pfn >= memranges[n]) 640 break; 641 } 642 return (n); 643 } 644 645 /* 646 * return the mnoderange containing pfn 647 */ 648 int 649 pfn_2_mtype(pfn_t pfn) 650 { 651 int n; 652 653 for (n = mnoderangecnt - 1; n >= 0; n--) { 654 if (pfn >= mnoderanges[n].mnr_pfnlo) { 655 break; 656 } 657 } 658 return (n); 659 } 660 661 /* 662 * is_contigpage_free: 663 * returns a page list of contiguous pages. It minimally has to return 664 * minctg pages. Caller determines minctg based on the scatter-gather 665 * list length. 666 * 667 * pfnp is set to the next page frame to search on return. 668 */ 669 static page_t * 670 is_contigpage_free( 671 pfn_t *pfnp, 672 pgcnt_t *pgcnt, 673 pgcnt_t minctg, 674 uint64_t pfnseg, 675 int iolock) 676 { 677 int i = 0; 678 pfn_t pfn = *pfnp; 679 page_t *pp; 680 page_t *plist = NULL; 681 682 /* 683 * fail if pfn + minctg crosses a segment boundary. 684 * Adjust for next starting pfn to begin at segment boundary. 685 */ 686 687 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 688 *pfnp = roundup(*pfnp, pfnseg + 1); 689 return (NULL); 690 } 691 692 do { 693 retry: 694 pp = page_numtopp_nolock(pfn + i); 695 if ((pp == NULL) || 696 (page_trylock(pp, SE_EXCL) == 0)) { 697 (*pfnp)++; 698 break; 699 } 700 if (page_pptonum(pp) != pfn + i) { 701 page_unlock(pp); 702 goto retry; 703 } 704 705 if (!(PP_ISFREE(pp))) { 706 page_unlock(pp); 707 (*pfnp)++; 708 break; 709 } 710 711 if (!PP_ISAGED(pp)) { 712 page_list_sub(pp, PG_CACHE_LIST); 713 page_hashout(pp, (kmutex_t *)NULL); 714 } else { 715 page_list_sub(pp, PG_FREE_LIST); 716 } 717 718 if (iolock) 719 page_io_lock(pp); 720 page_list_concat(&plist, &pp); 721 722 /* 723 * exit loop when pgcnt satisfied or segment boundary reached. 724 */ 725 726 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 727 728 *pfnp += i; /* set to next pfn to search */ 729 730 if (i >= minctg) { 731 *pgcnt -= i; 732 return (plist); 733 } 734 735 /* 736 * failure: minctg not satisfied. 737 * 738 * if next request crosses segment boundary, set next pfn 739 * to search from the segment boundary. 
740 */ 741 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 742 *pfnp = roundup(*pfnp, pfnseg + 1); 743 744 /* clean up any pages already allocated */ 745 746 while (plist) { 747 pp = plist; 748 page_sub(&plist, pp); 749 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 750 if (iolock) 751 page_io_unlock(pp); 752 page_unlock(pp); 753 } 754 755 return (NULL); 756 } 757 758 /* 759 * verify that pages being returned from allocator have correct DMA attribute 760 */ 761 #ifndef DEBUG 762 #define check_dma(a, b, c) (0) 763 #else 764 static void 765 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 766 { 767 if (dma_attr == NULL) 768 return; 769 770 while (cnt-- > 0) { 771 if (mmu_ptob((uint64_t)pp->p_pagenum) < 772 dma_attr->dma_attr_addr_lo) 773 panic("PFN (pp=%p) below dma_attr_addr_lo", pp); 774 if (mmu_ptob((uint64_t)pp->p_pagenum) >= 775 dma_attr->dma_attr_addr_hi) 776 panic("PFN (pp=%p) above dma_attr_addr_hi", pp); 777 pp = pp->p_next; 778 } 779 } 780 #endif 781 782 static kmutex_t contig_lock; 783 784 #define CONTIG_LOCK() mutex_enter(&contig_lock); 785 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 786 787 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 788 789 static page_t * 790 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 791 { 792 pfn_t pfn; 793 int sgllen; 794 uint64_t pfnseg; 795 pgcnt_t minctg; 796 page_t *pplist = NULL, *plist; 797 uint64_t lo, hi; 798 pgcnt_t pfnalign = 0; 799 static pfn_t startpfn; 800 static pgcnt_t lastctgcnt; 801 uintptr_t align; 802 803 CONTIG_LOCK(); 804 805 if (mattr) { 806 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 807 hi = mmu_btop(mattr->dma_attr_addr_hi); 808 if (hi >= physmax) 809 hi = physmax - 1; 810 sgllen = mattr->dma_attr_sgllen; 811 pfnseg = mmu_btop(mattr->dma_attr_seg); 812 813 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 814 if (align > MMU_PAGESIZE) 815 pfnalign = mmu_btop(align); 816 817 /* 818 * in order to satisfy the request, must minimally 819 * acquire minctg contiguous pages 820 */ 821 minctg = howmany(*pgcnt, sgllen); 822 823 ASSERT(hi >= lo); 824 825 /* 826 * start from where last searched if the minctg >= lastctgcnt 827 */ 828 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 829 startpfn = lo; 830 } else { 831 hi = physmax - 1; 832 lo = 0; 833 sgllen = 1; 834 pfnseg = mmu.highest_pfn; 835 minctg = *pgcnt; 836 837 if (minctg < lastctgcnt) 838 startpfn = lo; 839 } 840 lastctgcnt = minctg; 841 842 ASSERT(pfnseg + 1 >= (uint64_t)minctg); 843 844 /* conserve 16m memory - start search above 16m when possible */ 845 if (hi > PFN_16M && startpfn < PFN_16M) 846 startpfn = PFN_16M; 847 848 pfn = startpfn; 849 if (pfnalign) 850 pfn = P2ROUNDUP(pfn, pfnalign); 851 852 while (pfn + minctg - 1 <= hi) { 853 854 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 855 if (plist) { 856 page_list_concat(&pplist, &plist); 857 sgllen--; 858 /* 859 * return when contig pages no longer needed 860 */ 861 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 862 startpfn = pfn; 863 CONTIG_UNLOCK(); 864 check_dma(mattr, pplist, *pgcnt); 865 return (pplist); 866 } 867 minctg = howmany(*pgcnt, sgllen); 868 } 869 if (pfnalign) 870 pfn = P2ROUNDUP(pfn, pfnalign); 871 } 872 873 /* cannot find contig pages in specified range */ 874 if (startpfn == lo) { 875 CONTIG_UNLOCK(); 876 return (NULL); 877 } 878 879 /* did not start with lo previously */ 880 pfn = lo; 881 if (pfnalign) 882 pfn = P2ROUNDUP(pfn, pfnalign); 883 884 /* allow search to go above startpfn */ 885 
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[].  Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memranges index that contains mnode's physbase */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set, then mtype is the
 * upper bound of a range of indices to search, down to 0 (PGI_MT_RANGE0)
 * or down to just above the 4g boundary (PGI_MT_RANGE4G).
 *
 * Return the first mnode range type index found, otherwise return -1 if
 * none is found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim = 0;	/* default to PGI_MT_RANGE0 */

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

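/*
 * Note on ordering (descriptive comment, not in the original source):
 * mnode_range_setup() fills mnoderanges[] from low physical addresses to
 * high within each mnode, so a larger mtype index generally corresponds to
 * higher physical memory.  pfn_2_mtype() above searches from the top of the
 * array down, and page_get_anylist() below walks mtypes from high to low,
 * which tends to satisfy requests from high memory first and conserve the
 * scarce low DMA ranges.
 */
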
1023 */ 1024 size_t 1025 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 1026 { 1027 size_t colorsz = 0; 1028 int i; 1029 int colors; 1030 1031 /* 1032 * Reduce the memory ranges lists if we don't have large amounts 1033 * of memory. This avoids searching known empty free lists. 1034 */ 1035 i = memrange_num(physmax); 1036 memranges += i; 1037 nranges -= i; 1038 #if defined(__i386) 1039 if (i > 0) 1040 restricted_kmemalloc = 0; 1041 #endif 1042 /* physmax greater than 4g */ 1043 if (i == 0) 1044 physmax4g = 1; 1045 1046 /* 1047 * setup pagesize for generic page layer 1048 */ 1049 for (i = 0; i <= mmu.max_page_level; ++i) { 1050 hw_page_array[i].hp_size = LEVEL_SIZE(i); 1051 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 1052 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 1053 } 1054 1055 ASSERT(ISP2(l2_sz)); 1056 ASSERT(ISP2(l2_linesz)); 1057 ASSERT(l2_sz > MMU_PAGESIZE); 1058 1059 /* l2_assoc is 0 for fully associative l2 cache */ 1060 if (l2_assoc) 1061 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 1062 else 1063 l2_colors = 1; 1064 1065 /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 1066 page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 1067 1068 /* 1069 * cpu_page_colors is non-zero when a page color may be spread across 1070 * multiple bins. 1071 */ 1072 if (l2_colors < page_colors) 1073 cpu_page_colors = l2_colors; 1074 1075 ASSERT(ISP2(page_colors)); 1076 1077 page_colors_mask = page_colors - 1; 1078 1079 ASSERT(ISP2(CPUSETSIZE())); 1080 page_coloring_shift = lowbit(CPUSETSIZE()); 1081 1082 /* size for mnoderanges */ 1083 mnoderangecnt = mnode_range_cnt(); 1084 colorsz = mnoderangecnt * sizeof (mnoderange_t); 1085 1086 /* size for fpc_mutex and cpc_mutex */ 1087 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 1088 1089 /* size of page_freelists */ 1090 colorsz += mnoderangecnt * sizeof (page_t ***); 1091 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 1092 1093 for (i = 0; i < mmu_page_sizes; i++) { 1094 colors = page_get_pagecolors(i); 1095 colorsz += mnoderangecnt * colors * sizeof (page_t *); 1096 } 1097 1098 /* size of page_cachelists */ 1099 colorsz += mnoderangecnt * sizeof (page_t **); 1100 colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 1101 1102 return (colorsz); 1103 } 1104 1105 /* 1106 * Called once at startup to configure page_coloring data structures and 1107 * does the 1st page_free()/page_freelist_add(). 
1108 */ 1109 void 1110 page_coloring_setup(caddr_t pcmemaddr) 1111 { 1112 int i; 1113 int j; 1114 int k; 1115 caddr_t addr; 1116 int colors; 1117 1118 /* 1119 * do page coloring setup 1120 */ 1121 addr = pcmemaddr; 1122 1123 mnoderanges = (mnoderange_t *)addr; 1124 addr += (mnoderangecnt * sizeof (mnoderange_t)); 1125 1126 mnode_range_setup(mnoderanges); 1127 1128 if (physmax4g) 1129 mtype4g = pfn_2_mtype(0xfffff); 1130 1131 for (k = 0; k < NPC_MUTEX; k++) { 1132 fpc_mutex[k] = (kmutex_t *)addr; 1133 addr += (max_mem_nodes * sizeof (kmutex_t)); 1134 } 1135 for (k = 0; k < NPC_MUTEX; k++) { 1136 cpc_mutex[k] = (kmutex_t *)addr; 1137 addr += (max_mem_nodes * sizeof (kmutex_t)); 1138 } 1139 page_freelists = (page_t ****)addr; 1140 addr += (mnoderangecnt * sizeof (page_t ***)); 1141 1142 page_cachelists = (page_t ***)addr; 1143 addr += (mnoderangecnt * sizeof (page_t **)); 1144 1145 for (i = 0; i < mnoderangecnt; i++) { 1146 page_freelists[i] = (page_t ***)addr; 1147 addr += (mmu_page_sizes * sizeof (page_t **)); 1148 1149 for (j = 0; j < mmu_page_sizes; j++) { 1150 colors = page_get_pagecolors(j); 1151 page_freelists[i][j] = (page_t **)addr; 1152 addr += (colors * sizeof (page_t *)); 1153 } 1154 page_cachelists[i] = (page_t **)addr; 1155 addr += (page_colors * sizeof (page_t *)); 1156 } 1157 } 1158 1159 /*ARGSUSED*/ 1160 int 1161 bp_color(struct buf *bp) 1162 { 1163 return (0); 1164 } 1165 1166 /* 1167 * get a page from any list with the given mnode 1168 */ 1169 page_t * 1170 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 1171 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 1172 { 1173 kmutex_t *pcm; 1174 int i; 1175 page_t *pp; 1176 page_t *first_pp; 1177 uint64_t pgaddr; 1178 ulong_t bin; 1179 int mtypestart; 1180 1181 VM_STAT_ADD(pga_vmstats.pgma_alloc); 1182 1183 ASSERT((flags & PG_MATCH_COLOR) == 0); 1184 ASSERT(szc == 0); 1185 ASSERT(dma_attr != NULL); 1186 1187 1188 MTYPE_START(mnode, mtype, flags); 1189 if (mtype < 0) { 1190 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 1191 return (NULL); 1192 } 1193 1194 mtypestart = mtype; 1195 1196 bin = origbin; 1197 1198 /* 1199 * check up to page_colors + 1 bins - origbin may be checked twice 1200 * because of BIN_STEP skip 1201 */ 1202 do { 1203 i = 0; 1204 while (i <= page_colors) { 1205 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 1206 goto nextfreebin; 1207 1208 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1209 mutex_enter(pcm); 1210 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 1211 first_pp = pp; 1212 while (pp != NULL) { 1213 if (page_trylock(pp, SE_EXCL) == 0) { 1214 pp = pp->p_next; 1215 if (pp == first_pp) { 1216 pp = NULL; 1217 } 1218 continue; 1219 } 1220 1221 ASSERT(PP_ISFREE(pp)); 1222 ASSERT(PP_ISAGED(pp)); 1223 ASSERT(pp->p_vnode == NULL); 1224 ASSERT(pp->p_hash == NULL); 1225 ASSERT(pp->p_offset == (u_offset_t)-1); 1226 ASSERT(pp->p_szc == szc); 1227 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 1228 /* check if page within DMA attributes */ 1229 pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum)); 1230 1231 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 1232 (pgaddr + MMU_PAGESIZE - 1 <= 1233 dma_attr->dma_attr_addr_hi)) { 1234 break; 1235 } 1236 1237 /* continue looking */ 1238 page_unlock(pp); 1239 pp = pp->p_next; 1240 if (pp == first_pp) 1241 pp = NULL; 1242 1243 } 1244 if (pp != NULL) { 1245 ASSERT(mtype == PP_2_MTYPE(pp)); 1246 ASSERT(pp->p_szc == 0); 1247 1248 /* found a page with specified DMA attributes */ 1249 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 1250 mtype), pp); 1251 page_ctr_sub(pp, 

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
	} while ((flags & PGI_MT_RANGE) &&
	    (MTYPE_NEXT(mnode, mtype, flags) >= 0));

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
	} while ((flags & PGI_MT_RANGE) &&
	    (MTYPE_NEXT(mnode, mtype, flags) >= 0));

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

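/*
 * Note (descriptive comment, not in the original source): page_get_anylist()
 * below derives the mtype search range [n, m] from the caller's DMA address
 * limits and walks it from m down to n, so pages are preferentially taken
 * from the highest usable physical range.  Together with the "conserve 16m"
 * logic in page_get_contigpage() this keeps the scarce low-memory DMA ranges
 * available for devices that genuinely need them.
 */
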
1362 */ 1363 /*ARGSUSED*/ 1364 page_t * 1365 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 1366 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 1367 { 1368 uint_t bin; 1369 int mtype; 1370 page_t *pp; 1371 int n; 1372 int m; 1373 int szc; 1374 int fullrange; 1375 int mnode; 1376 int local_failed_stat = 0; 1377 lgrp_mnode_cookie_t lgrp_cookie; 1378 1379 VM_STAT_ADD(pga_vmstats.pga_alloc); 1380 1381 /* only base pagesize currently supported */ 1382 if (size != MMU_PAGESIZE) 1383 return (NULL); 1384 1385 /* 1386 * If we're passed a specific lgroup, we use it. Otherwise, 1387 * assume first-touch placement is desired. 1388 */ 1389 if (!LGRP_EXISTS(lgrp)) 1390 lgrp = lgrp_home_lgrp(); 1391 1392 /* LINTED */ 1393 AS_2_BIN(as, seg, vp, vaddr, bin); 1394 1395 /* 1396 * Only hold one freelist or cachelist lock at a time, that way we 1397 * can start anywhere and not have to worry about lock 1398 * ordering. 1399 */ 1400 if (dma_attr == NULL) { 1401 n = 0; 1402 m = mnoderangecnt - 1; 1403 fullrange = 1; 1404 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 1405 } else { 1406 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 1407 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 1408 1409 /* 1410 * We can guarantee alignment only for page boundary. 1411 */ 1412 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 1413 return (NULL); 1414 1415 n = pfn_2_mtype(pfnlo); 1416 m = pfn_2_mtype(pfnhi); 1417 1418 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 1419 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 1420 } 1421 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 1422 1423 if (n > m) 1424 return (NULL); 1425 1426 szc = 0; 1427 1428 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 1429 if (n == 0) { 1430 flags |= PGI_MT_RANGE0; 1431 n = m; 1432 } 1433 1434 /* 1435 * Try local memory node first, but try remote if we can't 1436 * get a page of the right color. 1437 */ 1438 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 1439 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 1440 /* 1441 * allocate pages from high pfn to low. 1442 */ 1443 for (mtype = m; mtype >= n; mtype--) { 1444 if (fullrange != 0) { 1445 pp = page_get_mnode_freelist(mnode, 1446 bin, mtype, szc, flags); 1447 if (pp == NULL) { 1448 pp = page_get_mnode_cachelist( 1449 bin, flags, mnode, mtype); 1450 } 1451 } else { 1452 pp = page_get_mnode_anylist(bin, szc, 1453 flags, mnode, mtype, dma_attr); 1454 } 1455 if (pp != NULL) { 1456 VM_STAT_ADD(pga_vmstats.pga_allocok); 1457 check_dma(dma_attr, pp, 1); 1458 return (pp); 1459 } 1460 } 1461 if (!local_failed_stat) { 1462 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 1463 local_failed_stat = 1; 1464 } 1465 } 1466 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 1467 1468 return (NULL); 1469 } 1470 1471 /* 1472 * page_create_io() 1473 * 1474 * This function is a copy of page_create_va() with an additional 1475 * argument 'mattr' that specifies DMA memory requirements to 1476 * the page list functions. This function is used by the segkmem 1477 * allocator so it is only to create new pages (i.e PG_EXCL is 1478 * set). 1479 * 1480 * Note: This interface is currently used by x86 PSM only and is 1481 * not fully specified so the commitment level is only for 1482 * private interface specific to x86. This interface uses PSM 1483 * specific page_get_anylist() interface. 
1484 */ 1485 1486 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 1487 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 1488 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 1489 break; \ 1490 } \ 1491 } 1492 1493 1494 page_t * 1495 page_create_io( 1496 struct vnode *vp, 1497 u_offset_t off, 1498 uint_t bytes, 1499 uint_t flags, 1500 struct as *as, 1501 caddr_t vaddr, 1502 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 1503 { 1504 page_t *plist = NULL; 1505 uint_t plist_len = 0; 1506 pgcnt_t npages; 1507 page_t *npp = NULL; 1508 uint_t pages_req; 1509 page_t *pp; 1510 kmutex_t *phm = NULL; 1511 uint_t index; 1512 1513 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 1514 "page_create_start:vp %p off %llx bytes %u flags %x", 1515 vp, off, bytes, flags); 1516 1517 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 1518 1519 pages_req = npages = mmu_btopr(bytes); 1520 1521 /* 1522 * Do the freemem and pcf accounting. 1523 */ 1524 if (!page_create_wait(npages, flags)) { 1525 return (NULL); 1526 } 1527 1528 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 1529 "page_create_success:vp %p off %llx", 1530 vp, off); 1531 1532 /* 1533 * If satisfying this request has left us with too little 1534 * memory, start the wheels turning to get some back. The 1535 * first clause of the test prevents waking up the pageout 1536 * daemon in situations where it would decide that there's 1537 * nothing to do. 1538 */ 1539 if (nscan < desscan && freemem < minfree) { 1540 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 1541 "pageout_cv_signal:freemem %ld", freemem); 1542 cv_signal(&proc_pageout->p_cv); 1543 } 1544 1545 if (flags & PG_PHYSCONTIG) { 1546 1547 plist = page_get_contigpage(&npages, mattr, 1); 1548 if (plist == NULL) { 1549 page_create_putback(npages); 1550 return (NULL); 1551 } 1552 1553 pp = plist; 1554 1555 do { 1556 if (!page_hashin(pp, vp, off, NULL)) { 1557 panic("pg_creat_io: hashin failed %p %p %llx", 1558 (void *)pp, (void *)vp, off); 1559 } 1560 VM_STAT_ADD(page_create_new); 1561 off += MMU_PAGESIZE; 1562 PP_CLRFREE(pp); 1563 PP_CLRAGED(pp); 1564 page_set_props(pp, P_REF); 1565 pp = pp->p_next; 1566 } while (pp != plist); 1567 1568 if (!npages) { 1569 check_dma(mattr, plist, pages_req); 1570 return (plist); 1571 } else { 1572 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 1573 } 1574 1575 /* 1576 * fall-thru: 1577 * 1578 * page_get_contigpage returns when npages <= sgllen. 1579 * Grab the rest of the non-contig pages below from anylist. 1580 */ 1581 } 1582 1583 /* 1584 * Loop around collecting the requested number of pages. 1585 * Most of the time, we have to `create' a new page. With 1586 * this in mind, pull the page off the free list before 1587 * getting the hash lock. This will minimize the hash 1588 * lock hold time, nesting, and the like. If it turns 1589 * out we don't need the page, we put it back at the end. 1590 */ 1591 while (npages--) { 1592 phm = NULL; 1593 1594 index = PAGE_HASH_FUNC(vp, off); 1595 top: 1596 ASSERT(phm == NULL); 1597 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 1598 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1599 1600 if (npp == NULL) { 1601 /* 1602 * Try to get the page of any color either from 1603 * the freelist or from the cache list. 1604 */ 1605 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 1606 flags & ~PG_MATCH_COLOR, mattr, NULL); 1607 if (npp == NULL) { 1608 if (mattr == NULL) { 1609 /* 1610 * Not looking for a special page; 1611 * panic! 
1612 */ 1613 panic("no page found %d", (int)npages); 1614 } 1615 /* 1616 * No page found! This can happen 1617 * if we are looking for a page 1618 * within a specific memory range 1619 * for DMA purposes. If PG_WAIT is 1620 * specified then we wait for a 1621 * while and then try again. The 1622 * wait could be forever if we 1623 * don't get the page(s) we need. 1624 * 1625 * Note: XXX We really need a mechanism 1626 * to wait for pages in the desired 1627 * range. For now, we wait for any 1628 * pages and see if we can use it. 1629 */ 1630 1631 if ((mattr != NULL) && (flags & PG_WAIT)) { 1632 delay(10); 1633 goto top; 1634 } 1635 1636 goto fail; /* undo accounting stuff */ 1637 } 1638 1639 if (PP_ISAGED(npp) == 0) { 1640 /* 1641 * Since this page came from the 1642 * cachelist, we must destroy the 1643 * old vnode association. 1644 */ 1645 page_hashout(npp, (kmutex_t *)NULL); 1646 } 1647 } 1648 1649 /* 1650 * We own this page! 1651 */ 1652 ASSERT(PAGE_EXCL(npp)); 1653 ASSERT(npp->p_vnode == NULL); 1654 ASSERT(!hat_page_is_mapped(npp)); 1655 PP_CLRFREE(npp); 1656 PP_CLRAGED(npp); 1657 1658 /* 1659 * Here we have a page in our hot little mits and are 1660 * just waiting to stuff it on the appropriate lists. 1661 * Get the mutex and check to see if it really does 1662 * not exist. 1663 */ 1664 phm = PAGE_HASH_MUTEX(index); 1665 mutex_enter(phm); 1666 PAGE_HASH_SEARCH(index, pp, vp, off); 1667 if (pp == NULL) { 1668 VM_STAT_ADD(page_create_new); 1669 pp = npp; 1670 npp = NULL; 1671 if (!page_hashin(pp, vp, off, phm)) { 1672 /* 1673 * Since we hold the page hash mutex and 1674 * just searched for this page, page_hashin 1675 * had better not fail. If it does, that 1676 * means somethread did not follow the 1677 * page hash mutex rules. Panic now and 1678 * get it over with. As usual, go down 1679 * holding all the locks. 1680 */ 1681 ASSERT(MUTEX_HELD(phm)); 1682 panic("page_create: hashin fail %p %p %llx %p", 1683 (void *)pp, (void *)vp, off, (void *)phm); 1684 1685 } 1686 ASSERT(MUTEX_HELD(phm)); 1687 mutex_exit(phm); 1688 phm = NULL; 1689 1690 /* 1691 * Hat layer locking need not be done to set 1692 * the following bits since the page is not hashed 1693 * and was on the free list (i.e., had no mappings). 1694 * 1695 * Set the reference bit to protect 1696 * against immediate pageout 1697 * 1698 * XXXmh modify freelist code to set reference 1699 * bit so we don't have to do it here. 1700 */ 1701 page_set_props(pp, P_REF); 1702 } else { 1703 ASSERT(MUTEX_HELD(phm)); 1704 mutex_exit(phm); 1705 phm = NULL; 1706 /* 1707 * NOTE: This should not happen for pages associated 1708 * with kernel vnode 'kvp'. 1709 */ 1710 /* XX64 - to debug why this happens! */ 1711 ASSERT(vp != &kvp); 1712 if (vp == &kvp) 1713 cmn_err(CE_NOTE, 1714 "page_create: page not expected " 1715 "in hash list for kernel vnode - pp 0x%p", 1716 (void *)pp); 1717 VM_STAT_ADD(page_create_exists); 1718 goto fail; 1719 } 1720 1721 /* 1722 * Got a page! It is locked. Acquire the i/o 1723 * lock since we are going to use the p_next and 1724 * p_prev fields to link the requested pages together. 1725 */ 1726 page_io_lock(pp); 1727 page_add(&plist, pp); 1728 plist = plist->p_next; 1729 off += MMU_PAGESIZE; 1730 vaddr += MMU_PAGESIZE; 1731 } 1732 1733 check_dma(mattr, plist, pages_req); 1734 return (plist); 1735 1736 fail: 1737 if (npp != NULL) { 1738 /* 1739 * Did not need this page after all. 1740 * Put it back on the free list. 
1741 */ 1742 VM_STAT_ADD(page_create_putbacks); 1743 PP_SETFREE(npp); 1744 PP_SETAGED(npp); 1745 npp->p_offset = (u_offset_t)-1; 1746 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 1747 page_unlock(npp); 1748 } 1749 1750 /* 1751 * Give up the pages we already got. 1752 */ 1753 while (plist != NULL) { 1754 pp = plist; 1755 page_sub(&plist, pp); 1756 page_io_unlock(pp); 1757 plist_len++; 1758 /*LINTED: constant in conditional ctx*/ 1759 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1760 } 1761 1762 /* 1763 * VN_DISPOSE does freemem accounting for the pages in plist 1764 * by calling page_free. So, we need to undo the pcf accounting 1765 * for only the remaining pages. 1766 */ 1767 VM_STAT_ADD(page_create_putbacks); 1768 page_create_putback(pages_req - plist_len); 1769 1770 return (NULL); 1771 } 1772 1773 1774 /* 1775 * Copy the data from the physical page represented by "frompp" to 1776 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 1777 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 1778 * level and no one sleeps with an active mapping there. 1779 * 1780 * Note that the ref/mod bits in the page_t's are not affected by 1781 * this operation, hence it is up to the caller to update them appropriately. 1782 */ 1783 void 1784 ppcopy(page_t *frompp, page_t *topp) 1785 { 1786 caddr_t pp_addr1; 1787 caddr_t pp_addr2; 1788 void *pte1; 1789 void *pte2; 1790 kmutex_t *ppaddr_mutex; 1791 1792 ASSERT_STACK_ALIGNED(); 1793 ASSERT(PAGE_LOCKED(frompp)); 1794 ASSERT(PAGE_LOCKED(topp)); 1795 1796 if (kpm_enable) { 1797 pp_addr1 = hat_kpm_page2va(frompp, 0); 1798 pp_addr2 = hat_kpm_page2va(topp, 0); 1799 kpreempt_disable(); 1800 } else { 1801 /* 1802 * disable pre-emption so that CPU can't change 1803 */ 1804 kpreempt_disable(); 1805 1806 pp_addr1 = CPU->cpu_caddr1; 1807 pp_addr2 = CPU->cpu_caddr2; 1808 pte1 = (void *)CPU->cpu_caddr1pte; 1809 pte2 = (void *)CPU->cpu_caddr2pte; 1810 1811 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 1812 mutex_enter(ppaddr_mutex); 1813 1814 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 1815 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 1816 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 1817 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 1818 HAT_LOAD_NOCONSIST); 1819 } 1820 1821 if (use_sse_pagecopy) 1822 hwblkpagecopy(pp_addr1, pp_addr2); 1823 else 1824 bcopy(pp_addr1, pp_addr2, PAGESIZE); 1825 1826 if (!kpm_enable) 1827 mutex_exit(ppaddr_mutex); 1828 kpreempt_enable(); 1829 } 1830 1831 /* 1832 * Zero the physical page from off to off + len given by `pp' 1833 * without changing the reference and modified bits of page. 1834 * 1835 * We use this using CPU private page address #2, see ppcopy() for more info. 1836 * pagezero() must not be called at interrupt level. 
1837 */ 1838 void 1839 pagezero(page_t *pp, uint_t off, uint_t len) 1840 { 1841 caddr_t pp_addr2; 1842 void *pte2; 1843 kmutex_t *ppaddr_mutex; 1844 1845 ASSERT_STACK_ALIGNED(); 1846 ASSERT(len <= MMU_PAGESIZE); 1847 ASSERT(off <= MMU_PAGESIZE); 1848 ASSERT(off + len <= MMU_PAGESIZE); 1849 ASSERT(PAGE_LOCKED(pp)); 1850 1851 if (kpm_enable) { 1852 pp_addr2 = hat_kpm_page2va(pp, 0); 1853 kpreempt_disable(); 1854 } else { 1855 kpreempt_disable(); 1856 1857 pp_addr2 = CPU->cpu_caddr2; 1858 pte2 = (void *)CPU->cpu_caddr2pte; 1859 1860 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 1861 mutex_enter(ppaddr_mutex); 1862 1863 hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2, 1864 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 1865 HAT_LOAD_NOCONSIST); 1866 } 1867 1868 if (use_sse_pagezero) 1869 hwblkclr(pp_addr2 + off, len); 1870 else 1871 bzero(pp_addr2 + off, len); 1872 1873 if (!kpm_enable) 1874 mutex_exit(ppaddr_mutex); 1875 kpreempt_enable(); 1876 } 1877 1878 /* 1879 * Platform-dependent page scrub call. 1880 */ 1881 void 1882 pagescrub(page_t *pp, uint_t off, uint_t len) 1883 { 1884 /* 1885 * For now, we rely on the fact that pagezero() will 1886 * always clear UEs. 1887 */ 1888 pagezero(pp, off, len); 1889 } 1890 1891 /* 1892 * set up two private addresses for use on a given CPU for use in ppcopy() 1893 */ 1894 void 1895 setup_vaddr_for_ppcopy(struct cpu *cpup) 1896 { 1897 void *addr; 1898 void *pte; 1899 1900 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 1901 pte = hat_mempte_setup(addr); 1902 cpup->cpu_caddr1 = addr; 1903 cpup->cpu_caddr1pte = (pteptr_t)pte; 1904 1905 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 1906 pte = hat_mempte_setup(addr); 1907 cpup->cpu_caddr2 = addr; 1908 cpup->cpu_caddr2pte = (pteptr_t)pte; 1909 1910 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 1911 } 1912 1913 1914 /* 1915 * Create the pageout scanner thread. The thread has to 1916 * start at procedure with process pp and priority pri. 1917 */ 1918 void 1919 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) 1920 { 1921 (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); 1922 } 1923 1924 /* 1925 * any use for this? 1926 */ 1927 void 1928 post_startup_mmu_initialization(void) 1929 {} 1930 1931 /* 1932 * Function for flushing D-cache when performing module relocations 1933 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 1934 */ 1935 void 1936 dcache_flushall() 1937 {} 1938