/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and library text segments.
 */
int use_text_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t	a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}
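
/*
 * Illustrative sketch (added note, not from the original source): with a
 * typical mmu.max_page_level of 1 and LEVEL_SIZE(1) == 2MB, map_pgsz()
 * returns 2MB for a 3MB MAPPGSZ_HEAP request (3MB >= LEVEL_SIZE(1)) but
 * falls back to LEVEL_SIZE(0) == 4KB for a 1MB request.  Likewise,
 * map_execseg_pgszcvec() only reports the 2MB size code (1 << 1) when at
 * least one fully aligned 2MB page fits inside [addr, addr + len); a 3MB
 * text segment starting 1MB below a 2MB boundary qualifies, while a
 * 1.5MB segment never can.  The 2MB/4KB sizes are assumptions about the
 * common x86 configuration, not values fixed by this file.
 */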

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}
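
/*
 * Illustrative arithmetic (added note with assumed values): a 5MB request
 * with off == 0 is first rounded to a whole number of pages, then grown
 * by two 4KB redzone pages and by the alignment amount.  Assuming the
 * 5MB length exceeds ELF_386_MAXPGSZ and that mmu.max_page_level == 1
 * with LEVEL_SIZE(1) == 2MB, align_amount becomes 2MB, so as_gap() is
 * asked for roughly 5MB + 8KB + 2MB.  The candidate address is the top
 * of that hole minus the padded length plus one redzone page, then
 * rounded to the 2MB alignment as the code above describes.
 */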

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped up into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 1;	/* XX64 re-examine with PSARC 2004/405 */
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}
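
/*
 * Worked example (added note, derived from arch_memranges above): with
 * the full table in place, a pfn of 0x90000 (2.25GB) satisfies
 * "pfn >= memranges[1]" and memrange_num() returns 1, the 2G-4G range;
 * a pfn of 0x00800 (8MB) is smaller than every non-zero entry, so the
 * loop runs off the end and returns nranges - 1, the 0-16M ISA range.
 * When startup trims memranges/nranges on a small-memory machine,
 * index 0 refers to the highest range that machine actually has.
 */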

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}
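
/*
 * Worked example (added note, assumed layout): a machine with a single
 * memory node whose pfns span 0 through 8GB intersects all four
 * arch_memranges, so mnode_range_cnt() reports 4 and mnode_range_setup()
 * produces mnoderanges for 0-16M, 16M-2G, 2G-4G and 4G-8G, each tagged
 * with the owning mnode and its memrange index.  A second node holding
 * only memory above 4G would add exactly one more entry.
 */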

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices to 0 or 4g.
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim = 0;	/* default to PGI_MT_RANGE0 */

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += (mnoderanges[mtype].mnr_mt_flpgcnt +
		    mnoderanges[mtype].mnr_mt_lgpgcnt +
		    mnoderanges[mtype].mnr_mt_clpgcnt);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}
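
/*
 * Worked arithmetic (added note, hypothetical caches): a 1MB, 16-way L2
 * with 4KB pages gives l2_colors = 1MB / (16 * 4KB) = 16, so page_colors
 * is 16 and cpu_page_colors stays 0.  A 512KB, 8-way L2 gives
 * l2_colors = 16 as well, while a 256KB, 8-way L2 gives l2_colors = 8;
 * that is below PAGE_COLORS_MIN, so page_colors is raised to 16 and
 * cpu_page_colors is set to 8, meaning one hardware color is spread
 * across two bins.
 */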

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);


	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		i = 0;
		while (i <= page_colors) {
			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can guarantee alignment only to a page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype is handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}
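
/*
 * Illustrative sketch (added note, hypothetical values, not code from
 * this file): a caller that must stay below 4G for a legacy 32-bit DMA
 * engine would describe the constraint in a ddi_dma_attr_t and let the
 * search above walk mtypes from high to low within that bound, e.g.
 *
 *	ddi_dma_attr_t attr = { 0 };
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffffffULL;	(below 4G)
 *	attr.dma_attr_align = MMU_PAGESIZE;	(larger is rejected above)
 *	pp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
 *	    flags & ~PG_MATCH_COLOR, &attr, NULL);
 *
 * page_create_io() below is the only in-tree caller and passes its
 * 'mattr' argument through in exactly this way.
 */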

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only used to create new pages (i.e., PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 * not fully specified so the commitment level is only for
 * private interface specific to x86. This interface uses the
 * PSM-specific page_get_anylist() interface.
 */

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range.  For now, we wait for any
				 * pages and see if we can use it.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
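
/*
 * Illustrative sketch (added note, hypothetical caller, not code from
 * this file): a PSM-style consumer wanting 'bytes' of physically
 * contiguous, sub-16M memory for an ISA DMA buffer would pass PG_EXCL
 * together with PG_PHYSCONTIG and a matching attribute, e.g.
 *
 *	ddi_dma_attr_t attr = { 0 };
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffff;	(below 16M)
 *	attr.dma_attr_sgllen = 1;		(one contiguous chunk)
 *	attr.dma_attr_seg = 0xffffffffULL;
 *	attr.dma_attr_align = attr.dma_attr_minxfer = MMU_PAGESIZE;
 *	pp = page_create_io(&kvp, off, bytes,
 *	    PG_EXCL | PG_WAIT | PG_PHYSCONTIG, &kas, NULL, &attr);
 *
 * The PG_PHYSCONTIG branch above then hands the attribute to
 * page_get_contigpage(), and any remainder is filled from
 * page_get_anylist().
 */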

/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * We do this using CPU private page address #2, see ppcopy() for more info.
 * pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * set up two private addresses for use on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}


/*
 * Create the pageout scanner thread. The thread starts executing
 * `procedure' with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * any use for this?
 */
void
post_startup_mmu_initialization(void)
{}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}