/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved					*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 0;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern uintptr_t eprom_kernelbase;
extern int use_sse_pagecopy, use_sse_pagezero;	/* in ml/float.s */

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

/* 16m memory management: desired number of free pages below 16m. */
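/* (0x380 == 896 pages, i.e. 3.5MB of 4K pages.) */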
pgcnt_t desfree16m = 0x380;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

size_t auto_lpg_va_default = MMU_PAGESIZE; /* used by zmap() */
/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t auto_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	level_t l;

	if (remap)
		*remap = 0;

	switch (maptype) {

	case MAPPGSZ_STK:
	case MAPPGSZ_HEAP:
	case MAPPGSZ_VA:
		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (len < LEVEL_SIZE(l))
				continue;
			break;
		}
		return (LEVEL_SIZE(l));

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (0);
}

/*
 * This can be patched via /etc/system to allow large pages
 * to be used for mapping application and library text segments.
 */
int use_text_largepages = 0;
int use_shm_largepages = 0;

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */

/*ARGSUSED*/
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	size_t	pgsz;
	caddr_t	a;

	if (!text || !use_text_largepages ||
	    mmu.max_page_level == 0)
		return (0);

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

uint_t
map_shm_pgszcvec(caddr_t addr, size_t len, uintptr_t off)
{
	size_t	pgsz;
	caddr_t	a;

	if (!use_shm_largepages || mmu.max_page_level == 0) {
		return (0);
	}

	pgsz = LEVEL_SIZE(1);
	a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	if (a < addr || a >= addr + len ||
	    P2PHASE((uintptr_t)addr ^ off, pgsz)) {
		return (0);
	}
	len -= (a - addr);
	if (len < pgsz) {
		return (0);
	}
	return (1 << 1);
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
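	 * as_fault() resolves the fault against the segment drivers of the
	 * chosen address space (kernel or user) and returns 0 on success or
	 * an FC_* fault code on failure.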
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * the rest of this function implements 3.X, 4.X, 5.X compatibility.
	 * This code is probably not needed anymore
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below kernelbase.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * vacalign is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
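	 * For a native 64-bit process the usable range runs from the brk
	 * base up to the bottom of the stack, which is bounded by the
	 * RLIMIT_STACK resource control (see the slen computation below).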
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
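	 * For example, a base near the top of the address space with a
	 * large len wraps hi past zero; the range is then truncated to
	 * [lo, 0).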
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
#if defined(__i386)
int restricted_kmemalloc = 0;
#elif defined(__amd64)
int restricted_kmemalloc = 0;
#endif

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];


/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}

/*
 * return the mnoderange containing pfn
 */
int
pfn_2_mtype(pfn_t pfn)
{
	int	n;

	for (n = mnoderangecnt - 1; n >= 0; n--) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
}

/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages. It minimally has to return
 *	minctg pages. Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
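	 * (pfnseg is the page-frame form of the dma_attr_seg mask, so
	 * pfnseg + 1 is the segment size in pages and roundup() advances
	 * *pfnp to the next segment boundary.)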
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (mmu_ptob((uint64_t)pp->p_pagenum) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", pp);
		if (mmu_ptob((uint64_t)pp->p_pagenum) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", pp);
		pp = pp->p_next;
	}
}
#endif

static kmutex_t	contig_lock;

#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
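	/* second pass: cover [lo, startpfn), skipped by the first pass */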
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}

/*
 * combine mem_node_config and memrange memory ranges into one data
 * structure to be used for page list management.
 *
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[]. Used to determine the size of page lists and mnoderanges.
 *
 * mnode_range_setup() initializes mnoderanges.
 */
mnoderange_t	*mnoderanges;
int		mnoderangecnt;
int		mtype4g;

int
mnode_range_cnt()
{
	int	mri;
	int	mnrcnt = 0;
	int	mnode;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		/* find the memrange index containing the mnode's base pfn */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	return (mnrcnt);
}

void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo =
			    MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi =
			    MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode.  If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int	mtlim;

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE0)
			mtlim = 0;
		else if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;	/* exclude 0-4g range */
		else if (flags & PGI_MT_RANGE16M)
			mtlim = 1;		/* exclude 0-16m range */
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters.  Called from add_physmem() when physical memory with
 * page_t's are initially added to the page lists.
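 *
 * mnr_mt_pgmax accumulates the total page count of each mnode range, and
 * pages below 4G also contribute to maxmem4g when physmax4g is set.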
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	int	mtype = 0;
	pfn_t	endpfn = startpfn + cnt, pfn;
	pgcnt_t	inc;

	ASSERT(cnt > 0);

	for (pfn = startpfn; pfn < endpfn; ) {
		if (pfn <= mnoderanges[mtype].mnr_pfnhi) {
			if (endpfn < mnoderanges[mtype].mnr_pfnhi) {
				inc = endpfn - pfn;
			} else {
				inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1;
			}
			mnoderanges[mtype].mnr_mt_pgmax += inc;
			if (physmax4g && mtype <= mtype4g)
				maxmem4g += inc;
			pfn += inc;
		}
		mtype++;
		ASSERT(mtype < mnoderangecnt || pfn >= endpfn);
	}
}

/*
 * Returns the free page count for mnode
 */
int
mnode_pgcnt(int mnode)
{
	int	mtype = mnoderangecnt - 1;
	int	flags = PGI_MT_RANGE0;
	pgcnt_t	pgcnt = 0;

	mtype = mtype_func(mnode, mtype, flags);

	while (mtype != -1) {
		pgcnt += MTYPE_FREEMEM(mtype);
		mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
	}
	return (pgcnt);
}

/*
 * Initialize page coloring variables based on the l2 cache parameters.
 * Calculate and return memory needed for page coloring data structures.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
	memranges += i;
	nranges -= i;
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;

	/*
	 * setup pagesize for generic page layer
	 */
	for (i = 0; i <= mmu.max_page_level; ++i) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
	}

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
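	 * For example, a 512K 16-way L2 yields l2_colors = 512K / (16 * 4K)
	 * = 8; page_colors is then raised to PAGE_COLORS_MIN (16) and
	 * cpu_page_colors is set to 8.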
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* size for mnoderanges */
	mnoderangecnt = mnode_range_cnt();
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

/*
 * get a page from any list with the given mnode
 */
page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);


	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		i = 0;
		while (i <= page_colors) {
			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    mmu_btop(dma_attr->dma_attr_addr_hi + 1));
			if (pp)
				return (pp);

			/* try next bin */
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
			i++;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = ptob((uint64_t)(pp->p_pagenum));

				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

		/*
		 * We can only guarantee alignment to a page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
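		 * Trying the higher memory ranges first helps conserve low
		 * memory for devices with more restrictive DMA constraints
		 * (see the 16m handling in page_get_contigpage() above).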
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator so it is only used to create new pages (i.e., PG_EXCL is
 * set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 * not fully specified, so the commitment level is only that of a
 * private interface specific to x86. This interface uses the PSM
 * specific page_get_anylist() interface.
 */

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx",
	    vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_creat_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use them.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}

				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(vp != &kvp);
			if (vp == &kvp)
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}


/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
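 *
 * When segkpm is enabled both pages already have kernel mappings and are
 * copied through their kpm addresses; otherwise they are temporarily mapped
 * at the per-CPU caddr1/caddr2 addresses under cpu_ppaddr_mutex.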
 */
void
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	void		*pte1;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable pre-emption so that CPU can't change
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = (void *)CPU->cpu_caddr1pte;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagecopy)
		hwblkpagecopy(pp_addr1, pp_addr2);
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * We do this using CPU private page address #2, see ppcopy() for more info.
 * pagezero() must not be called at interrupt level.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	void		*pte2;
	kmutex_t	*ppaddr_mutex;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	if (kpm_enable) {
		pp_addr2 = hat_kpm_page2va(pp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = (void *)CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero)
		hwblkclr(pp_addr2 + off, len);
	else
		bzero(pp_addr2 + off, len);

	if (!kpm_enable)
		mutex_exit(ppaddr_mutex);
	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * set up two private addresses for use on a given CPU for use in ppcopy()
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	void *pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = (pteptr_t)pte;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = (pteptr_t)pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}


/*
 * Create the pageout scanner thread.  The thread starts executing
 * procedure as part of process pp at priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}