1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 /* 37 * UNIX machine dependent virtual memory support. 38 */ 39 40 #include <sys/types.h> 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/user.h> 44 #include <sys/proc.h> 45 #include <sys/kmem.h> 46 #include <sys/vmem.h> 47 #include <sys/buf.h> 48 #include <sys/cpuvar.h> 49 #include <sys/lgrp.h> 50 #include <sys/disp.h> 51 #include <sys/vm.h> 52 #include <sys/mman.h> 53 #include <sys/vnode.h> 54 #include <sys/cred.h> 55 #include <sys/exec.h> 56 #include <sys/exechdr.h> 57 #include <sys/debug.h> 58 #include <sys/vmsystm.h> 59 60 #include <vm/hat.h> 61 #include <vm/as.h> 62 #include <vm/seg.h> 63 #include <vm/seg_kp.h> 64 #include <vm/seg_vn.h> 65 #include <vm/page.h> 66 #include <vm/seg_kmem.h> 67 #include <vm/seg_kpm.h> 68 #include <vm/vm_dep.h> 69 70 #include <sys/cpu.h> 71 #include <sys/vm_machparam.h> 72 #include <sys/memlist.h> 73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */ 74 #include <vm/hat_i86.h> 75 #include <sys/x86_archext.h> 76 #include <sys/elf_386.h> 77 #include <sys/cmn_err.h> 78 #include <sys/archsystm.h> 79 #include <sys/machsystm.h> 80 81 #include <sys/vtrace.h> 82 #include <sys/ddidmareq.h> 83 #include <sys/promif.h> 84 #include <sys/memnode.h> 85 #include <sys/stack.h> 86 87 uint_t vac_colors = 1; 88 89 int largepagesupport = 0; 90 extern uint_t page_create_new; 91 extern uint_t page_create_exists; 92 extern uint_t page_create_putbacks; 93 extern uint_t page_create_putbacks; 94 extern uintptr_t eprom_kernelbase; 95 extern int use_sse_pagecopy, use_sse_pagezero; /* in ml/float.s */ 96 97 /* 4g memory management */ 98 pgcnt_t maxmem4g; 99 pgcnt_t freemem4g; 100 int physmax4g; 101 int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */ 102 int lotsfree4gshift = 3; 103 104 /* 16m memory management: desired number of free pages below 16m. */ 105 pgcnt_t desfree16m = 0x380; 106 107 #ifdef VM_STATS 108 struct { 109 ulong_t pga_alloc; 110 ulong_t pga_notfullrange; 111 ulong_t pga_nulldmaattr; 112 ulong_t pga_allocok; 113 ulong_t pga_allocfailed; 114 ulong_t pgma_alloc; 115 ulong_t pgma_allocok; 116 ulong_t pgma_allocfailed; 117 ulong_t pgma_allocempty; 118 } pga_vmstats; 119 #endif 120 121 uint_t mmu_page_sizes; 122 123 /* How many page sizes the users can see */ 124 uint_t mmu_exported_page_sizes; 125 126 /* 127 * Number of pages in 1 GB. Don't enable automatic large pages if we have 128 * fewer than this many pages. 129 */ 130 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 131 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 132 133 /* 134 * Maximum and default segment size tunables for user private 135 * and shared anon memory, and user text and initialized data. 136 * These can be patched via /etc/system to allow large pages 137 * to be used for mapping application private and shared anon memory. 138 */ 139 size_t mcntl0_lpsize = MMU_PAGESIZE; 140 size_t max_uheap_lpsize = MMU_PAGESIZE; 141 size_t default_uheap_lpsize = MMU_PAGESIZE; 142 size_t max_ustack_lpsize = MMU_PAGESIZE; 143 size_t default_ustack_lpsize = MMU_PAGESIZE; 144 size_t max_privmap_lpsize = MMU_PAGESIZE; 145 size_t max_uidata_lpsize = MMU_PAGESIZE; 146 size_t max_utext_lpsize = MMU_PAGESIZE; 147 size_t max_shm_lpsize = MMU_PAGESIZE; 148 149 /* 150 * Return the optimum page size for a given mapping 151 */ 152 /*ARGSUSED*/ 153 size_t 154 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 155 { 156 level_t l = 0; 157 size_t pgsz = MMU_PAGESIZE; 158 size_t max_lpsize; 159 uint_t mszc; 160 161 ASSERT(maptype != MAPPGSZ_VA); 162 163 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 164 return (MMU_PAGESIZE); 165 } 166 167 switch (maptype) { 168 case MAPPGSZ_HEAP: 169 case MAPPGSZ_STK: 170 max_lpsize = memcntl ? mcntl0_lpsize : (maptype == 171 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize); 172 if (max_lpsize == MMU_PAGESIZE) { 173 return (MMU_PAGESIZE); 174 } 175 if (len == 0) { 176 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase + 177 p->p_brksize - p->p_bssbase : p->p_stksize; 178 } 179 len = (maptype == MAPPGSZ_HEAP) ? MAX(len, 180 default_uheap_lpsize) : MAX(len, default_ustack_lpsize); 181 182 /* 183 * use the pages size that best fits len 184 */ 185 for (l = mmu.max_page_level; l > 0; --l) { 186 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) { 187 continue; 188 } else { 189 pgsz = LEVEL_SIZE(l); 190 } 191 break; 192 } 193 194 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc : 195 p->p_stkpageszc); 196 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) { 197 pgsz = hw_page_array[mszc].hp_size; 198 } 199 return (pgsz); 200 201 /* 202 * for ISM use the 1st large page size. 203 */ 204 case MAPPGSZ_ISM: 205 if (mmu.max_page_level == 0) 206 return (MMU_PAGESIZE); 207 return (LEVEL_SIZE(1)); 208 } 209 return (pgsz); 210 } 211 212 static uint_t 213 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize, 214 size_t min_physmem) 215 { 216 caddr_t eaddr = addr + size; 217 uint_t szcvec = 0; 218 caddr_t raddr; 219 caddr_t readdr; 220 size_t pgsz; 221 int i; 222 223 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 224 return (0); 225 } 226 227 for (i = mmu_page_sizes - 1; i > 0; i--) { 228 pgsz = page_get_pagesize(i); 229 if (pgsz > max_lpsize) { 230 continue; 231 } 232 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 233 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 234 if (raddr < addr || raddr >= readdr) { 235 continue; 236 } 237 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 238 continue; 239 } 240 /* 241 * Set szcvec to the remaining page sizes. 242 */ 243 szcvec = ((1 << (i + 1)) - 1) & ~1; 244 break; 245 } 246 return (szcvec); 247 } 248 249 /* 250 * Return a bit vector of large page size codes that 251 * can be used to map [addr, addr + len) region. 252 */ 253 /*ARGSUSED*/ 254 uint_t 255 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 256 int memcntl) 257 { 258 size_t max_lpsize = mcntl0_lpsize; 259 260 if (mmu.max_page_level == 0) 261 return (0); 262 263 if (flags & MAP_TEXT) { 264 if (!memcntl) 265 max_lpsize = max_utext_lpsize; 266 return (map_szcvec(addr, size, off, max_lpsize, 267 shm_lpg_min_physmem)); 268 269 } else if (flags & MAP_INITDATA) { 270 if (!memcntl) 271 max_lpsize = max_uidata_lpsize; 272 return (map_szcvec(addr, size, off, max_lpsize, 273 privm_lpg_min_physmem)); 274 275 } else if (type == MAPPGSZC_SHM) { 276 if (!memcntl) 277 max_lpsize = max_shm_lpsize; 278 return (map_szcvec(addr, size, off, max_lpsize, 279 shm_lpg_min_physmem)); 280 281 } else if (type == MAPPGSZC_HEAP) { 282 if (!memcntl) 283 max_lpsize = max_uheap_lpsize; 284 return (map_szcvec(addr, size, off, max_lpsize, 285 privm_lpg_min_physmem)); 286 287 } else if (type == MAPPGSZC_STACK) { 288 if (!memcntl) 289 max_lpsize = max_ustack_lpsize; 290 return (map_szcvec(addr, size, off, max_lpsize, 291 privm_lpg_min_physmem)); 292 293 } else { 294 if (!memcntl) 295 max_lpsize = max_privmap_lpsize; 296 return (map_szcvec(addr, size, off, max_lpsize, 297 privm_lpg_min_physmem)); 298 } 299 } 300 301 /* 302 * Handle a pagefault. 303 */ 304 faultcode_t 305 pagefault( 306 caddr_t addr, 307 enum fault_type type, 308 enum seg_rw rw, 309 int iskernel) 310 { 311 struct as *as; 312 struct hat *hat; 313 struct proc *p; 314 kthread_t *t; 315 faultcode_t res; 316 caddr_t base; 317 size_t len; 318 int err; 319 int mapped_red; 320 uintptr_t ea; 321 322 ASSERT_STACK_ALIGNED(); 323 324 if (INVALID_VADDR(addr)) 325 return (FC_NOMAP); 326 327 mapped_red = segkp_map_red(); 328 329 if (iskernel) { 330 as = &kas; 331 hat = as->a_hat; 332 } else { 333 t = curthread; 334 p = ttoproc(t); 335 as = p->p_as; 336 hat = as->a_hat; 337 } 338 339 /* 340 * Dispatch pagefault. 341 */ 342 res = as_fault(hat, as, addr, 1, type, rw); 343 344 /* 345 * If this isn't a potential unmapped hole in the user's 346 * UNIX data or stack segments, just return status info. 347 */ 348 if (res != FC_NOMAP || iskernel) 349 goto out; 350 351 /* 352 * Check to see if we happened to faulted on a currently unmapped 353 * part of the UNIX data or stack segments. If so, create a zfod 354 * mapping there and then try calling the fault routine again. 355 */ 356 base = p->p_brkbase; 357 len = p->p_brksize; 358 359 if (addr < base || addr >= base + len) { /* data seg? */ 360 base = (caddr_t)p->p_usrstack - p->p_stksize; 361 len = p->p_stksize; 362 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */ 363 /* not in either UNIX data or stack segments */ 364 res = FC_NOMAP; 365 goto out; 366 } 367 } 368 369 /* 370 * the rest of this function implements a 3.X 4.X 5.X compatibility 371 * This code is probably not needed anymore 372 */ 373 if (p->p_model == DATAMODEL_ILP32) { 374 375 /* expand the gap to the page boundaries on each side */ 376 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE); 377 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE); 378 len = ea - (uintptr_t)base; 379 380 as_rangelock(as); 381 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) == 382 0) { 383 err = as_map(as, base, len, segvn_create, zfod_argsp); 384 as_rangeunlock(as); 385 if (err) { 386 res = FC_MAKE_ERR(err); 387 goto out; 388 } 389 } else { 390 /* 391 * This page is already mapped by another thread after 392 * we returned from as_fault() above. We just fall 393 * through as_fault() below. 394 */ 395 as_rangeunlock(as); 396 } 397 398 res = as_fault(hat, as, addr, 1, F_INVAL, rw); 399 } 400 401 out: 402 if (mapped_red) 403 segkp_unmap_red(); 404 405 return (res); 406 } 407 408 void 409 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) 410 { 411 struct proc *p = curproc; 412 caddr_t userlimit = (flags & _MAP_LOW32) ? 413 (caddr_t)_userlimit32 : p->p_as->a_userlimit; 414 415 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); 416 } 417 418 /*ARGSUSED*/ 419 int 420 map_addr_vacalign_check(caddr_t addr, u_offset_t off) 421 { 422 return (0); 423 } 424 425 /* 426 * map_addr_proc() is the routine called when the system is to 427 * choose an address for the user. We will pick an address 428 * range which is the highest available below kernelbase. 429 * 430 * addrp is a value/result parameter. 431 * On input it is a hint from the user to be used in a completely 432 * machine dependent fashion. We decide to completely ignore this hint. 433 * 434 * On output it is NULL if no address can be found in the current 435 * processes address space or else an address that is currently 436 * not mapped for len bytes with a page of red zone on either side. 437 * 438 * align is not needed on x86 (it's for viturally addressed caches) 439 */ 440 /*ARGSUSED*/ 441 void 442 map_addr_proc( 443 caddr_t *addrp, 444 size_t len, 445 offset_t off, 446 int vacalign, 447 caddr_t userlimit, 448 struct proc *p, 449 uint_t flags) 450 { 451 struct as *as = p->p_as; 452 caddr_t addr; 453 caddr_t base; 454 size_t slen; 455 size_t align_amount; 456 457 ASSERT32(userlimit == as->a_userlimit); 458 459 base = p->p_brkbase; 460 #if defined(__amd64) 461 /* 462 * XX64 Yes, this needs more work. 463 */ 464 if (p->p_model == DATAMODEL_NATIVE) { 465 if (userlimit < as->a_userlimit) { 466 /* 467 * This happens when a program wants to map 468 * something in a range that's accessible to a 469 * program in a smaller address space. For example, 470 * a 64-bit program calling mmap32(2) to guarantee 471 * that the returned address is below 4Gbytes. 472 */ 473 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff)); 474 475 if (userlimit > base) 476 slen = userlimit - base; 477 else { 478 *addrp = NULL; 479 return; 480 } 481 } else { 482 /* 483 * XX64 This layout is probably wrong .. but in 484 * the event we make the amd64 address space look 485 * like sparcv9 i.e. with the stack -above- the 486 * heap, this bit of code might even be correct. 487 */ 488 slen = p->p_usrstack - base - 489 (((size_t)rctl_enforced_value( 490 rctlproc_legacy[RLIMIT_STACK], 491 p->p_rctls, p) + PAGEOFFSET) & PAGEMASK); 492 } 493 } else 494 #endif 495 slen = userlimit - base; 496 497 len = (len + PAGEOFFSET) & PAGEMASK; 498 499 /* 500 * Redzone for each side of the request. This is done to leave 501 * one page unmapped between segments. This is not required, but 502 * it's useful for the user because if their program strays across 503 * a segment boundary, it will catch a fault immediately making 504 * debugging a little easier. 505 */ 506 len += 2 * MMU_PAGESIZE; 507 508 /* 509 * figure out what the alignment should be 510 * 511 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same???? 512 */ 513 if (len <= ELF_386_MAXPGSZ) { 514 /* 515 * Align virtual addresses to ensure that ELF shared libraries 516 * are mapped with the appropriate alignment constraints by 517 * the run-time linker. 518 */ 519 align_amount = ELF_386_MAXPGSZ; 520 } else { 521 int l = mmu.max_page_level; 522 523 while (l && len < LEVEL_SIZE(l)) 524 --l; 525 526 align_amount = LEVEL_SIZE(l); 527 } 528 529 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount)) 530 align_amount = (uintptr_t)*addrp; 531 532 len += align_amount; 533 534 /* 535 * Look for a large enough hole starting below userlimit. 536 * After finding it, use the upper part. Addition of PAGESIZE 537 * is for the redzone as described above. 538 */ 539 if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) { 540 caddr_t as_addr; 541 542 addr = base + slen - len + MMU_PAGESIZE; 543 as_addr = addr; 544 /* 545 * Round address DOWN to the alignment amount, 546 * add the offset, and if this address is less 547 * than the original address, add alignment amount. 548 */ 549 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 550 addr += (uintptr_t)(off & (align_amount - 1)); 551 if (addr < as_addr) 552 addr += align_amount; 553 554 ASSERT(addr <= (as_addr + align_amount)); 555 ASSERT(((uintptr_t)addr & (align_amount - 1)) == 556 ((uintptr_t)(off & (align_amount - 1)))); 557 *addrp = addr; 558 } else { 559 *addrp = NULL; /* no more virtual space */ 560 } 561 } 562 563 /* 564 * Determine whether [base, base+len] contains a valid range of 565 * addresses at least minlen long. base and len are adjusted if 566 * required to provide a valid range. 567 */ 568 /*ARGSUSED3*/ 569 int 570 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 571 { 572 uintptr_t hi, lo; 573 574 lo = (uintptr_t)*basep; 575 hi = lo + *lenp; 576 577 /* 578 * If hi rolled over the top, try cutting back. 579 */ 580 if (hi < lo) { 581 if (0 - lo + hi < minlen) 582 return (0); 583 if (0 - lo < minlen) 584 return (0); 585 *lenp = 0 - lo; 586 } else if (hi - lo < minlen) { 587 return (0); 588 } 589 #if defined(__amd64) 590 /* 591 * Deal with a possible hole in the address range between 592 * hole_start and hole_end that should never be mapped. 593 */ 594 if (lo < hole_start) { 595 if (hi > hole_start) { 596 if (hi < hole_end) { 597 hi = hole_start; 598 } else { 599 /* lo < hole_start && hi >= hole_end */ 600 if (dir == AH_LO) { 601 /* 602 * prefer lowest range 603 */ 604 if (hole_start - lo >= minlen) 605 hi = hole_start; 606 else if (hi - hole_end >= minlen) 607 lo = hole_end; 608 else 609 return (0); 610 } else { 611 /* 612 * prefer highest range 613 */ 614 if (hi - hole_end >= minlen) 615 lo = hole_end; 616 else if (hole_start - lo >= minlen) 617 hi = hole_start; 618 else 619 return (0); 620 } 621 } 622 } 623 } else { 624 /* lo >= hole_start */ 625 if (hi < hole_end) 626 return (0); 627 if (lo < hole_end) 628 lo = hole_end; 629 } 630 631 if (hi - lo < minlen) 632 return (0); 633 634 *basep = (caddr_t)lo; 635 *lenp = hi - lo; 636 #endif 637 return (1); 638 } 639 640 /* 641 * Determine whether [addr, addr+len] are valid user addresses. 642 */ 643 /*ARGSUSED*/ 644 int 645 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 646 caddr_t userlimit) 647 { 648 caddr_t eaddr = addr + len; 649 650 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 651 return (RANGE_BADADDR); 652 653 #if defined(__amd64) 654 /* 655 * Check for the VA hole 656 */ 657 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 658 return (RANGE_BADADDR); 659 #endif 660 661 return (RANGE_OKAY); 662 } 663 664 /* 665 * Return 1 if the page frame is onboard memory, else 0. 666 */ 667 int 668 pf_is_memory(pfn_t pf) 669 { 670 return (address_in_memlist(phys_install, mmu_ptob((uint64_t)pf), 1)); 671 } 672 673 674 /* 675 * initialized by page_coloring_init(). 676 */ 677 uint_t page_colors; 678 uint_t page_colors_mask; 679 uint_t page_coloring_shift; 680 int cpu_page_colors; 681 static uint_t l2_colors; 682 683 /* 684 * Page freelists and cachelists are dynamically allocated once mnoderangecnt 685 * and page_colors are calculated from the l2 cache n-way set size. Within a 686 * mnode range, the page freelist and cachelist are hashed into bins based on 687 * color. This makes it easier to search for a page within a specific memory 688 * range. 689 */ 690 #define PAGE_COLORS_MIN 16 691 692 page_t ****page_freelists; 693 page_t ***page_cachelists; 694 695 /* 696 * As the PC architecture evolved memory up was clumped into several 697 * ranges for various historical I/O devices to do DMA. 698 * < 16Meg - ISA bus 699 * < 2Gig - ??? 700 * < 4Gig - PCI bus or drivers that don't understand PAE mode 701 */ 702 static pfn_t arch_memranges[NUM_MEM_RANGES] = { 703 0x100000, /* pfn range for 4G and above */ 704 0x80000, /* pfn range for 2G-4G */ 705 0x01000, /* pfn range for 16M-2G */ 706 0x00000, /* pfn range for 0-16M */ 707 }; 708 709 /* 710 * These are changed during startup if the machine has limited memory. 711 */ 712 pfn_t *memranges = &arch_memranges[0]; 713 int nranges = NUM_MEM_RANGES; 714 715 /* 716 * Used by page layer to know about page sizes 717 */ 718 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; 719 720 /* 721 * This can be patched via /etc/system to allow old non-PAE aware device 722 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM. 723 */ 724 #if defined(__i386) 725 int restricted_kmemalloc = 0; 726 #elif defined(__amd64) 727 int restricted_kmemalloc = 0; 728 #endif 729 730 kmutex_t *fpc_mutex[NPC_MUTEX]; 731 kmutex_t *cpc_mutex[NPC_MUTEX]; 732 733 734 /* 735 * return the memrange containing pfn 736 */ 737 int 738 memrange_num(pfn_t pfn) 739 { 740 int n; 741 742 for (n = 0; n < nranges - 1; ++n) { 743 if (pfn >= memranges[n]) 744 break; 745 } 746 return (n); 747 } 748 749 /* 750 * return the mnoderange containing pfn 751 */ 752 int 753 pfn_2_mtype(pfn_t pfn) 754 { 755 int n; 756 757 for (n = mnoderangecnt - 1; n >= 0; n--) { 758 if (pfn >= mnoderanges[n].mnr_pfnlo) { 759 break; 760 } 761 } 762 return (n); 763 } 764 765 /* 766 * is_contigpage_free: 767 * returns a page list of contiguous pages. It minimally has to return 768 * minctg pages. Caller determines minctg based on the scatter-gather 769 * list length. 770 * 771 * pfnp is set to the next page frame to search on return. 772 */ 773 static page_t * 774 is_contigpage_free( 775 pfn_t *pfnp, 776 pgcnt_t *pgcnt, 777 pgcnt_t minctg, 778 uint64_t pfnseg, 779 int iolock) 780 { 781 int i = 0; 782 pfn_t pfn = *pfnp; 783 page_t *pp; 784 page_t *plist = NULL; 785 786 /* 787 * fail if pfn + minctg crosses a segment boundary. 788 * Adjust for next starting pfn to begin at segment boundary. 789 */ 790 791 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 792 *pfnp = roundup(*pfnp, pfnseg + 1); 793 return (NULL); 794 } 795 796 do { 797 retry: 798 pp = page_numtopp_nolock(pfn + i); 799 if ((pp == NULL) || 800 (page_trylock(pp, SE_EXCL) == 0)) { 801 (*pfnp)++; 802 break; 803 } 804 if (page_pptonum(pp) != pfn + i) { 805 page_unlock(pp); 806 goto retry; 807 } 808 809 if (!(PP_ISFREE(pp))) { 810 page_unlock(pp); 811 (*pfnp)++; 812 break; 813 } 814 815 if (!PP_ISAGED(pp)) { 816 page_list_sub(pp, PG_CACHE_LIST); 817 page_hashout(pp, (kmutex_t *)NULL); 818 } else { 819 page_list_sub(pp, PG_FREE_LIST); 820 } 821 822 if (iolock) 823 page_io_lock(pp); 824 page_list_concat(&plist, &pp); 825 826 /* 827 * exit loop when pgcnt satisfied or segment boundary reached. 828 */ 829 830 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 831 832 *pfnp += i; /* set to next pfn to search */ 833 834 if (i >= minctg) { 835 *pgcnt -= i; 836 return (plist); 837 } 838 839 /* 840 * failure: minctg not satisfied. 841 * 842 * if next request crosses segment boundary, set next pfn 843 * to search from the segment boundary. 844 */ 845 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 846 *pfnp = roundup(*pfnp, pfnseg + 1); 847 848 /* clean up any pages already allocated */ 849 850 while (plist) { 851 pp = plist; 852 page_sub(&plist, pp); 853 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 854 if (iolock) 855 page_io_unlock(pp); 856 page_unlock(pp); 857 } 858 859 return (NULL); 860 } 861 862 /* 863 * verify that pages being returned from allocator have correct DMA attribute 864 */ 865 #ifndef DEBUG 866 #define check_dma(a, b, c) (0) 867 #else 868 static void 869 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 870 { 871 if (dma_attr == NULL) 872 return; 873 874 while (cnt-- > 0) { 875 if (mmu_ptob((uint64_t)pp->p_pagenum) < 876 dma_attr->dma_attr_addr_lo) 877 panic("PFN (pp=%p) below dma_attr_addr_lo", pp); 878 if (mmu_ptob((uint64_t)pp->p_pagenum) >= 879 dma_attr->dma_attr_addr_hi) 880 panic("PFN (pp=%p) above dma_attr_addr_hi", pp); 881 pp = pp->p_next; 882 } 883 } 884 #endif 885 886 static kmutex_t contig_lock; 887 888 #define CONTIG_LOCK() mutex_enter(&contig_lock); 889 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 890 891 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 892 893 static page_t * 894 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 895 { 896 pfn_t pfn; 897 int sgllen; 898 uint64_t pfnseg; 899 pgcnt_t minctg; 900 page_t *pplist = NULL, *plist; 901 uint64_t lo, hi; 902 pgcnt_t pfnalign = 0; 903 static pfn_t startpfn; 904 static pgcnt_t lastctgcnt; 905 uintptr_t align; 906 907 CONTIG_LOCK(); 908 909 if (mattr) { 910 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 911 hi = mmu_btop(mattr->dma_attr_addr_hi); 912 if (hi >= physmax) 913 hi = physmax - 1; 914 sgllen = mattr->dma_attr_sgllen; 915 pfnseg = mmu_btop(mattr->dma_attr_seg); 916 917 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 918 if (align > MMU_PAGESIZE) 919 pfnalign = mmu_btop(align); 920 921 /* 922 * in order to satisfy the request, must minimally 923 * acquire minctg contiguous pages 924 */ 925 minctg = howmany(*pgcnt, sgllen); 926 927 ASSERT(hi >= lo); 928 929 /* 930 * start from where last searched if the minctg >= lastctgcnt 931 */ 932 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 933 startpfn = lo; 934 } else { 935 hi = physmax - 1; 936 lo = 0; 937 sgllen = 1; 938 pfnseg = mmu.highest_pfn; 939 minctg = *pgcnt; 940 941 if (minctg < lastctgcnt) 942 startpfn = lo; 943 } 944 lastctgcnt = minctg; 945 946 ASSERT(pfnseg + 1 >= (uint64_t)minctg); 947 948 /* conserve 16m memory - start search above 16m when possible */ 949 if (hi > PFN_16M && startpfn < PFN_16M) 950 startpfn = PFN_16M; 951 952 pfn = startpfn; 953 if (pfnalign) 954 pfn = P2ROUNDUP(pfn, pfnalign); 955 956 while (pfn + minctg - 1 <= hi) { 957 958 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 959 if (plist) { 960 page_list_concat(&pplist, &plist); 961 sgllen--; 962 /* 963 * return when contig pages no longer needed 964 */ 965 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 966 startpfn = pfn; 967 CONTIG_UNLOCK(); 968 check_dma(mattr, pplist, *pgcnt); 969 return (pplist); 970 } 971 minctg = howmany(*pgcnt, sgllen); 972 } 973 if (pfnalign) 974 pfn = P2ROUNDUP(pfn, pfnalign); 975 } 976 977 /* cannot find contig pages in specified range */ 978 if (startpfn == lo) { 979 CONTIG_UNLOCK(); 980 return (NULL); 981 } 982 983 /* did not start with lo previously */ 984 pfn = lo; 985 if (pfnalign) 986 pfn = P2ROUNDUP(pfn, pfnalign); 987 988 /* allow search to go above startpfn */ 989 while (pfn < startpfn) { 990 991 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 992 if (plist != NULL) { 993 994 page_list_concat(&pplist, &plist); 995 sgllen--; 996 997 /* 998 * return when contig pages no longer needed 999 */ 1000 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1001 startpfn = pfn; 1002 CONTIG_UNLOCK(); 1003 check_dma(mattr, pplist, *pgcnt); 1004 return (pplist); 1005 } 1006 minctg = howmany(*pgcnt, sgllen); 1007 } 1008 if (pfnalign) 1009 pfn = P2ROUNDUP(pfn, pfnalign); 1010 } 1011 CONTIG_UNLOCK(); 1012 return (NULL); 1013 } 1014 1015 /* 1016 * combine mem_node_config and memrange memory ranges into one data 1017 * structure to be used for page list management. 1018 * 1019 * mnode_range_cnt() calculates the number of memory ranges for mnode and 1020 * memranges[]. Used to determine the size of page lists and mnoderanges. 1021 * 1022 * mnode_range_setup() initializes mnoderanges. 1023 */ 1024 mnoderange_t *mnoderanges; 1025 int mnoderangecnt; 1026 int mtype4g; 1027 1028 int 1029 mnode_range_cnt(int mnode) 1030 { 1031 int mri; 1032 int mnrcnt = 0; 1033 1034 if (mem_node_config[mnode].exists != 0) { 1035 mri = nranges - 1; 1036 1037 /* find the memranges index below contained in mnode range */ 1038 1039 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1040 mri--; 1041 1042 /* 1043 * increment mnode range counter when memranges or mnode 1044 * boundary is reached. 1045 */ 1046 while (mri >= 0 && 1047 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1048 mnrcnt++; 1049 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1050 mri--; 1051 else 1052 break; 1053 } 1054 } 1055 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 1056 return (mnrcnt); 1057 } 1058 1059 void 1060 mnode_range_setup(mnoderange_t *mnoderanges) 1061 { 1062 int mnode, mri; 1063 1064 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 1065 if (mem_node_config[mnode].exists == 0) 1066 continue; 1067 1068 mri = nranges - 1; 1069 1070 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1071 mri--; 1072 1073 while (mri >= 0 && mem_node_config[mnode].physmax >= 1074 MEMRANGELO(mri)) { 1075 mnoderanges->mnr_pfnlo = 1076 MAX(MEMRANGELO(mri), 1077 mem_node_config[mnode].physbase); 1078 mnoderanges->mnr_pfnhi = 1079 MIN(MEMRANGEHI(mri), 1080 mem_node_config[mnode].physmax); 1081 mnoderanges->mnr_mnode = mnode; 1082 mnoderanges->mnr_memrange = mri; 1083 mnoderanges++; 1084 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1085 mri--; 1086 else 1087 break; 1088 } 1089 } 1090 } 1091 1092 /* 1093 * Determine if the mnode range specified in mtype contains memory belonging 1094 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1095 * the range of indices from high pfn to 0, 16m or 4g. 1096 * 1097 * Return first mnode range type index found otherwise return -1 if none found. 1098 */ 1099 int 1100 mtype_func(int mnode, int mtype, uint_t flags) 1101 { 1102 if (flags & PGI_MT_RANGE) { 1103 int mtlim; 1104 1105 if (flags & PGI_MT_NEXT) 1106 mtype--; 1107 if (flags & PGI_MT_RANGE0) 1108 mtlim = 0; 1109 else if (flags & PGI_MT_RANGE4G) 1110 mtlim = mtype4g + 1; /* exclude 0-4g range */ 1111 else if (flags & PGI_MT_RANGE16M) 1112 mtlim = 1; /* exclude 0-16m range */ 1113 while (mtype >= mtlim) { 1114 if (mnoderanges[mtype].mnr_mnode == mnode) 1115 return (mtype); 1116 mtype--; 1117 } 1118 } else { 1119 if (mnoderanges[mtype].mnr_mnode == mnode) 1120 return (mtype); 1121 } 1122 return (-1); 1123 } 1124 1125 /* 1126 * Update the page list max counts with the pfn range specified by the 1127 * input parameters. Called from add_physmem() when physical memory with 1128 * page_t's are initially added to the page lists. 1129 */ 1130 void 1131 mtype_modify_max(pfn_t startpfn, long cnt) 1132 { 1133 int mtype = 0; 1134 pfn_t endpfn = startpfn + cnt, pfn; 1135 pgcnt_t inc; 1136 1137 ASSERT(cnt > 0); 1138 1139 for (pfn = startpfn; pfn < endpfn; ) { 1140 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1141 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1142 inc = endpfn - pfn; 1143 } else { 1144 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1145 } 1146 mnoderanges[mtype].mnr_mt_pgmax += inc; 1147 if (physmax4g && mtype <= mtype4g) 1148 maxmem4g += inc; 1149 pfn += inc; 1150 } 1151 mtype++; 1152 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1153 } 1154 } 1155 1156 /* 1157 * Returns the free page count for mnode 1158 */ 1159 int 1160 mnode_pgcnt(int mnode) 1161 { 1162 int mtype = mnoderangecnt - 1; 1163 int flags = PGI_MT_RANGE0; 1164 pgcnt_t pgcnt = 0; 1165 1166 mtype = mtype_func(mnode, mtype, flags); 1167 1168 while (mtype != -1) { 1169 pgcnt += MTYPE_FREEMEM(mtype); 1170 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1171 } 1172 return (pgcnt); 1173 } 1174 1175 /* 1176 * Initialize page coloring variables based on the l2 cache parameters. 1177 * Calculate and return memory needed for page coloring data structures. 1178 */ 1179 size_t 1180 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 1181 { 1182 size_t colorsz = 0; 1183 int i; 1184 int colors; 1185 1186 /* 1187 * Reduce the memory ranges lists if we don't have large amounts 1188 * of memory. This avoids searching known empty free lists. 1189 */ 1190 i = memrange_num(physmax); 1191 memranges += i; 1192 nranges -= i; 1193 #if defined(__i386) 1194 if (i > 0) 1195 restricted_kmemalloc = 0; 1196 #endif 1197 /* physmax greater than 4g */ 1198 if (i == 0) 1199 physmax4g = 1; 1200 1201 ASSERT(ISP2(l2_sz)); 1202 ASSERT(ISP2(l2_linesz)); 1203 ASSERT(l2_sz > MMU_PAGESIZE); 1204 1205 /* l2_assoc is 0 for fully associative l2 cache */ 1206 if (l2_assoc) 1207 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 1208 else 1209 l2_colors = 1; 1210 1211 /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 1212 page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 1213 1214 /* 1215 * cpu_page_colors is non-zero when a page color may be spread across 1216 * multiple bins. 1217 */ 1218 if (l2_colors < page_colors) 1219 cpu_page_colors = l2_colors; 1220 1221 ASSERT(ISP2(page_colors)); 1222 1223 page_colors_mask = page_colors - 1; 1224 1225 ASSERT(ISP2(CPUSETSIZE())); 1226 page_coloring_shift = lowbit(CPUSETSIZE()); 1227 1228 /* initialize number of colors per page size */ 1229 for (i = 0; i <= mmu.max_page_level; i++) { 1230 hw_page_array[i].hp_size = LEVEL_SIZE(i); 1231 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 1232 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 1233 hw_page_array[i].hp_colors = (page_colors_mask >> 1234 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 1235 + 1; 1236 } 1237 1238 /* 1239 * The value of cpu_page_colors determines if additional color bins 1240 * need to be checked for a particular color in the page_get routines. 1241 */ 1242 if (cpu_page_colors != 0) { 1243 1244 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 1245 ASSERT(a > 0); 1246 ASSERT(a < 16); 1247 1248 for (i = 0; i <= mmu.max_page_level; i++) { 1249 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1250 colorequivszc[i] = 0; 1251 continue; 1252 } 1253 while ((colors >> a) == 0) 1254 a--; 1255 ASSERT(a >= 0); 1256 1257 /* higher 4 bits encodes color equiv mask */ 1258 colorequivszc[i] = (a << 4); 1259 } 1260 } 1261 1262 /* factor in colorequiv to check additional 'equivalent' bins. */ 1263 if (colorequiv > 1) { 1264 1265 int a = lowbit(colorequiv) - 1; 1266 if (a > 15) 1267 a = 15; 1268 1269 for (i = 0; i <= mmu.max_page_level; i++) { 1270 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1271 continue; 1272 } 1273 while ((colors >> a) == 0) 1274 a--; 1275 if ((a << 4) > colorequivszc[i]) { 1276 colorequivszc[i] = (a << 4); 1277 } 1278 } 1279 } 1280 1281 /* size for mnoderanges */ 1282 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 1283 mnoderangecnt += mnode_range_cnt(i); 1284 colorsz = mnoderangecnt * sizeof (mnoderange_t); 1285 1286 /* size for fpc_mutex and cpc_mutex */ 1287 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 1288 1289 /* size of page_freelists */ 1290 colorsz += mnoderangecnt * sizeof (page_t ***); 1291 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 1292 1293 for (i = 0; i < mmu_page_sizes; i++) { 1294 colors = page_get_pagecolors(i); 1295 colorsz += mnoderangecnt * colors * sizeof (page_t *); 1296 } 1297 1298 /* size of page_cachelists */ 1299 colorsz += mnoderangecnt * sizeof (page_t **); 1300 colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 1301 1302 return (colorsz); 1303 } 1304 1305 /* 1306 * Called once at startup to configure page_coloring data structures and 1307 * does the 1st page_free()/page_freelist_add(). 1308 */ 1309 void 1310 page_coloring_setup(caddr_t pcmemaddr) 1311 { 1312 int i; 1313 int j; 1314 int k; 1315 caddr_t addr; 1316 int colors; 1317 1318 /* 1319 * do page coloring setup 1320 */ 1321 addr = pcmemaddr; 1322 1323 mnoderanges = (mnoderange_t *)addr; 1324 addr += (mnoderangecnt * sizeof (mnoderange_t)); 1325 1326 mnode_range_setup(mnoderanges); 1327 1328 if (physmax4g) 1329 mtype4g = pfn_2_mtype(0xfffff); 1330 1331 for (k = 0; k < NPC_MUTEX; k++) { 1332 fpc_mutex[k] = (kmutex_t *)addr; 1333 addr += (max_mem_nodes * sizeof (kmutex_t)); 1334 } 1335 for (k = 0; k < NPC_MUTEX; k++) { 1336 cpc_mutex[k] = (kmutex_t *)addr; 1337 addr += (max_mem_nodes * sizeof (kmutex_t)); 1338 } 1339 page_freelists = (page_t ****)addr; 1340 addr += (mnoderangecnt * sizeof (page_t ***)); 1341 1342 page_cachelists = (page_t ***)addr; 1343 addr += (mnoderangecnt * sizeof (page_t **)); 1344 1345 for (i = 0; i < mnoderangecnt; i++) { 1346 page_freelists[i] = (page_t ***)addr; 1347 addr += (mmu_page_sizes * sizeof (page_t **)); 1348 1349 for (j = 0; j < mmu_page_sizes; j++) { 1350 colors = page_get_pagecolors(j); 1351 page_freelists[i][j] = (page_t **)addr; 1352 addr += (colors * sizeof (page_t *)); 1353 } 1354 page_cachelists[i] = (page_t **)addr; 1355 addr += (page_colors * sizeof (page_t *)); 1356 } 1357 } 1358 1359 /*ARGSUSED*/ 1360 int 1361 bp_color(struct buf *bp) 1362 { 1363 return (0); 1364 } 1365 1366 /* 1367 * get a page from any list with the given mnode 1368 */ 1369 page_t * 1370 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 1371 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 1372 { 1373 kmutex_t *pcm; 1374 int i; 1375 page_t *pp; 1376 page_t *first_pp; 1377 uint64_t pgaddr; 1378 ulong_t bin; 1379 int mtypestart; 1380 int plw_initialized; 1381 page_list_walker_t plw; 1382 1383 VM_STAT_ADD(pga_vmstats.pgma_alloc); 1384 1385 ASSERT((flags & PG_MATCH_COLOR) == 0); 1386 ASSERT(szc == 0); 1387 ASSERT(dma_attr != NULL); 1388 1389 MTYPE_START(mnode, mtype, flags); 1390 if (mtype < 0) { 1391 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 1392 return (NULL); 1393 } 1394 1395 mtypestart = mtype; 1396 1397 bin = origbin; 1398 1399 /* 1400 * check up to page_colors + 1 bins - origbin may be checked twice 1401 * because of BIN_STEP skip 1402 */ 1403 do { 1404 plw_initialized = 0; 1405 1406 for (plw.plw_count = 0; 1407 plw.plw_count < page_colors; plw.plw_count++) { 1408 1409 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 1410 goto nextfreebin; 1411 1412 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1413 mutex_enter(pcm); 1414 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 1415 first_pp = pp; 1416 while (pp != NULL) { 1417 if (page_trylock(pp, SE_EXCL) == 0) { 1418 pp = pp->p_next; 1419 if (pp == first_pp) { 1420 pp = NULL; 1421 } 1422 continue; 1423 } 1424 1425 ASSERT(PP_ISFREE(pp)); 1426 ASSERT(PP_ISAGED(pp)); 1427 ASSERT(pp->p_vnode == NULL); 1428 ASSERT(pp->p_hash == NULL); 1429 ASSERT(pp->p_offset == (u_offset_t)-1); 1430 ASSERT(pp->p_szc == szc); 1431 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 1432 /* check if page within DMA attributes */ 1433 pgaddr = mmu_ptob((uint64_t)(pp->p_pagenum)); 1434 1435 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 1436 (pgaddr + MMU_PAGESIZE - 1 <= 1437 dma_attr->dma_attr_addr_hi)) { 1438 break; 1439 } 1440 1441 /* continue looking */ 1442 page_unlock(pp); 1443 pp = pp->p_next; 1444 if (pp == first_pp) 1445 pp = NULL; 1446 1447 } 1448 if (pp != NULL) { 1449 ASSERT(mtype == PP_2_MTYPE(pp)); 1450 ASSERT(pp->p_szc == 0); 1451 1452 /* found a page with specified DMA attributes */ 1453 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 1454 mtype), pp); 1455 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1456 1457 if ((PP_ISFREE(pp) == 0) || 1458 (PP_ISAGED(pp) == 0)) { 1459 cmn_err(CE_PANIC, "page %p is not free", 1460 (void *)pp); 1461 } 1462 1463 mutex_exit(pcm); 1464 check_dma(dma_attr, pp, 1); 1465 VM_STAT_ADD(pga_vmstats.pgma_allocok); 1466 return (pp); 1467 } 1468 mutex_exit(pcm); 1469 nextfreebin: 1470 if (plw_initialized == 0) { 1471 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 1472 ASSERT(plw.plw_ceq_dif == page_colors); 1473 plw_initialized = 1; 1474 } 1475 1476 if (plw.plw_do_split) { 1477 pp = page_freelist_split(szc, bin, mnode, 1478 mtype, 1479 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 1480 &plw); 1481 if (pp != NULL) 1482 return (pp); 1483 } 1484 1485 bin = page_list_walk_next_bin(szc, bin, &plw); 1486 } 1487 1488 MTYPE_NEXT(mnode, mtype, flags); 1489 } while (mtype >= 0); 1490 1491 /* failed to find a page in the freelist; try it in the cachelist */ 1492 1493 /* reset mtype start for cachelist search */ 1494 mtype = mtypestart; 1495 ASSERT(mtype >= 0); 1496 1497 /* start with the bin of matching color */ 1498 bin = origbin; 1499 1500 do { 1501 for (i = 0; i <= page_colors; i++) { 1502 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 1503 goto nextcachebin; 1504 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 1505 mutex_enter(pcm); 1506 pp = PAGE_CACHELISTS(mnode, bin, mtype); 1507 first_pp = pp; 1508 while (pp != NULL) { 1509 if (page_trylock(pp, SE_EXCL) == 0) { 1510 pp = pp->p_next; 1511 if (pp == first_pp) 1512 break; 1513 continue; 1514 } 1515 ASSERT(pp->p_vnode); 1516 ASSERT(PP_ISAGED(pp) == 0); 1517 ASSERT(pp->p_szc == 0); 1518 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 1519 1520 /* check if page within DMA attributes */ 1521 1522 pgaddr = ptob((uint64_t)(pp->p_pagenum)); 1523 1524 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 1525 (pgaddr + MMU_PAGESIZE - 1 <= 1526 dma_attr->dma_attr_addr_hi)) { 1527 break; 1528 } 1529 1530 /* continue looking */ 1531 page_unlock(pp); 1532 pp = pp->p_next; 1533 if (pp == first_pp) 1534 pp = NULL; 1535 } 1536 1537 if (pp != NULL) { 1538 ASSERT(mtype == PP_2_MTYPE(pp)); 1539 ASSERT(pp->p_szc == 0); 1540 1541 /* found a page with specified DMA attributes */ 1542 page_sub(&PAGE_CACHELISTS(mnode, bin, 1543 mtype), pp); 1544 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 1545 1546 mutex_exit(pcm); 1547 ASSERT(pp->p_vnode); 1548 ASSERT(PP_ISAGED(pp) == 0); 1549 check_dma(dma_attr, pp, 1); 1550 VM_STAT_ADD(pga_vmstats.pgma_allocok); 1551 return (pp); 1552 } 1553 mutex_exit(pcm); 1554 nextcachebin: 1555 bin += (i == 0) ? BIN_STEP : 1; 1556 bin &= page_colors_mask; 1557 } 1558 MTYPE_NEXT(mnode, mtype, flags); 1559 } while (mtype >= 0); 1560 1561 VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 1562 return (NULL); 1563 } 1564 1565 /* 1566 * This function is similar to page_get_freelist()/page_get_cachelist() 1567 * but it searches both the lists to find a page with the specified 1568 * color (or no color) and DMA attributes. The search is done in the 1569 * freelist first and then in the cache list within the highest memory 1570 * range (based on DMA attributes) before searching in the lower 1571 * memory ranges. 1572 * 1573 * Note: This function is called only by page_create_io(). 1574 */ 1575 /*ARGSUSED*/ 1576 page_t * 1577 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 1578 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 1579 { 1580 uint_t bin; 1581 int mtype; 1582 page_t *pp; 1583 int n; 1584 int m; 1585 int szc; 1586 int fullrange; 1587 int mnode; 1588 int local_failed_stat = 0; 1589 lgrp_mnode_cookie_t lgrp_cookie; 1590 1591 VM_STAT_ADD(pga_vmstats.pga_alloc); 1592 1593 /* only base pagesize currently supported */ 1594 if (size != MMU_PAGESIZE) 1595 return (NULL); 1596 1597 /* 1598 * If we're passed a specific lgroup, we use it. Otherwise, 1599 * assume first-touch placement is desired. 1600 */ 1601 if (!LGRP_EXISTS(lgrp)) 1602 lgrp = lgrp_home_lgrp(); 1603 1604 /* LINTED */ 1605 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 1606 1607 /* 1608 * Only hold one freelist or cachelist lock at a time, that way we 1609 * can start anywhere and not have to worry about lock 1610 * ordering. 1611 */ 1612 if (dma_attr == NULL) { 1613 n = 0; 1614 m = mnoderangecnt - 1; 1615 fullrange = 1; 1616 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 1617 } else { 1618 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 1619 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 1620 1621 /* 1622 * We can guarantee alignment only for page boundary. 1623 */ 1624 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 1625 return (NULL); 1626 1627 n = pfn_2_mtype(pfnlo); 1628 m = pfn_2_mtype(pfnhi); 1629 1630 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 1631 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 1632 } 1633 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 1634 1635 if (n > m) 1636 return (NULL); 1637 1638 szc = 0; 1639 1640 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 1641 if (n == 0) { 1642 flags |= PGI_MT_RANGE0; 1643 n = m; 1644 } 1645 1646 /* 1647 * Try local memory node first, but try remote if we can't 1648 * get a page of the right color. 1649 */ 1650 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 1651 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 1652 /* 1653 * allocate pages from high pfn to low. 1654 */ 1655 for (mtype = m; mtype >= n; mtype--) { 1656 if (fullrange != 0) { 1657 pp = page_get_mnode_freelist(mnode, 1658 bin, mtype, szc, flags); 1659 if (pp == NULL) { 1660 pp = page_get_mnode_cachelist( 1661 bin, flags, mnode, mtype); 1662 } 1663 } else { 1664 pp = page_get_mnode_anylist(bin, szc, 1665 flags, mnode, mtype, dma_attr); 1666 } 1667 if (pp != NULL) { 1668 VM_STAT_ADD(pga_vmstats.pga_allocok); 1669 check_dma(dma_attr, pp, 1); 1670 return (pp); 1671 } 1672 } 1673 if (!local_failed_stat) { 1674 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 1675 local_failed_stat = 1; 1676 } 1677 } 1678 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 1679 1680 return (NULL); 1681 } 1682 1683 /* 1684 * page_create_io() 1685 * 1686 * This function is a copy of page_create_va() with an additional 1687 * argument 'mattr' that specifies DMA memory requirements to 1688 * the page list functions. This function is used by the segkmem 1689 * allocator so it is only to create new pages (i.e PG_EXCL is 1690 * set). 1691 * 1692 * Note: This interface is currently used by x86 PSM only and is 1693 * not fully specified so the commitment level is only for 1694 * private interface specific to x86. This interface uses PSM 1695 * specific page_get_anylist() interface. 1696 */ 1697 1698 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 1699 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 1700 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 1701 break; \ 1702 } \ 1703 } 1704 1705 1706 page_t * 1707 page_create_io( 1708 struct vnode *vp, 1709 u_offset_t off, 1710 uint_t bytes, 1711 uint_t flags, 1712 struct as *as, 1713 caddr_t vaddr, 1714 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 1715 { 1716 page_t *plist = NULL; 1717 uint_t plist_len = 0; 1718 pgcnt_t npages; 1719 page_t *npp = NULL; 1720 uint_t pages_req; 1721 page_t *pp; 1722 kmutex_t *phm = NULL; 1723 uint_t index; 1724 1725 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 1726 "page_create_start:vp %p off %llx bytes %u flags %x", 1727 vp, off, bytes, flags); 1728 1729 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 1730 1731 pages_req = npages = mmu_btopr(bytes); 1732 1733 /* 1734 * Do the freemem and pcf accounting. 1735 */ 1736 if (!page_create_wait(npages, flags)) { 1737 return (NULL); 1738 } 1739 1740 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 1741 "page_create_success:vp %p off %llx", 1742 vp, off); 1743 1744 /* 1745 * If satisfying this request has left us with too little 1746 * memory, start the wheels turning to get some back. The 1747 * first clause of the test prevents waking up the pageout 1748 * daemon in situations where it would decide that there's 1749 * nothing to do. 1750 */ 1751 if (nscan < desscan && freemem < minfree) { 1752 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 1753 "pageout_cv_signal:freemem %ld", freemem); 1754 cv_signal(&proc_pageout->p_cv); 1755 } 1756 1757 if (flags & PG_PHYSCONTIG) { 1758 1759 plist = page_get_contigpage(&npages, mattr, 1); 1760 if (plist == NULL) { 1761 page_create_putback(npages); 1762 return (NULL); 1763 } 1764 1765 pp = plist; 1766 1767 do { 1768 if (!page_hashin(pp, vp, off, NULL)) { 1769 panic("pg_creat_io: hashin failed %p %p %llx", 1770 (void *)pp, (void *)vp, off); 1771 } 1772 VM_STAT_ADD(page_create_new); 1773 off += MMU_PAGESIZE; 1774 PP_CLRFREE(pp); 1775 PP_CLRAGED(pp); 1776 page_set_props(pp, P_REF); 1777 pp = pp->p_next; 1778 } while (pp != plist); 1779 1780 if (!npages) { 1781 check_dma(mattr, plist, pages_req); 1782 return (plist); 1783 } else { 1784 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 1785 } 1786 1787 /* 1788 * fall-thru: 1789 * 1790 * page_get_contigpage returns when npages <= sgllen. 1791 * Grab the rest of the non-contig pages below from anylist. 1792 */ 1793 } 1794 1795 /* 1796 * Loop around collecting the requested number of pages. 1797 * Most of the time, we have to `create' a new page. With 1798 * this in mind, pull the page off the free list before 1799 * getting the hash lock. This will minimize the hash 1800 * lock hold time, nesting, and the like. If it turns 1801 * out we don't need the page, we put it back at the end. 1802 */ 1803 while (npages--) { 1804 phm = NULL; 1805 1806 index = PAGE_HASH_FUNC(vp, off); 1807 top: 1808 ASSERT(phm == NULL); 1809 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 1810 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1811 1812 if (npp == NULL) { 1813 /* 1814 * Try to get the page of any color either from 1815 * the freelist or from the cache list. 1816 */ 1817 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 1818 flags & ~PG_MATCH_COLOR, mattr, NULL); 1819 if (npp == NULL) { 1820 if (mattr == NULL) { 1821 /* 1822 * Not looking for a special page; 1823 * panic! 1824 */ 1825 panic("no page found %d", (int)npages); 1826 } 1827 /* 1828 * No page found! This can happen 1829 * if we are looking for a page 1830 * within a specific memory range 1831 * for DMA purposes. If PG_WAIT is 1832 * specified then we wait for a 1833 * while and then try again. The 1834 * wait could be forever if we 1835 * don't get the page(s) we need. 1836 * 1837 * Note: XXX We really need a mechanism 1838 * to wait for pages in the desired 1839 * range. For now, we wait for any 1840 * pages and see if we can use it. 1841 */ 1842 1843 if ((mattr != NULL) && (flags & PG_WAIT)) { 1844 delay(10); 1845 goto top; 1846 } 1847 1848 goto fail; /* undo accounting stuff */ 1849 } 1850 1851 if (PP_ISAGED(npp) == 0) { 1852 /* 1853 * Since this page came from the 1854 * cachelist, we must destroy the 1855 * old vnode association. 1856 */ 1857 page_hashout(npp, (kmutex_t *)NULL); 1858 } 1859 } 1860 1861 /* 1862 * We own this page! 1863 */ 1864 ASSERT(PAGE_EXCL(npp)); 1865 ASSERT(npp->p_vnode == NULL); 1866 ASSERT(!hat_page_is_mapped(npp)); 1867 PP_CLRFREE(npp); 1868 PP_CLRAGED(npp); 1869 1870 /* 1871 * Here we have a page in our hot little mits and are 1872 * just waiting to stuff it on the appropriate lists. 1873 * Get the mutex and check to see if it really does 1874 * not exist. 1875 */ 1876 phm = PAGE_HASH_MUTEX(index); 1877 mutex_enter(phm); 1878 PAGE_HASH_SEARCH(index, pp, vp, off); 1879 if (pp == NULL) { 1880 VM_STAT_ADD(page_create_new); 1881 pp = npp; 1882 npp = NULL; 1883 if (!page_hashin(pp, vp, off, phm)) { 1884 /* 1885 * Since we hold the page hash mutex and 1886 * just searched for this page, page_hashin 1887 * had better not fail. If it does, that 1888 * means somethread did not follow the 1889 * page hash mutex rules. Panic now and 1890 * get it over with. As usual, go down 1891 * holding all the locks. 1892 */ 1893 ASSERT(MUTEX_HELD(phm)); 1894 panic("page_create: hashin fail %p %p %llx %p", 1895 (void *)pp, (void *)vp, off, (void *)phm); 1896 1897 } 1898 ASSERT(MUTEX_HELD(phm)); 1899 mutex_exit(phm); 1900 phm = NULL; 1901 1902 /* 1903 * Hat layer locking need not be done to set 1904 * the following bits since the page is not hashed 1905 * and was on the free list (i.e., had no mappings). 1906 * 1907 * Set the reference bit to protect 1908 * against immediate pageout 1909 * 1910 * XXXmh modify freelist code to set reference 1911 * bit so we don't have to do it here. 1912 */ 1913 page_set_props(pp, P_REF); 1914 } else { 1915 ASSERT(MUTEX_HELD(phm)); 1916 mutex_exit(phm); 1917 phm = NULL; 1918 /* 1919 * NOTE: This should not happen for pages associated 1920 * with kernel vnode 'kvp'. 1921 */ 1922 /* XX64 - to debug why this happens! */ 1923 ASSERT(vp != &kvp); 1924 if (vp == &kvp) 1925 cmn_err(CE_NOTE, 1926 "page_create: page not expected " 1927 "in hash list for kernel vnode - pp 0x%p", 1928 (void *)pp); 1929 VM_STAT_ADD(page_create_exists); 1930 goto fail; 1931 } 1932 1933 /* 1934 * Got a page! It is locked. Acquire the i/o 1935 * lock since we are going to use the p_next and 1936 * p_prev fields to link the requested pages together. 1937 */ 1938 page_io_lock(pp); 1939 page_add(&plist, pp); 1940 plist = plist->p_next; 1941 off += MMU_PAGESIZE; 1942 vaddr += MMU_PAGESIZE; 1943 } 1944 1945 check_dma(mattr, plist, pages_req); 1946 return (plist); 1947 1948 fail: 1949 if (npp != NULL) { 1950 /* 1951 * Did not need this page after all. 1952 * Put it back on the free list. 1953 */ 1954 VM_STAT_ADD(page_create_putbacks); 1955 PP_SETFREE(npp); 1956 PP_SETAGED(npp); 1957 npp->p_offset = (u_offset_t)-1; 1958 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 1959 page_unlock(npp); 1960 } 1961 1962 /* 1963 * Give up the pages we already got. 1964 */ 1965 while (plist != NULL) { 1966 pp = plist; 1967 page_sub(&plist, pp); 1968 page_io_unlock(pp); 1969 plist_len++; 1970 /*LINTED: constant in conditional ctx*/ 1971 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1972 } 1973 1974 /* 1975 * VN_DISPOSE does freemem accounting for the pages in plist 1976 * by calling page_free. So, we need to undo the pcf accounting 1977 * for only the remaining pages. 1978 */ 1979 VM_STAT_ADD(page_create_putbacks); 1980 page_create_putback(pages_req - plist_len); 1981 1982 return (NULL); 1983 } 1984 1985 1986 /* 1987 * Copy the data from the physical page represented by "frompp" to 1988 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 1989 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 1990 * level and no one sleeps with an active mapping there. 1991 * 1992 * Note that the ref/mod bits in the page_t's are not affected by 1993 * this operation, hence it is up to the caller to update them appropriately. 1994 */ 1995 int 1996 ppcopy(page_t *frompp, page_t *topp) 1997 { 1998 caddr_t pp_addr1; 1999 caddr_t pp_addr2; 2000 void *pte1; 2001 void *pte2; 2002 kmutex_t *ppaddr_mutex; 2003 label_t ljb; 2004 int ret = 1; 2005 2006 ASSERT_STACK_ALIGNED(); 2007 ASSERT(PAGE_LOCKED(frompp)); 2008 ASSERT(PAGE_LOCKED(topp)); 2009 2010 if (kpm_enable) { 2011 pp_addr1 = hat_kpm_page2va(frompp, 0); 2012 pp_addr2 = hat_kpm_page2va(topp, 0); 2013 kpreempt_disable(); 2014 } else { 2015 /* 2016 * disable pre-emption so that CPU can't change 2017 */ 2018 kpreempt_disable(); 2019 2020 pp_addr1 = CPU->cpu_caddr1; 2021 pp_addr2 = CPU->cpu_caddr2; 2022 pte1 = (void *)CPU->cpu_caddr1pte; 2023 pte2 = (void *)CPU->cpu_caddr2pte; 2024 2025 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 2026 mutex_enter(ppaddr_mutex); 2027 2028 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 2029 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 2030 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 2031 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 2032 HAT_LOAD_NOCONSIST); 2033 } 2034 2035 if (on_fault(&ljb)) { 2036 ret = 0; 2037 goto faulted; 2038 } 2039 if (use_sse_pagecopy) 2040 hwblkpagecopy(pp_addr1, pp_addr2); 2041 else 2042 bcopy(pp_addr1, pp_addr2, PAGESIZE); 2043 2044 no_fault(); 2045 faulted: 2046 if (!kpm_enable) 2047 mutex_exit(ppaddr_mutex); 2048 kpreempt_enable(); 2049 return (ret); 2050 } 2051 2052 /* 2053 * Zero the physical page from off to off + len given by `pp' 2054 * without changing the reference and modified bits of page. 2055 * 2056 * We use this using CPU private page address #2, see ppcopy() for more info. 2057 * pagezero() must not be called at interrupt level. 2058 */ 2059 void 2060 pagezero(page_t *pp, uint_t off, uint_t len) 2061 { 2062 caddr_t pp_addr2; 2063 void *pte2; 2064 kmutex_t *ppaddr_mutex; 2065 2066 ASSERT_STACK_ALIGNED(); 2067 ASSERT(len <= MMU_PAGESIZE); 2068 ASSERT(off <= MMU_PAGESIZE); 2069 ASSERT(off + len <= MMU_PAGESIZE); 2070 ASSERT(PAGE_LOCKED(pp)); 2071 2072 if (kpm_enable) { 2073 pp_addr2 = hat_kpm_page2va(pp, 0); 2074 kpreempt_disable(); 2075 } else { 2076 kpreempt_disable(); 2077 2078 pp_addr2 = CPU->cpu_caddr2; 2079 pte2 = (void *)CPU->cpu_caddr2pte; 2080 2081 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 2082 mutex_enter(ppaddr_mutex); 2083 2084 hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2, 2085 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 2086 HAT_LOAD_NOCONSIST); 2087 } 2088 2089 if (use_sse_pagezero) 2090 hwblkclr(pp_addr2 + off, len); 2091 else 2092 bzero(pp_addr2 + off, len); 2093 2094 if (!kpm_enable) 2095 mutex_exit(ppaddr_mutex); 2096 kpreempt_enable(); 2097 } 2098 2099 /* 2100 * Platform-dependent page scrub call. 2101 */ 2102 void 2103 pagescrub(page_t *pp, uint_t off, uint_t len) 2104 { 2105 /* 2106 * For now, we rely on the fact that pagezero() will 2107 * always clear UEs. 2108 */ 2109 pagezero(pp, off, len); 2110 } 2111 2112 /* 2113 * set up two private addresses for use on a given CPU for use in ppcopy() 2114 */ 2115 void 2116 setup_vaddr_for_ppcopy(struct cpu *cpup) 2117 { 2118 void *addr; 2119 void *pte; 2120 2121 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 2122 pte = hat_mempte_setup(addr); 2123 cpup->cpu_caddr1 = addr; 2124 cpup->cpu_caddr1pte = (pteptr_t)pte; 2125 2126 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 2127 pte = hat_mempte_setup(addr); 2128 cpup->cpu_caddr2 = addr; 2129 cpup->cpu_caddr2pte = (pteptr_t)pte; 2130 2131 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 2132 } 2133 2134 2135 /* 2136 * Create the pageout scanner thread. The thread has to 2137 * start at procedure with process pp and priority pri. 2138 */ 2139 void 2140 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) 2141 { 2142 (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); 2143 } 2144 2145 /* 2146 * Function for flushing D-cache when performing module relocations 2147 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 2148 */ 2149 void 2150 dcache_flushall() 2151 {} 2152 2153 size_t 2154 exec_get_spslew(void) 2155 { 2156 return (0); 2157 } 2158