/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

/* 16m memory management: desired number of free pages below 16m. */
pgcnt_t desfree16m = 0x380;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
				continue;
			} else {
				pgsz = LEVEL_SIZE(l);
			}
			break;
		}

		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
		    p->p_stkpageszc);
		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
			pgsz = hw_page_array[mszc].hp_size;
		}
		return (pgsz);

	/*
	 * for ISM use the 1st large page size.
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (pgsz);
}

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_page_sizes - 1; i > 0; i--) {
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		/*
		 * Set szcvec to the remaining page sizes.
		 */
		szcvec = ((1 << (i + 1)) - 1) & ~1;
		break;
	}
	return (szcvec);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t max_lpsize = mcntl0_lpsize;

	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		if (!memcntl)
			max_lpsize = max_utext_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		if (!memcntl)
			max_lpsize = max_uidata_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		if (!memcntl)
			max_lpsize = max_shm_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		if (!memcntl)
			max_lpsize = max_uheap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		if (!memcntl)
			max_lpsize = max_ustack_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else {
		if (!memcntl)
			max_lpsize = max_privmap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));
	}
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * the rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    (((size_t)rctl_enforced_value(
			    rctlproc_legacy[RLIMIT_STACK],
			    p->p_rctls, p) + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * Redzone for each side of the request. This is done to leave
	 * one page unmapped between segments. This is not required, but
	 * it's useful for the user because if their program strays across
	 * a segment boundary, it will catch a fault immediately making
	 * debugging a little easier.
	 */
	len += 2 * MMU_PAGESIZE;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.max_page_level;

		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	len += align_amount;

	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.  Addition of PAGESIZE
	 * is for the redzone as described above.
	 */
	if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) {
		caddr_t as_addr;

		addr = base + slen - len + MMU_PAGESIZE;
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount,
		 * add the offset, and if this address is less
		 * than the original address, add alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)(off & (align_amount - 1));
		if (addr < as_addr)
			addr += align_amount;

		ASSERT(addr <= (as_addr + align_amount));
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off & (align_amount - 1))));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

/*
 * Determine whether [base, base+len] contains a valid range of
 * addresses at least minlen long. base and len are adjusted if
 * required to provide a valid range.
 */
/*ARGSUSED3*/
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	uintptr_t hi, lo;

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		if (0 - lo + hi < minlen)
			return (0);
		if (0 - lo < minlen)
			return (0);
		*lenp = 0 - lo;
	} else if (hi - lo < minlen) {
		return (0);
	}
#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
#endif
	return (1);
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	if (pfn_is_foreign(pf))
		return (0);
	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
}


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges so that various historical I/O devices could do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 */
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	0x100000,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	0x01000,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};

/*
 * These are changed during startup if the machine has limited memory.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
727 */ 728 #if defined(__i386) 729 int restricted_kmemalloc = 0; 730 #elif defined(__amd64) 731 int restricted_kmemalloc = 0; 732 #endif 733 734 kmutex_t *fpc_mutex[NPC_MUTEX]; 735 kmutex_t *cpc_mutex[NPC_MUTEX]; 736 737 738 /* 739 * return the memrange containing pfn 740 */ 741 int 742 memrange_num(pfn_t pfn) 743 { 744 int n; 745 746 for (n = 0; n < nranges - 1; ++n) { 747 if (pfn >= memranges[n]) 748 break; 749 } 750 return (n); 751 } 752 753 /* 754 * return the mnoderange containing pfn 755 */ 756 int 757 pfn_2_mtype(pfn_t pfn) 758 { 759 int n; 760 761 for (n = mnoderangecnt - 1; n >= 0; n--) { 762 if (pfn >= mnoderanges[n].mnr_pfnlo) { 763 break; 764 } 765 } 766 return (n); 767 } 768 769 /* 770 * is_contigpage_free: 771 * returns a page list of contiguous pages. It minimally has to return 772 * minctg pages. Caller determines minctg based on the scatter-gather 773 * list length. 774 * 775 * pfnp is set to the next page frame to search on return. 776 */ 777 static page_t * 778 is_contigpage_free( 779 pfn_t *pfnp, 780 pgcnt_t *pgcnt, 781 pgcnt_t minctg, 782 uint64_t pfnseg, 783 int iolock) 784 { 785 int i = 0; 786 pfn_t pfn = *pfnp; 787 page_t *pp; 788 page_t *plist = NULL; 789 790 /* 791 * fail if pfn + minctg crosses a segment boundary. 792 * Adjust for next starting pfn to begin at segment boundary. 793 */ 794 795 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 796 *pfnp = roundup(*pfnp, pfnseg + 1); 797 return (NULL); 798 } 799 800 do { 801 retry: 802 pp = page_numtopp_nolock(pfn + i); 803 if ((pp == NULL) || 804 (page_trylock(pp, SE_EXCL) == 0)) { 805 (*pfnp)++; 806 break; 807 } 808 if (page_pptonum(pp) != pfn + i) { 809 page_unlock(pp); 810 goto retry; 811 } 812 813 if (!(PP_ISFREE(pp))) { 814 page_unlock(pp); 815 (*pfnp)++; 816 break; 817 } 818 819 if (!PP_ISAGED(pp)) { 820 page_list_sub(pp, PG_CACHE_LIST); 821 page_hashout(pp, (kmutex_t *)NULL); 822 } else { 823 page_list_sub(pp, PG_FREE_LIST); 824 } 825 826 if (iolock) 827 page_io_lock(pp); 828 page_list_concat(&plist, &pp); 829 830 /* 831 * exit loop when pgcnt satisfied or segment boundary reached. 832 */ 833 834 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 835 836 *pfnp += i; /* set to next pfn to search */ 837 838 if (i >= minctg) { 839 *pgcnt -= i; 840 return (plist); 841 } 842 843 /* 844 * failure: minctg not satisfied. 845 * 846 * if next request crosses segment boundary, set next pfn 847 * to search from the segment boundary. 
848 */ 849 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 850 *pfnp = roundup(*pfnp, pfnseg + 1); 851 852 /* clean up any pages already allocated */ 853 854 while (plist) { 855 pp = plist; 856 page_sub(&plist, pp); 857 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 858 if (iolock) 859 page_io_unlock(pp); 860 page_unlock(pp); 861 } 862 863 return (NULL); 864 } 865 866 /* 867 * verify that pages being returned from allocator have correct DMA attribute 868 */ 869 #ifndef DEBUG 870 #define check_dma(a, b, c) (0) 871 #else 872 static void 873 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 874 { 875 if (dma_attr == NULL) 876 return; 877 878 while (cnt-- > 0) { 879 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) < 880 dma_attr->dma_attr_addr_lo) 881 panic("PFN (pp=%p) below dma_attr_addr_lo", pp); 882 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >= 883 dma_attr->dma_attr_addr_hi) 884 panic("PFN (pp=%p) above dma_attr_addr_hi", pp); 885 pp = pp->p_next; 886 } 887 } 888 #endif 889 890 static kmutex_t contig_lock; 891 892 #define CONTIG_LOCK() mutex_enter(&contig_lock); 893 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 894 895 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 896 897 static page_t * 898 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 899 { 900 pfn_t pfn; 901 int sgllen; 902 uint64_t pfnseg; 903 pgcnt_t minctg; 904 page_t *pplist = NULL, *plist; 905 uint64_t lo, hi; 906 pgcnt_t pfnalign = 0; 907 static pfn_t startpfn; 908 static pgcnt_t lastctgcnt; 909 uintptr_t align; 910 911 CONTIG_LOCK(); 912 913 if (mattr) { 914 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 915 hi = mmu_btop(mattr->dma_attr_addr_hi); 916 if (hi >= physmax) 917 hi = physmax - 1; 918 sgllen = mattr->dma_attr_sgllen; 919 pfnseg = mmu_btop(mattr->dma_attr_seg); 920 921 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 922 if (align > MMU_PAGESIZE) 923 pfnalign = mmu_btop(align); 924 925 /* 926 * in order to satisfy the request, must minimally 927 * acquire minctg contiguous pages 928 */ 929 minctg = howmany(*pgcnt, sgllen); 930 931 ASSERT(hi >= lo); 932 933 /* 934 * start from where last searched if the minctg >= lastctgcnt 935 */ 936 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 937 startpfn = lo; 938 } else { 939 hi = physmax - 1; 940 lo = 0; 941 sgllen = 1; 942 pfnseg = mmu.highest_pfn; 943 minctg = *pgcnt; 944 945 if (minctg < lastctgcnt) 946 startpfn = lo; 947 } 948 lastctgcnt = minctg; 949 950 ASSERT(pfnseg + 1 >= (uint64_t)minctg); 951 952 /* conserve 16m memory - start search above 16m when possible */ 953 if (hi > PFN_16M && startpfn < PFN_16M) 954 startpfn = PFN_16M; 955 956 pfn = startpfn; 957 if (pfnalign) 958 pfn = P2ROUNDUP(pfn, pfnalign); 959 960 while (pfn + minctg - 1 <= hi) { 961 962 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 963 if (plist) { 964 page_list_concat(&pplist, &plist); 965 sgllen--; 966 /* 967 * return when contig pages no longer needed 968 */ 969 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 970 startpfn = pfn; 971 CONTIG_UNLOCK(); 972 check_dma(mattr, pplist, *pgcnt); 973 return (pplist); 974 } 975 minctg = howmany(*pgcnt, sgllen); 976 } 977 if (pfnalign) 978 pfn = P2ROUNDUP(pfn, pfnalign); 979 } 980 981 /* cannot find contig pages in specified range */ 982 if (startpfn == lo) { 983 CONTIG_UNLOCK(); 984 return (NULL); 985 } 986 987 /* did not start with lo previously */ 988 pfn = lo; 989 if (pfnalign) 990 pfn = P2ROUNDUP(pfn, pfnalign); 991 992 /* allow search to go above startpfn */ 993 
while (pfn < startpfn) { 994 995 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 996 if (plist != NULL) { 997 998 page_list_concat(&pplist, &plist); 999 sgllen--; 1000 1001 /* 1002 * return when contig pages no longer needed 1003 */ 1004 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1005 startpfn = pfn; 1006 CONTIG_UNLOCK(); 1007 check_dma(mattr, pplist, *pgcnt); 1008 return (pplist); 1009 } 1010 minctg = howmany(*pgcnt, sgllen); 1011 } 1012 if (pfnalign) 1013 pfn = P2ROUNDUP(pfn, pfnalign); 1014 } 1015 CONTIG_UNLOCK(); 1016 return (NULL); 1017 } 1018 1019 /* 1020 * combine mem_node_config and memrange memory ranges into one data 1021 * structure to be used for page list management. 1022 * 1023 * mnode_range_cnt() calculates the number of memory ranges for mnode and 1024 * memranges[]. Used to determine the size of page lists and mnoderanges. 1025 * 1026 * mnode_range_setup() initializes mnoderanges. 1027 */ 1028 mnoderange_t *mnoderanges; 1029 int mnoderangecnt; 1030 int mtype4g; 1031 1032 int 1033 mnode_range_cnt(int mnode) 1034 { 1035 int mri; 1036 int mnrcnt = 0; 1037 1038 if (mem_node_config[mnode].exists != 0) { 1039 mri = nranges - 1; 1040 1041 /* find the memranges index below contained in mnode range */ 1042 1043 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1044 mri--; 1045 1046 /* 1047 * increment mnode range counter when memranges or mnode 1048 * boundary is reached. 1049 */ 1050 while (mri >= 0 && 1051 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1052 mnrcnt++; 1053 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1054 mri--; 1055 else 1056 break; 1057 } 1058 } 1059 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 1060 return (mnrcnt); 1061 } 1062 1063 void 1064 mnode_range_setup(mnoderange_t *mnoderanges) 1065 { 1066 int mnode, mri; 1067 1068 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 1069 if (mem_node_config[mnode].exists == 0) 1070 continue; 1071 1072 mri = nranges - 1; 1073 1074 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1075 mri--; 1076 1077 while (mri >= 0 && mem_node_config[mnode].physmax >= 1078 MEMRANGELO(mri)) { 1079 mnoderanges->mnr_pfnlo = 1080 MAX(MEMRANGELO(mri), 1081 mem_node_config[mnode].physbase); 1082 mnoderanges->mnr_pfnhi = 1083 MIN(MEMRANGEHI(mri), 1084 mem_node_config[mnode].physmax); 1085 mnoderanges->mnr_mnode = mnode; 1086 mnoderanges->mnr_memrange = mri; 1087 mnoderanges++; 1088 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1089 mri--; 1090 else 1091 break; 1092 } 1093 } 1094 } 1095 1096 /* 1097 * Determine if the mnode range specified in mtype contains memory belonging 1098 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1099 * the range of indices from high pfn to 0, 16m or 4g. 1100 * 1101 * Return first mnode range type index found otherwise return -1 if none found. 
1102 */ 1103 int 1104 mtype_func(int mnode, int mtype, uint_t flags) 1105 { 1106 if (flags & PGI_MT_RANGE) { 1107 int mtlim; 1108 1109 if (flags & PGI_MT_NEXT) 1110 mtype--; 1111 if (flags & PGI_MT_RANGE0) 1112 mtlim = 0; 1113 else if (flags & PGI_MT_RANGE4G) 1114 mtlim = mtype4g + 1; /* exclude 0-4g range */ 1115 else if (flags & PGI_MT_RANGE16M) 1116 mtlim = 1; /* exclude 0-16m range */ 1117 while (mtype >= mtlim) { 1118 if (mnoderanges[mtype].mnr_mnode == mnode) 1119 return (mtype); 1120 mtype--; 1121 } 1122 } else { 1123 if (mnoderanges[mtype].mnr_mnode == mnode) 1124 return (mtype); 1125 } 1126 return (-1); 1127 } 1128 1129 /* 1130 * Update the page list max counts with the pfn range specified by the 1131 * input parameters. Called from add_physmem() when physical memory with 1132 * page_t's are initially added to the page lists. 1133 */ 1134 void 1135 mtype_modify_max(pfn_t startpfn, long cnt) 1136 { 1137 int mtype = 0; 1138 pfn_t endpfn = startpfn + cnt, pfn; 1139 pgcnt_t inc; 1140 1141 ASSERT(cnt > 0); 1142 1143 for (pfn = startpfn; pfn < endpfn; ) { 1144 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1145 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1146 inc = endpfn - pfn; 1147 } else { 1148 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1149 } 1150 mnoderanges[mtype].mnr_mt_pgmax += inc; 1151 if (physmax4g && mtype <= mtype4g) 1152 maxmem4g += inc; 1153 pfn += inc; 1154 } 1155 mtype++; 1156 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1157 } 1158 } 1159 1160 /* 1161 * Returns the free page count for mnode 1162 */ 1163 int 1164 mnode_pgcnt(int mnode) 1165 { 1166 int mtype = mnoderangecnt - 1; 1167 int flags = PGI_MT_RANGE0; 1168 pgcnt_t pgcnt = 0; 1169 1170 mtype = mtype_func(mnode, mtype, flags); 1171 1172 while (mtype != -1) { 1173 pgcnt += MTYPE_FREEMEM(mtype); 1174 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1175 } 1176 return (pgcnt); 1177 } 1178 1179 /* 1180 * Initialize page coloring variables based on the l2 cache parameters. 1181 * Calculate and return memory needed for page coloring data structures. 1182 */ 1183 size_t 1184 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 1185 { 1186 size_t colorsz = 0; 1187 int i; 1188 int colors; 1189 1190 /* 1191 * Reduce the memory ranges lists if we don't have large amounts 1192 * of memory. This avoids searching known empty free lists. 1193 */ 1194 i = memrange_num(physmax); 1195 memranges += i; 1196 nranges -= i; 1197 #if defined(__i386) 1198 if (i > 0) 1199 restricted_kmemalloc = 0; 1200 #endif 1201 /* physmax greater than 4g */ 1202 if (i == 0) 1203 physmax4g = 1; 1204 1205 ASSERT(ISP2(l2_sz)); 1206 ASSERT(ISP2(l2_linesz)); 1207 ASSERT(l2_sz > MMU_PAGESIZE); 1208 1209 /* l2_assoc is 0 for fully associative l2 cache */ 1210 if (l2_assoc) 1211 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 1212 else 1213 l2_colors = 1; 1214 1215 /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 1216 page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 1217 1218 /* 1219 * cpu_page_colors is non-zero when a page color may be spread across 1220 * multiple bins. 
1221 */ 1222 if (l2_colors < page_colors) 1223 cpu_page_colors = l2_colors; 1224 1225 ASSERT(ISP2(page_colors)); 1226 1227 page_colors_mask = page_colors - 1; 1228 1229 ASSERT(ISP2(CPUSETSIZE())); 1230 page_coloring_shift = lowbit(CPUSETSIZE()); 1231 1232 /* initialize number of colors per page size */ 1233 for (i = 0; i <= mmu.max_page_level; i++) { 1234 hw_page_array[i].hp_size = LEVEL_SIZE(i); 1235 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 1236 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 1237 hw_page_array[i].hp_colors = (page_colors_mask >> 1238 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 1239 + 1; 1240 colorequivszc[i] = 0; 1241 } 1242 1243 /* 1244 * The value of cpu_page_colors determines if additional color bins 1245 * need to be checked for a particular color in the page_get routines. 1246 */ 1247 if (cpu_page_colors != 0) { 1248 1249 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 1250 ASSERT(a > 0); 1251 ASSERT(a < 16); 1252 1253 for (i = 0; i <= mmu.max_page_level; i++) { 1254 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1255 colorequivszc[i] = 0; 1256 continue; 1257 } 1258 while ((colors >> a) == 0) 1259 a--; 1260 ASSERT(a >= 0); 1261 1262 /* higher 4 bits encodes color equiv mask */ 1263 colorequivszc[i] = (a << 4); 1264 } 1265 } 1266 1267 /* size for mnoderanges */ 1268 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 1269 mnoderangecnt += mnode_range_cnt(i); 1270 colorsz = mnoderangecnt * sizeof (mnoderange_t); 1271 1272 /* size for fpc_mutex and cpc_mutex */ 1273 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 1274 1275 /* size of page_freelists */ 1276 colorsz += mnoderangecnt * sizeof (page_t ***); 1277 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 1278 1279 for (i = 0; i < mmu_page_sizes; i++) { 1280 colors = page_get_pagecolors(i); 1281 colorsz += mnoderangecnt * colors * sizeof (page_t *); 1282 } 1283 1284 /* size of page_cachelists */ 1285 colorsz += mnoderangecnt * sizeof (page_t **); 1286 colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 1287 1288 return (colorsz); 1289 } 1290 1291 /* 1292 * Called once at startup to configure page_coloring data structures and 1293 * does the 1st page_free()/page_freelist_add(). 
1294 */ 1295 void 1296 page_coloring_setup(caddr_t pcmemaddr) 1297 { 1298 int i; 1299 int j; 1300 int k; 1301 caddr_t addr; 1302 int colors; 1303 1304 /* 1305 * do page coloring setup 1306 */ 1307 addr = pcmemaddr; 1308 1309 mnoderanges = (mnoderange_t *)addr; 1310 addr += (mnoderangecnt * sizeof (mnoderange_t)); 1311 1312 mnode_range_setup(mnoderanges); 1313 1314 if (physmax4g) 1315 mtype4g = pfn_2_mtype(0xfffff); 1316 1317 for (k = 0; k < NPC_MUTEX; k++) { 1318 fpc_mutex[k] = (kmutex_t *)addr; 1319 addr += (max_mem_nodes * sizeof (kmutex_t)); 1320 } 1321 for (k = 0; k < NPC_MUTEX; k++) { 1322 cpc_mutex[k] = (kmutex_t *)addr; 1323 addr += (max_mem_nodes * sizeof (kmutex_t)); 1324 } 1325 page_freelists = (page_t ****)addr; 1326 addr += (mnoderangecnt * sizeof (page_t ***)); 1327 1328 page_cachelists = (page_t ***)addr; 1329 addr += (mnoderangecnt * sizeof (page_t **)); 1330 1331 for (i = 0; i < mnoderangecnt; i++) { 1332 page_freelists[i] = (page_t ***)addr; 1333 addr += (mmu_page_sizes * sizeof (page_t **)); 1334 1335 for (j = 0; j < mmu_page_sizes; j++) { 1336 colors = page_get_pagecolors(j); 1337 page_freelists[i][j] = (page_t **)addr; 1338 addr += (colors * sizeof (page_t *)); 1339 } 1340 page_cachelists[i] = (page_t **)addr; 1341 addr += (page_colors * sizeof (page_t *)); 1342 } 1343 } 1344 1345 /*ARGSUSED*/ 1346 int 1347 bp_color(struct buf *bp) 1348 { 1349 return (0); 1350 } 1351 1352 /* 1353 * get a page from any list with the given mnode 1354 */ 1355 page_t * 1356 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 1357 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 1358 { 1359 kmutex_t *pcm; 1360 int i; 1361 page_t *pp; 1362 page_t *first_pp; 1363 uint64_t pgaddr; 1364 ulong_t bin; 1365 int mtypestart; 1366 int plw_initialized; 1367 page_list_walker_t plw; 1368 1369 VM_STAT_ADD(pga_vmstats.pgma_alloc); 1370 1371 ASSERT((flags & PG_MATCH_COLOR) == 0); 1372 ASSERT(szc == 0); 1373 ASSERT(dma_attr != NULL); 1374 1375 MTYPE_START(mnode, mtype, flags); 1376 if (mtype < 0) { 1377 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 1378 return (NULL); 1379 } 1380 1381 mtypestart = mtype; 1382 1383 bin = origbin; 1384 1385 /* 1386 * check up to page_colors + 1 bins - origbin may be checked twice 1387 * because of BIN_STEP skip 1388 */ 1389 do { 1390 plw_initialized = 0; 1391 1392 for (plw.plw_count = 0; 1393 plw.plw_count < page_colors; plw.plw_count++) { 1394 1395 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 1396 goto nextfreebin; 1397 1398 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1399 mutex_enter(pcm); 1400 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 1401 first_pp = pp; 1402 while (pp != NULL) { 1403 if (page_trylock(pp, SE_EXCL) == 0) { 1404 pp = pp->p_next; 1405 if (pp == first_pp) { 1406 pp = NULL; 1407 } 1408 continue; 1409 } 1410 1411 ASSERT(PP_ISFREE(pp)); 1412 ASSERT(PP_ISAGED(pp)); 1413 ASSERT(pp->p_vnode == NULL); 1414 ASSERT(pp->p_hash == NULL); 1415 ASSERT(pp->p_offset == (u_offset_t)-1); 1416 ASSERT(pp->p_szc == szc); 1417 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 1418 /* check if page within DMA attributes */ 1419 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 1420 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 1421 (pgaddr + MMU_PAGESIZE - 1 <= 1422 dma_attr->dma_attr_addr_hi)) { 1423 break; 1424 } 1425 1426 /* continue looking */ 1427 page_unlock(pp); 1428 pp = pp->p_next; 1429 if (pp == first_pp) 1430 pp = NULL; 1431 1432 } 1433 if (pp != NULL) { 1434 ASSERT(mtype == PP_2_MTYPE(pp)); 1435 ASSERT(pp->p_szc == 0); 1436 1437 /* found a page with 
specified DMA attributes */ 1438 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 1439 mtype), pp); 1440 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1441 1442 if ((PP_ISFREE(pp) == 0) || 1443 (PP_ISAGED(pp) == 0)) { 1444 cmn_err(CE_PANIC, "page %p is not free", 1445 (void *)pp); 1446 } 1447 1448 mutex_exit(pcm); 1449 check_dma(dma_attr, pp, 1); 1450 VM_STAT_ADD(pga_vmstats.pgma_allocok); 1451 return (pp); 1452 } 1453 mutex_exit(pcm); 1454 nextfreebin: 1455 if (plw_initialized == 0) { 1456 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 1457 ASSERT(plw.plw_ceq_dif == page_colors); 1458 plw_initialized = 1; 1459 } 1460 1461 if (plw.plw_do_split) { 1462 pp = page_freelist_split(szc, bin, mnode, 1463 mtype, 1464 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 1465 &plw); 1466 if (pp != NULL) 1467 return (pp); 1468 } 1469 1470 bin = page_list_walk_next_bin(szc, bin, &plw); 1471 } 1472 1473 MTYPE_NEXT(mnode, mtype, flags); 1474 } while (mtype >= 0); 1475 1476 /* failed to find a page in the freelist; try it in the cachelist */ 1477 1478 /* reset mtype start for cachelist search */ 1479 mtype = mtypestart; 1480 ASSERT(mtype >= 0); 1481 1482 /* start with the bin of matching color */ 1483 bin = origbin; 1484 1485 do { 1486 for (i = 0; i <= page_colors; i++) { 1487 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 1488 goto nextcachebin; 1489 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 1490 mutex_enter(pcm); 1491 pp = PAGE_CACHELISTS(mnode, bin, mtype); 1492 first_pp = pp; 1493 while (pp != NULL) { 1494 if (page_trylock(pp, SE_EXCL) == 0) { 1495 pp = pp->p_next; 1496 if (pp == first_pp) 1497 break; 1498 continue; 1499 } 1500 ASSERT(pp->p_vnode); 1501 ASSERT(PP_ISAGED(pp) == 0); 1502 ASSERT(pp->p_szc == 0); 1503 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 1504 1505 /* check if page within DMA attributes */ 1506 1507 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 1508 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 1509 (pgaddr + MMU_PAGESIZE - 1 <= 1510 dma_attr->dma_attr_addr_hi)) { 1511 break; 1512 } 1513 1514 /* continue looking */ 1515 page_unlock(pp); 1516 pp = pp->p_next; 1517 if (pp == first_pp) 1518 pp = NULL; 1519 } 1520 1521 if (pp != NULL) { 1522 ASSERT(mtype == PP_2_MTYPE(pp)); 1523 ASSERT(pp->p_szc == 0); 1524 1525 /* found a page with specified DMA attributes */ 1526 page_sub(&PAGE_CACHELISTS(mnode, bin, 1527 mtype), pp); 1528 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 1529 1530 mutex_exit(pcm); 1531 ASSERT(pp->p_vnode); 1532 ASSERT(PP_ISAGED(pp) == 0); 1533 check_dma(dma_attr, pp, 1); 1534 VM_STAT_ADD(pga_vmstats.pgma_allocok); 1535 return (pp); 1536 } 1537 mutex_exit(pcm); 1538 nextcachebin: 1539 bin += (i == 0) ? BIN_STEP : 1; 1540 bin &= page_colors_mask; 1541 } 1542 MTYPE_NEXT(mnode, mtype, flags); 1543 } while (mtype >= 0); 1544 1545 VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 1546 return (NULL); 1547 } 1548 1549 /* 1550 * This function is similar to page_get_freelist()/page_get_cachelist() 1551 * but it searches both the lists to find a page with the specified 1552 * color (or no color) and DMA attributes. The search is done in the 1553 * freelist first and then in the cache list within the highest memory 1554 * range (based on DMA attributes) before searching in the lower 1555 * memory ranges. 1556 * 1557 * Note: This function is called only by page_create_io(). 
1558 */ 1559 /*ARGSUSED*/ 1560 page_t * 1561 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 1562 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 1563 { 1564 uint_t bin; 1565 int mtype; 1566 page_t *pp; 1567 int n; 1568 int m; 1569 int szc; 1570 int fullrange; 1571 int mnode; 1572 int local_failed_stat = 0; 1573 lgrp_mnode_cookie_t lgrp_cookie; 1574 1575 VM_STAT_ADD(pga_vmstats.pga_alloc); 1576 1577 /* only base pagesize currently supported */ 1578 if (size != MMU_PAGESIZE) 1579 return (NULL); 1580 1581 /* 1582 * If we're passed a specific lgroup, we use it. Otherwise, 1583 * assume first-touch placement is desired. 1584 */ 1585 if (!LGRP_EXISTS(lgrp)) 1586 lgrp = lgrp_home_lgrp(); 1587 1588 /* LINTED */ 1589 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 1590 1591 /* 1592 * Only hold one freelist or cachelist lock at a time, that way we 1593 * can start anywhere and not have to worry about lock 1594 * ordering. 1595 */ 1596 if (dma_attr == NULL) { 1597 n = 0; 1598 m = mnoderangecnt - 1; 1599 fullrange = 1; 1600 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 1601 } else { 1602 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 1603 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 1604 1605 /* 1606 * We can guarantee alignment only for page boundary. 1607 */ 1608 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 1609 return (NULL); 1610 1611 n = pfn_2_mtype(pfnlo); 1612 m = pfn_2_mtype(pfnhi); 1613 1614 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 1615 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 1616 } 1617 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 1618 1619 if (n > m) 1620 return (NULL); 1621 1622 szc = 0; 1623 1624 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 1625 if (n == 0) { 1626 flags |= PGI_MT_RANGE0; 1627 n = m; 1628 } 1629 1630 /* 1631 * Try local memory node first, but try remote if we can't 1632 * get a page of the right color. 1633 */ 1634 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 1635 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 1636 /* 1637 * allocate pages from high pfn to low. 1638 */ 1639 for (mtype = m; mtype >= n; mtype--) { 1640 if (fullrange != 0) { 1641 pp = page_get_mnode_freelist(mnode, 1642 bin, mtype, szc, flags); 1643 if (pp == NULL) { 1644 pp = page_get_mnode_cachelist( 1645 bin, flags, mnode, mtype); 1646 } 1647 } else { 1648 pp = page_get_mnode_anylist(bin, szc, 1649 flags, mnode, mtype, dma_attr); 1650 } 1651 if (pp != NULL) { 1652 VM_STAT_ADD(pga_vmstats.pga_allocok); 1653 check_dma(dma_attr, pp, 1); 1654 return (pp); 1655 } 1656 } 1657 if (!local_failed_stat) { 1658 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 1659 local_failed_stat = 1; 1660 } 1661 } 1662 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 1663 1664 return (NULL); 1665 } 1666 1667 /* 1668 * page_create_io() 1669 * 1670 * This function is a copy of page_create_va() with an additional 1671 * argument 'mattr' that specifies DMA memory requirements to 1672 * the page list functions. This function is used by the segkmem 1673 * allocator so it is only to create new pages (i.e PG_EXCL is 1674 * set). 1675 * 1676 * Note: This interface is currently used by x86 PSM only and is 1677 * not fully specified so the commitment level is only for 1678 * private interface specific to x86. This interface uses PSM 1679 * specific page_get_anylist() interface. 
1680 */ 1681 1682 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 1683 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 1684 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 1685 break; \ 1686 } \ 1687 } 1688 1689 1690 page_t * 1691 page_create_io( 1692 struct vnode *vp, 1693 u_offset_t off, 1694 uint_t bytes, 1695 uint_t flags, 1696 struct as *as, 1697 caddr_t vaddr, 1698 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 1699 { 1700 page_t *plist = NULL; 1701 uint_t plist_len = 0; 1702 pgcnt_t npages; 1703 page_t *npp = NULL; 1704 uint_t pages_req; 1705 page_t *pp; 1706 kmutex_t *phm = NULL; 1707 uint_t index; 1708 1709 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 1710 "page_create_start:vp %p off %llx bytes %u flags %x", 1711 vp, off, bytes, flags); 1712 1713 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 1714 1715 pages_req = npages = mmu_btopr(bytes); 1716 1717 /* 1718 * Do the freemem and pcf accounting. 1719 */ 1720 if (!page_create_wait(npages, flags)) { 1721 return (NULL); 1722 } 1723 1724 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 1725 "page_create_success:vp %p off %llx", 1726 vp, off); 1727 1728 /* 1729 * If satisfying this request has left us with too little 1730 * memory, start the wheels turning to get some back. The 1731 * first clause of the test prevents waking up the pageout 1732 * daemon in situations where it would decide that there's 1733 * nothing to do. 1734 */ 1735 if (nscan < desscan && freemem < minfree) { 1736 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 1737 "pageout_cv_signal:freemem %ld", freemem); 1738 cv_signal(&proc_pageout->p_cv); 1739 } 1740 1741 if (flags & PG_PHYSCONTIG) { 1742 1743 plist = page_get_contigpage(&npages, mattr, 1); 1744 if (plist == NULL) { 1745 page_create_putback(npages); 1746 return (NULL); 1747 } 1748 1749 pp = plist; 1750 1751 do { 1752 if (!page_hashin(pp, vp, off, NULL)) { 1753 panic("pg_creat_io: hashin failed %p %p %llx", 1754 (void *)pp, (void *)vp, off); 1755 } 1756 VM_STAT_ADD(page_create_new); 1757 off += MMU_PAGESIZE; 1758 PP_CLRFREE(pp); 1759 PP_CLRAGED(pp); 1760 page_set_props(pp, P_REF); 1761 pp = pp->p_next; 1762 } while (pp != plist); 1763 1764 if (!npages) { 1765 check_dma(mattr, plist, pages_req); 1766 return (plist); 1767 } else { 1768 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 1769 } 1770 1771 /* 1772 * fall-thru: 1773 * 1774 * page_get_contigpage returns when npages <= sgllen. 1775 * Grab the rest of the non-contig pages below from anylist. 1776 */ 1777 } 1778 1779 /* 1780 * Loop around collecting the requested number of pages. 1781 * Most of the time, we have to `create' a new page. With 1782 * this in mind, pull the page off the free list before 1783 * getting the hash lock. This will minimize the hash 1784 * lock hold time, nesting, and the like. If it turns 1785 * out we don't need the page, we put it back at the end. 1786 */ 1787 while (npages--) { 1788 phm = NULL; 1789 1790 index = PAGE_HASH_FUNC(vp, off); 1791 top: 1792 ASSERT(phm == NULL); 1793 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 1794 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1795 1796 if (npp == NULL) { 1797 /* 1798 * Try to get the page of any color either from 1799 * the freelist or from the cache list. 1800 */ 1801 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 1802 flags & ~PG_MATCH_COLOR, mattr, NULL); 1803 if (npp == NULL) { 1804 if (mattr == NULL) { 1805 /* 1806 * Not looking for a special page; 1807 * panic! 
1808 */ 1809 panic("no page found %d", (int)npages); 1810 } 1811 /* 1812 * No page found! This can happen 1813 * if we are looking for a page 1814 * within a specific memory range 1815 * for DMA purposes. If PG_WAIT is 1816 * specified then we wait for a 1817 * while and then try again. The 1818 * wait could be forever if we 1819 * don't get the page(s) we need. 1820 * 1821 * Note: XXX We really need a mechanism 1822 * to wait for pages in the desired 1823 * range. For now, we wait for any 1824 * pages and see if we can use it. 1825 */ 1826 1827 if ((mattr != NULL) && (flags & PG_WAIT)) { 1828 delay(10); 1829 goto top; 1830 } 1831 goto fail; /* undo accounting stuff */ 1832 } 1833 1834 if (PP_ISAGED(npp) == 0) { 1835 /* 1836 * Since this page came from the 1837 * cachelist, we must destroy the 1838 * old vnode association. 1839 */ 1840 page_hashout(npp, (kmutex_t *)NULL); 1841 } 1842 } 1843 1844 /* 1845 * We own this page! 1846 */ 1847 ASSERT(PAGE_EXCL(npp)); 1848 ASSERT(npp->p_vnode == NULL); 1849 ASSERT(!hat_page_is_mapped(npp)); 1850 PP_CLRFREE(npp); 1851 PP_CLRAGED(npp); 1852 1853 /* 1854 * Here we have a page in our hot little mits and are 1855 * just waiting to stuff it on the appropriate lists. 1856 * Get the mutex and check to see if it really does 1857 * not exist. 1858 */ 1859 phm = PAGE_HASH_MUTEX(index); 1860 mutex_enter(phm); 1861 PAGE_HASH_SEARCH(index, pp, vp, off); 1862 if (pp == NULL) { 1863 VM_STAT_ADD(page_create_new); 1864 pp = npp; 1865 npp = NULL; 1866 if (!page_hashin(pp, vp, off, phm)) { 1867 /* 1868 * Since we hold the page hash mutex and 1869 * just searched for this page, page_hashin 1870 * had better not fail. If it does, that 1871 * means somethread did not follow the 1872 * page hash mutex rules. Panic now and 1873 * get it over with. As usual, go down 1874 * holding all the locks. 1875 */ 1876 ASSERT(MUTEX_HELD(phm)); 1877 panic("page_create: hashin fail %p %p %llx %p", 1878 (void *)pp, (void *)vp, off, (void *)phm); 1879 1880 } 1881 ASSERT(MUTEX_HELD(phm)); 1882 mutex_exit(phm); 1883 phm = NULL; 1884 1885 /* 1886 * Hat layer locking need not be done to set 1887 * the following bits since the page is not hashed 1888 * and was on the free list (i.e., had no mappings). 1889 * 1890 * Set the reference bit to protect 1891 * against immediate pageout 1892 * 1893 * XXXmh modify freelist code to set reference 1894 * bit so we don't have to do it here. 1895 */ 1896 page_set_props(pp, P_REF); 1897 } else { 1898 ASSERT(MUTEX_HELD(phm)); 1899 mutex_exit(phm); 1900 phm = NULL; 1901 /* 1902 * NOTE: This should not happen for pages associated 1903 * with kernel vnode 'kvp'. 1904 */ 1905 /* XX64 - to debug why this happens! */ 1906 ASSERT(!VN_ISKAS(vp)); 1907 if (VN_ISKAS(vp)) 1908 cmn_err(CE_NOTE, 1909 "page_create: page not expected " 1910 "in hash list for kernel vnode - pp 0x%p", 1911 (void *)pp); 1912 VM_STAT_ADD(page_create_exists); 1913 goto fail; 1914 } 1915 1916 /* 1917 * Got a page! It is locked. Acquire the i/o 1918 * lock since we are going to use the p_next and 1919 * p_prev fields to link the requested pages together. 1920 */ 1921 page_io_lock(pp); 1922 page_add(&plist, pp); 1923 plist = plist->p_next; 1924 off += MMU_PAGESIZE; 1925 vaddr += MMU_PAGESIZE; 1926 } 1927 1928 check_dma(mattr, plist, pages_req); 1929 return (plist); 1930 1931 fail: 1932 if (npp != NULL) { 1933 /* 1934 * Did not need this page after all. 1935 * Put it back on the free list. 
1936 */ 1937 VM_STAT_ADD(page_create_putbacks); 1938 PP_SETFREE(npp); 1939 PP_SETAGED(npp); 1940 npp->p_offset = (u_offset_t)-1; 1941 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 1942 page_unlock(npp); 1943 } 1944 1945 /* 1946 * Give up the pages we already got. 1947 */ 1948 while (plist != NULL) { 1949 pp = plist; 1950 page_sub(&plist, pp); 1951 page_io_unlock(pp); 1952 plist_len++; 1953 /*LINTED: constant in conditional ctx*/ 1954 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1955 } 1956 1957 /* 1958 * VN_DISPOSE does freemem accounting for the pages in plist 1959 * by calling page_free. So, we need to undo the pcf accounting 1960 * for only the remaining pages. 1961 */ 1962 VM_STAT_ADD(page_create_putbacks); 1963 page_create_putback(pages_req - plist_len); 1964 1965 return (NULL); 1966 } 1967 1968 1969 /* 1970 * Copy the data from the physical page represented by "frompp" to 1971 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 1972 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 1973 * level and no one sleeps with an active mapping there. 1974 * 1975 * Note that the ref/mod bits in the page_t's are not affected by 1976 * this operation, hence it is up to the caller to update them appropriately. 1977 */ 1978 int 1979 ppcopy(page_t *frompp, page_t *topp) 1980 { 1981 caddr_t pp_addr1; 1982 caddr_t pp_addr2; 1983 hat_mempte_t pte1; 1984 hat_mempte_t pte2; 1985 kmutex_t *ppaddr_mutex; 1986 label_t ljb; 1987 int ret = 1; 1988 1989 ASSERT_STACK_ALIGNED(); 1990 ASSERT(PAGE_LOCKED(frompp)); 1991 ASSERT(PAGE_LOCKED(topp)); 1992 1993 if (kpm_enable) { 1994 pp_addr1 = hat_kpm_page2va(frompp, 0); 1995 pp_addr2 = hat_kpm_page2va(topp, 0); 1996 kpreempt_disable(); 1997 } else { 1998 /* 1999 * disable pre-emption so that CPU can't change 2000 */ 2001 kpreempt_disable(); 2002 2003 pp_addr1 = CPU->cpu_caddr1; 2004 pp_addr2 = CPU->cpu_caddr2; 2005 pte1 = CPU->cpu_caddr1pte; 2006 pte2 = CPU->cpu_caddr2pte; 2007 2008 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 2009 mutex_enter(ppaddr_mutex); 2010 2011 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 2012 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 2013 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 2014 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 2015 HAT_LOAD_NOCONSIST); 2016 } 2017 2018 if (on_fault(&ljb)) { 2019 ret = 0; 2020 goto faulted; 2021 } 2022 if (use_sse_pagecopy) 2023 hwblkpagecopy(pp_addr1, pp_addr2); 2024 else 2025 bcopy(pp_addr1, pp_addr2, PAGESIZE); 2026 2027 no_fault(); 2028 faulted: 2029 if (!kpm_enable) { 2030 mutex_exit(ppaddr_mutex); 2031 } 2032 kpreempt_enable(); 2033 return (ret); 2034 } 2035 2036 /* 2037 * Zero the physical page from off to off + len given by `pp' 2038 * without changing the reference and modified bits of page. 2039 * 2040 * We use this using CPU private page address #2, see ppcopy() for more info. 2041 * pagezero() must not be called at interrupt level. 
2042 */ 2043 void 2044 pagezero(page_t *pp, uint_t off, uint_t len) 2045 { 2046 caddr_t pp_addr2; 2047 hat_mempte_t pte2; 2048 kmutex_t *ppaddr_mutex; 2049 2050 ASSERT_STACK_ALIGNED(); 2051 ASSERT(len <= MMU_PAGESIZE); 2052 ASSERT(off <= MMU_PAGESIZE); 2053 ASSERT(off + len <= MMU_PAGESIZE); 2054 ASSERT(PAGE_LOCKED(pp)); 2055 2056 if (kpm_enable) { 2057 pp_addr2 = hat_kpm_page2va(pp, 0); 2058 kpreempt_disable(); 2059 } else { 2060 kpreempt_disable(); 2061 2062 pp_addr2 = CPU->cpu_caddr2; 2063 pte2 = CPU->cpu_caddr2pte; 2064 2065 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 2066 mutex_enter(ppaddr_mutex); 2067 2068 hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2, 2069 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 2070 HAT_LOAD_NOCONSIST); 2071 } 2072 2073 if (use_sse_pagezero) { 2074 hwblkclr(pp_addr2 + off, len); 2075 } else { 2076 bzero(pp_addr2 + off, len); 2077 } 2078 2079 if (!kpm_enable) 2080 mutex_exit(ppaddr_mutex); 2081 kpreempt_enable(); 2082 } 2083 2084 /* 2085 * Platform-dependent page scrub call. 2086 */ 2087 void 2088 pagescrub(page_t *pp, uint_t off, uint_t len) 2089 { 2090 /* 2091 * For now, we rely on the fact that pagezero() will 2092 * always clear UEs. 2093 */ 2094 pagezero(pp, off, len); 2095 } 2096 2097 /* 2098 * set up two private addresses for use on a given CPU for use in ppcopy() 2099 */ 2100 void 2101 setup_vaddr_for_ppcopy(struct cpu *cpup) 2102 { 2103 void *addr; 2104 hat_mempte_t pte_pa; 2105 2106 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 2107 pte_pa = hat_mempte_setup(addr); 2108 cpup->cpu_caddr1 = addr; 2109 cpup->cpu_caddr1pte = pte_pa; 2110 2111 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 2112 pte_pa = hat_mempte_setup(addr); 2113 cpup->cpu_caddr2 = addr; 2114 cpup->cpu_caddr2pte = pte_pa; 2115 2116 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 2117 } 2118 2119 /* 2120 * Undo setup_vaddr_for_ppcopy 2121 */ 2122 void 2123 teardown_vaddr_for_ppcopy(struct cpu *cpup) 2124 { 2125 mutex_destroy(&cpup->cpu_ppaddr_mutex); 2126 2127 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 2128 cpup->cpu_caddr2pte = 0; 2129 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 2130 cpup->cpu_caddr2 = 0; 2131 2132 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 2133 cpup->cpu_caddr1pte = 0; 2134 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 2135 cpup->cpu_caddr1 = 0; 2136 } 2137 2138 /* 2139 * Create the pageout scanner thread. The thread has to 2140 * start at procedure with process pp and priority pri. 2141 */ 2142 void 2143 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) 2144 { 2145 (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); 2146 } 2147 2148 /* 2149 * Function for flushing D-cache when performing module relocations 2150 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 2151 */ 2152 void 2153 dcache_flushall() 2154 {} 2155 2156 size_t 2157 exec_get_spslew(void) 2158 { 2159 return (0); 2160 } 2161 2162 /* 2163 * Allocate a memory page. The argument 'seed' can be any pseudo-random 2164 * number to vary where the pages come from. This is quite a hacked up 2165 * method -- it works for now, but really needs to be fixed up a bit. 2166 * 2167 * We currently use page_create_va() on the kvp with fake offsets, 2168 * segments and virt address. This is pretty bogus, but was copied from the 2169 * old hat_i86.c code. 
 * A better approach would be to specify either mnode
 * random or mnode local and take a page from whatever color has the MOST
 * available - this would have a minimal impact on page coloring.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	/*
	 * This code is gross, we really need a simpler page allocator.
	 *
	 * We need to assign an offset for the page to call page_create_va().
	 * To avoid conflicts with other pages, we get creative with the offset.
	 * For 32 bits, we pick an offset > 4Gig
	 * For 64 bits, pick an offset somewhere in the VA hole.
	 */
	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
	offset += mmu.hole_start;	/* something in VA hole */
#else
	offset += 1ULL << 40;	/* something > 4 Gig */
#endif

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", pp);
#endif

	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL | PG_NORELOC,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
	if (pp == NULL)
		return (NULL);
	page_io_unlock(pp);
	page_hashout(pp, NULL);
	return (pp);
}