/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T */
/*	  All Rights Reserved  */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/* 4g memory management */
pgcnt_t maxmem4g;
pgcnt_t freemem4g;
int physmax4g;
int desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
int lotsfree4gshift = 3;

/*
 * 16m memory management: desired number of free pages below 16m.
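 * (0x380 == 896 pages, i.e. roughly 3.5MB worth of 4k pages, kept free for
 * devices that can only DMA below 16MB, e.g. legacy ISA adapters.)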
 */
pgcnt_t desfree16m = 0x380;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.max_page_level; l > 0; --l) {
			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
				continue;
			} else {
				pgsz = LEVEL_SIZE(l);
			}
			break;
		}

		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
		    p->p_stkpageszc);
		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
			pgsz = hw_page_array[mszc].hp_size;
		}
		return (pgsz);

	/*
	 * for ISM use the 1st large page size.
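	 * (e.g. LEVEL_SIZE(1) is typically 2m with PAE or amd64 page tables
	 * and 4m without PAE.)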
	 */
	case MAPPGSZ_ISM:
		if (mmu.max_page_level == 0)
			return (MMU_PAGESIZE);
		return (LEVEL_SIZE(1));
	}
	return (pgsz);
}

static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_page_sizes - 1; i > 0; i--) {
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		/*
		 * Set szcvec to the remaining page sizes.
		 */
		szcvec = ((1 << (i + 1)) - 1) & ~1;
		break;
	}
	return (szcvec);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map the [addr, addr + len) region.
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t max_lpsize = mcntl0_lpsize;

	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		if (!memcntl)
			max_lpsize = max_utext_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		if (!memcntl)
			max_lpsize = max_uidata_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		if (!memcntl)
			max_lpsize = max_shm_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		if (!memcntl)
			max_lpsize = max_uheap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		if (!memcntl)
			max_lpsize = max_ustack_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else {
		if (!memcntl)
			max_lpsize = max_privmap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));
	}
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
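	 * (zfod: zero-fill-on-demand, i.e. anonymous pages that are created
	 * zeroed the first time they are touched.)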
357 */ 358 base = p->p_brkbase; 359 len = p->p_brksize; 360 361 if (addr < base || addr >= base + len) { /* data seg? */ 362 base = (caddr_t)p->p_usrstack - p->p_stksize; 363 len = p->p_stksize; 364 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */ 365 /* not in either UNIX data or stack segments */ 366 res = FC_NOMAP; 367 goto out; 368 } 369 } 370 371 /* 372 * the rest of this function implements a 3.X 4.X 5.X compatibility 373 * This code is probably not needed anymore 374 */ 375 if (p->p_model == DATAMODEL_ILP32) { 376 377 /* expand the gap to the page boundaries on each side */ 378 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE); 379 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE); 380 len = ea - (uintptr_t)base; 381 382 as_rangelock(as); 383 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) == 384 0) { 385 err = as_map(as, base, len, segvn_create, zfod_argsp); 386 as_rangeunlock(as); 387 if (err) { 388 res = FC_MAKE_ERR(err); 389 goto out; 390 } 391 } else { 392 /* 393 * This page is already mapped by another thread after 394 * we returned from as_fault() above. We just fall 395 * through as_fault() below. 396 */ 397 as_rangeunlock(as); 398 } 399 400 res = as_fault(hat, as, addr, 1, F_INVAL, rw); 401 } 402 403 out: 404 if (mapped_red) 405 segkp_unmap_red(); 406 407 return (res); 408 } 409 410 void 411 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) 412 { 413 struct proc *p = curproc; 414 caddr_t userlimit = (flags & _MAP_LOW32) ? 415 (caddr_t)_userlimit32 : p->p_as->a_userlimit; 416 417 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); 418 } 419 420 /*ARGSUSED*/ 421 int 422 map_addr_vacalign_check(caddr_t addr, u_offset_t off) 423 { 424 return (0); 425 } 426 427 /* 428 * map_addr_proc() is the routine called when the system is to 429 * choose an address for the user. We will pick an address 430 * range which is the highest available below userlimit. 431 * 432 * addrp is a value/result parameter. 433 * On input it is a hint from the user to be used in a completely 434 * machine dependent fashion. We decide to completely ignore this hint. 435 * 436 * On output it is NULL if no address can be found in the current 437 * processes address space or else an address that is currently 438 * not mapped for len bytes with a page of red zone on either side. 439 * 440 * align is not needed on x86 (it's for viturally addressed caches) 441 */ 442 /*ARGSUSED*/ 443 void 444 map_addr_proc( 445 caddr_t *addrp, 446 size_t len, 447 offset_t off, 448 int vacalign, 449 caddr_t userlimit, 450 struct proc *p, 451 uint_t flags) 452 { 453 struct as *as = p->p_as; 454 caddr_t addr; 455 caddr_t base; 456 size_t slen; 457 size_t align_amount; 458 459 ASSERT32(userlimit == as->a_userlimit); 460 461 base = p->p_brkbase; 462 #if defined(__amd64) 463 /* 464 * XX64 Yes, this needs more work. 465 */ 466 if (p->p_model == DATAMODEL_NATIVE) { 467 if (userlimit < as->a_userlimit) { 468 /* 469 * This happens when a program wants to map 470 * something in a range that's accessible to a 471 * program in a smaller address space. For example, 472 * a 64-bit program calling mmap32(2) to guarantee 473 * that the returned address is below 4Gbytes. 474 */ 475 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff)); 476 477 if (userlimit > base) 478 slen = userlimit - base; 479 else { 480 *addrp = NULL; 481 return; 482 } 483 } else { 484 /* 485 * XX64 This layout is probably wrong .. 
but in 486 * the event we make the amd64 address space look 487 * like sparcv9 i.e. with the stack -above- the 488 * heap, this bit of code might even be correct. 489 */ 490 slen = p->p_usrstack - base - 491 (((size_t)rctl_enforced_value( 492 rctlproc_legacy[RLIMIT_STACK], 493 p->p_rctls, p) + PAGEOFFSET) & PAGEMASK); 494 } 495 } else 496 #endif 497 slen = userlimit - base; 498 499 len = (len + PAGEOFFSET) & PAGEMASK; 500 501 /* 502 * Redzone for each side of the request. This is done to leave 503 * one page unmapped between segments. This is not required, but 504 * it's useful for the user because if their program strays across 505 * a segment boundary, it will catch a fault immediately making 506 * debugging a little easier. 507 */ 508 len += 2 * MMU_PAGESIZE; 509 510 /* 511 * figure out what the alignment should be 512 * 513 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same???? 514 */ 515 if (len <= ELF_386_MAXPGSZ) { 516 /* 517 * Align virtual addresses to ensure that ELF shared libraries 518 * are mapped with the appropriate alignment constraints by 519 * the run-time linker. 520 */ 521 align_amount = ELF_386_MAXPGSZ; 522 } else { 523 int l = mmu.max_page_level; 524 525 while (l && len < LEVEL_SIZE(l)) 526 --l; 527 528 align_amount = LEVEL_SIZE(l); 529 } 530 531 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount)) 532 align_amount = (uintptr_t)*addrp; 533 534 len += align_amount; 535 536 /* 537 * Look for a large enough hole starting below userlimit. 538 * After finding it, use the upper part. Addition of PAGESIZE 539 * is for the redzone as described above. 540 */ 541 if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) { 542 caddr_t as_addr; 543 544 addr = base + slen - len + MMU_PAGESIZE; 545 as_addr = addr; 546 /* 547 * Round address DOWN to the alignment amount, 548 * add the offset, and if this address is less 549 * than the original address, add alignment amount. 550 */ 551 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 552 addr += (uintptr_t)(off & (align_amount - 1)); 553 if (addr < as_addr) 554 addr += align_amount; 555 556 ASSERT(addr <= (as_addr + align_amount)); 557 ASSERT(((uintptr_t)addr & (align_amount - 1)) == 558 ((uintptr_t)(off & (align_amount - 1)))); 559 *addrp = addr; 560 } else { 561 *addrp = NULL; /* no more virtual space */ 562 } 563 } 564 565 /* 566 * Determine whether [base, base+len] contains a valid range of 567 * addresses at least minlen long. base and len are adjusted if 568 * required to provide a valid range. 569 */ 570 /*ARGSUSED3*/ 571 int 572 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 573 { 574 uintptr_t hi, lo; 575 576 lo = (uintptr_t)*basep; 577 hi = lo + *lenp; 578 579 /* 580 * If hi rolled over the top, try cutting back. 581 */ 582 if (hi < lo) { 583 if (0 - lo + hi < minlen) 584 return (0); 585 if (0 - lo < minlen) 586 return (0); 587 *lenp = 0 - lo; 588 } else if (hi - lo < minlen) { 589 return (0); 590 } 591 #if defined(__amd64) 592 /* 593 * Deal with a possible hole in the address range between 594 * hole_start and hole_end that should never be mapped. 
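	 * On amd64 this is the non-canonical region where bit 47 of the
	 * address is not sign-extended, roughly the range
	 * [0x0000800000000000, 0xffff800000000000).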
595 */ 596 if (lo < hole_start) { 597 if (hi > hole_start) { 598 if (hi < hole_end) { 599 hi = hole_start; 600 } else { 601 /* lo < hole_start && hi >= hole_end */ 602 if (dir == AH_LO) { 603 /* 604 * prefer lowest range 605 */ 606 if (hole_start - lo >= minlen) 607 hi = hole_start; 608 else if (hi - hole_end >= minlen) 609 lo = hole_end; 610 else 611 return (0); 612 } else { 613 /* 614 * prefer highest range 615 */ 616 if (hi - hole_end >= minlen) 617 lo = hole_end; 618 else if (hole_start - lo >= minlen) 619 hi = hole_start; 620 else 621 return (0); 622 } 623 } 624 } 625 } else { 626 /* lo >= hole_start */ 627 if (hi < hole_end) 628 return (0); 629 if (lo < hole_end) 630 lo = hole_end; 631 } 632 633 if (hi - lo < minlen) 634 return (0); 635 636 *basep = (caddr_t)lo; 637 *lenp = hi - lo; 638 #endif 639 return (1); 640 } 641 642 /* 643 * Determine whether [addr, addr+len] are valid user addresses. 644 */ 645 /*ARGSUSED*/ 646 int 647 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 648 caddr_t userlimit) 649 { 650 caddr_t eaddr = addr + len; 651 652 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 653 return (RANGE_BADADDR); 654 655 #if defined(__amd64) 656 /* 657 * Check for the VA hole 658 */ 659 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 660 return (RANGE_BADADDR); 661 #endif 662 663 return (RANGE_OKAY); 664 } 665 666 /* 667 * Return 1 if the page frame is onboard memory, else 0. 668 */ 669 int 670 pf_is_memory(pfn_t pf) 671 { 672 if (pfn_is_foreign(pf)) 673 return (0); 674 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1)); 675 } 676 677 678 /* 679 * initialized by page_coloring_init(). 680 */ 681 uint_t page_colors; 682 uint_t page_colors_mask; 683 uint_t page_coloring_shift; 684 int cpu_page_colors; 685 static uint_t l2_colors; 686 687 /* 688 * Page freelists and cachelists are dynamically allocated once mnoderangecnt 689 * and page_colors are calculated from the l2 cache n-way set size. Within a 690 * mnode range, the page freelist and cachelist are hashed into bins based on 691 * color. This makes it easier to search for a page within a specific memory 692 * range. 693 */ 694 #define PAGE_COLORS_MIN 16 695 696 page_t ****page_freelists; 697 page_t ***page_cachelists; 698 699 /* 700 * As the PC architecture evolved memory up was clumped into several 701 * ranges for various historical I/O devices to do DMA. 702 * < 16Meg - ISA bus 703 * < 2Gig - ??? 704 * < 4Gig - PCI bus or drivers that don't understand PAE mode 705 */ 706 static pfn_t arch_memranges[NUM_MEM_RANGES] = { 707 0x100000, /* pfn range for 4G and above */ 708 0x80000, /* pfn range for 2G-4G */ 709 0x01000, /* pfn range for 16M-2G */ 710 0x00000, /* pfn range for 0-16M */ 711 }; 712 713 /* 714 * These are changed during startup if the machine has limited memory. 715 */ 716 pfn_t *memranges = &arch_memranges[0]; 717 int nranges = NUM_MEM_RANGES; 718 719 /* 720 * Used by page layer to know about page sizes 721 */ 722 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; 723 724 /* 725 * This can be patched via /etc/system to allow old non-PAE aware device 726 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM. 
727 */ 728 #if defined(__i386) 729 int restricted_kmemalloc = 0; 730 #elif defined(__amd64) 731 int restricted_kmemalloc = 0; 732 #endif 733 734 kmutex_t *fpc_mutex[NPC_MUTEX]; 735 kmutex_t *cpc_mutex[NPC_MUTEX]; 736 737 738 /* 739 * return the memrange containing pfn 740 */ 741 int 742 memrange_num(pfn_t pfn) 743 { 744 int n; 745 746 for (n = 0; n < nranges - 1; ++n) { 747 if (pfn >= memranges[n]) 748 break; 749 } 750 return (n); 751 } 752 753 /* 754 * return the mnoderange containing pfn 755 */ 756 int 757 pfn_2_mtype(pfn_t pfn) 758 { 759 int n; 760 761 for (n = mnoderangecnt - 1; n >= 0; n--) { 762 if (pfn >= mnoderanges[n].mnr_pfnlo) { 763 break; 764 } 765 } 766 return (n); 767 } 768 769 /* 770 * is_contigpage_free: 771 * returns a page list of contiguous pages. It minimally has to return 772 * minctg pages. Caller determines minctg based on the scatter-gather 773 * list length. 774 * 775 * pfnp is set to the next page frame to search on return. 776 */ 777 static page_t * 778 is_contigpage_free( 779 pfn_t *pfnp, 780 pgcnt_t *pgcnt, 781 pgcnt_t minctg, 782 uint64_t pfnseg, 783 int iolock) 784 { 785 int i = 0; 786 pfn_t pfn = *pfnp; 787 page_t *pp; 788 page_t *plist = NULL; 789 790 /* 791 * fail if pfn + minctg crosses a segment boundary. 792 * Adjust for next starting pfn to begin at segment boundary. 793 */ 794 795 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 796 *pfnp = roundup(*pfnp, pfnseg + 1); 797 return (NULL); 798 } 799 800 do { 801 retry: 802 pp = page_numtopp_nolock(pfn + i); 803 if ((pp == NULL) || 804 (page_trylock(pp, SE_EXCL) == 0)) { 805 (*pfnp)++; 806 break; 807 } 808 if (page_pptonum(pp) != pfn + i) { 809 page_unlock(pp); 810 goto retry; 811 } 812 813 if (!(PP_ISFREE(pp))) { 814 page_unlock(pp); 815 (*pfnp)++; 816 break; 817 } 818 819 if (!PP_ISAGED(pp)) { 820 page_list_sub(pp, PG_CACHE_LIST); 821 page_hashout(pp, (kmutex_t *)NULL); 822 } else { 823 page_list_sub(pp, PG_FREE_LIST); 824 } 825 826 if (iolock) 827 page_io_lock(pp); 828 page_list_concat(&plist, &pp); 829 830 /* 831 * exit loop when pgcnt satisfied or segment boundary reached. 832 */ 833 834 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 835 836 *pfnp += i; /* set to next pfn to search */ 837 838 if (i >= minctg) { 839 *pgcnt -= i; 840 return (plist); 841 } 842 843 /* 844 * failure: minctg not satisfied. 845 * 846 * if next request crosses segment boundary, set next pfn 847 * to search from the segment boundary. 
848 */ 849 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 850 *pfnp = roundup(*pfnp, pfnseg + 1); 851 852 /* clean up any pages already allocated */ 853 854 while (plist) { 855 pp = plist; 856 page_sub(&plist, pp); 857 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 858 if (iolock) 859 page_io_unlock(pp); 860 page_unlock(pp); 861 } 862 863 return (NULL); 864 } 865 866 /* 867 * verify that pages being returned from allocator have correct DMA attribute 868 */ 869 #ifndef DEBUG 870 #define check_dma(a, b, c) (0) 871 #else 872 static void 873 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 874 { 875 if (dma_attr == NULL) 876 return; 877 878 while (cnt-- > 0) { 879 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) < 880 dma_attr->dma_attr_addr_lo) 881 panic("PFN (pp=%p) below dma_attr_addr_lo", pp); 882 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >= 883 dma_attr->dma_attr_addr_hi) 884 panic("PFN (pp=%p) above dma_attr_addr_hi", pp); 885 pp = pp->p_next; 886 } 887 } 888 #endif 889 890 static kmutex_t contig_lock; 891 892 #define CONTIG_LOCK() mutex_enter(&contig_lock); 893 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 894 895 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 896 897 static page_t * 898 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 899 { 900 pfn_t pfn; 901 int sgllen; 902 uint64_t pfnseg; 903 pgcnt_t minctg; 904 page_t *pplist = NULL, *plist; 905 uint64_t lo, hi; 906 pgcnt_t pfnalign = 0; 907 static pfn_t startpfn; 908 static pgcnt_t lastctgcnt; 909 uintptr_t align; 910 911 CONTIG_LOCK(); 912 913 if (mattr) { 914 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 915 hi = mmu_btop(mattr->dma_attr_addr_hi); 916 if (hi >= physmax) 917 hi = physmax - 1; 918 sgllen = mattr->dma_attr_sgllen; 919 pfnseg = mmu_btop(mattr->dma_attr_seg); 920 921 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 922 if (align > MMU_PAGESIZE) 923 pfnalign = mmu_btop(align); 924 925 /* 926 * in order to satisfy the request, must minimally 927 * acquire minctg contiguous pages 928 */ 929 minctg = howmany(*pgcnt, sgllen); 930 931 ASSERT(hi >= lo); 932 933 /* 934 * start from where last searched if the minctg >= lastctgcnt 935 */ 936 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 937 startpfn = lo; 938 } else { 939 hi = physmax - 1; 940 lo = 0; 941 sgllen = 1; 942 pfnseg = mmu.highest_pfn; 943 minctg = *pgcnt; 944 945 if (minctg < lastctgcnt) 946 startpfn = lo; 947 } 948 lastctgcnt = minctg; 949 950 ASSERT(pfnseg + 1 >= (uint64_t)minctg); 951 952 /* conserve 16m memory - start search above 16m when possible */ 953 if (hi > PFN_16M && startpfn < PFN_16M) 954 startpfn = PFN_16M; 955 956 pfn = startpfn; 957 if (pfnalign) 958 pfn = P2ROUNDUP(pfn, pfnalign); 959 960 while (pfn + minctg - 1 <= hi) { 961 962 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 963 if (plist) { 964 page_list_concat(&pplist, &plist); 965 sgllen--; 966 /* 967 * return when contig pages no longer needed 968 */ 969 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 970 startpfn = pfn; 971 CONTIG_UNLOCK(); 972 check_dma(mattr, pplist, *pgcnt); 973 return (pplist); 974 } 975 minctg = howmany(*pgcnt, sgllen); 976 } 977 if (pfnalign) 978 pfn = P2ROUNDUP(pfn, pfnalign); 979 } 980 981 /* cannot find contig pages in specified range */ 982 if (startpfn == lo) { 983 CONTIG_UNLOCK(); 984 return (NULL); 985 } 986 987 /* did not start with lo previously */ 988 pfn = lo; 989 if (pfnalign) 990 pfn = P2ROUNDUP(pfn, pfnalign); 991 992 /* allow search to go above startpfn */ 993 
while (pfn < startpfn) { 994 995 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 996 if (plist != NULL) { 997 998 page_list_concat(&pplist, &plist); 999 sgllen--; 1000 1001 /* 1002 * return when contig pages no longer needed 1003 */ 1004 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1005 startpfn = pfn; 1006 CONTIG_UNLOCK(); 1007 check_dma(mattr, pplist, *pgcnt); 1008 return (pplist); 1009 } 1010 minctg = howmany(*pgcnt, sgllen); 1011 } 1012 if (pfnalign) 1013 pfn = P2ROUNDUP(pfn, pfnalign); 1014 } 1015 CONTIG_UNLOCK(); 1016 return (NULL); 1017 } 1018 1019 /* 1020 * combine mem_node_config and memrange memory ranges into one data 1021 * structure to be used for page list management. 1022 * 1023 * mnode_range_cnt() calculates the number of memory ranges for mnode and 1024 * memranges[]. Used to determine the size of page lists and mnoderanges. 1025 * 1026 * mnode_range_setup() initializes mnoderanges. 1027 */ 1028 mnoderange_t *mnoderanges; 1029 int mnoderangecnt; 1030 int mtype4g; 1031 1032 int 1033 mnode_range_cnt(int mnode) 1034 { 1035 int mri; 1036 int mnrcnt = 0; 1037 1038 if (mem_node_config[mnode].exists != 0) { 1039 mri = nranges - 1; 1040 1041 /* find the memranges index below contained in mnode range */ 1042 1043 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1044 mri--; 1045 1046 /* 1047 * increment mnode range counter when memranges or mnode 1048 * boundary is reached. 1049 */ 1050 while (mri >= 0 && 1051 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1052 mnrcnt++; 1053 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1054 mri--; 1055 else 1056 break; 1057 } 1058 } 1059 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 1060 return (mnrcnt); 1061 } 1062 1063 void 1064 mnode_range_setup(mnoderange_t *mnoderanges) 1065 { 1066 int mnode, mri; 1067 1068 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 1069 if (mem_node_config[mnode].exists == 0) 1070 continue; 1071 1072 mri = nranges - 1; 1073 1074 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1075 mri--; 1076 1077 while (mri >= 0 && mem_node_config[mnode].physmax >= 1078 MEMRANGELO(mri)) { 1079 mnoderanges->mnr_pfnlo = 1080 MAX(MEMRANGELO(mri), 1081 mem_node_config[mnode].physbase); 1082 mnoderanges->mnr_pfnhi = 1083 MIN(MEMRANGEHI(mri), 1084 mem_node_config[mnode].physmax); 1085 mnoderanges->mnr_mnode = mnode; 1086 mnoderanges->mnr_memrange = mri; 1087 mnoderanges++; 1088 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1089 mri--; 1090 else 1091 break; 1092 } 1093 } 1094 } 1095 1096 /* 1097 * Determine if the mnode range specified in mtype contains memory belonging 1098 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1099 * the range of indices from high pfn to 0, 16m or 4g. 1100 * 1101 * Return first mnode range type index found otherwise return -1 if none found. 
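 * For example, PGI_MT_RANGE16M walks mtype downward but stops short of the
 * 0-16m range, and PGI_MT_RANGE4G stops above the 4g boundary, so that low
 * memory is preserved for requests that actually need it.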
1102 */ 1103 int 1104 mtype_func(int mnode, int mtype, uint_t flags) 1105 { 1106 if (flags & PGI_MT_RANGE) { 1107 int mtlim; 1108 1109 if (flags & PGI_MT_NEXT) 1110 mtype--; 1111 if (flags & PGI_MT_RANGE0) 1112 mtlim = 0; 1113 else if (flags & PGI_MT_RANGE4G) 1114 mtlim = mtype4g + 1; /* exclude 0-4g range */ 1115 else if (flags & PGI_MT_RANGE16M) 1116 mtlim = 1; /* exclude 0-16m range */ 1117 while (mtype >= mtlim) { 1118 if (mnoderanges[mtype].mnr_mnode == mnode) 1119 return (mtype); 1120 mtype--; 1121 } 1122 } else { 1123 if (mnoderanges[mtype].mnr_mnode == mnode) 1124 return (mtype); 1125 } 1126 return (-1); 1127 } 1128 1129 /* 1130 * Update the page list max counts with the pfn range specified by the 1131 * input parameters. Called from add_physmem() when physical memory with 1132 * page_t's are initially added to the page lists. 1133 */ 1134 void 1135 mtype_modify_max(pfn_t startpfn, long cnt) 1136 { 1137 int mtype = 0; 1138 pfn_t endpfn = startpfn + cnt, pfn; 1139 pgcnt_t inc; 1140 1141 ASSERT(cnt > 0); 1142 1143 for (pfn = startpfn; pfn < endpfn; ) { 1144 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1145 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1146 inc = endpfn - pfn; 1147 } else { 1148 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1149 } 1150 mnoderanges[mtype].mnr_mt_pgmax += inc; 1151 if (physmax4g && mtype <= mtype4g) 1152 maxmem4g += inc; 1153 pfn += inc; 1154 } 1155 mtype++; 1156 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1157 } 1158 } 1159 1160 /* 1161 * Returns the free page count for mnode 1162 */ 1163 int 1164 mnode_pgcnt(int mnode) 1165 { 1166 int mtype = mnoderangecnt - 1; 1167 int flags = PGI_MT_RANGE0; 1168 pgcnt_t pgcnt = 0; 1169 1170 mtype = mtype_func(mnode, mtype, flags); 1171 1172 while (mtype != -1) { 1173 pgcnt += MTYPE_FREEMEM(mtype); 1174 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1175 } 1176 return (pgcnt); 1177 } 1178 1179 /* 1180 * Initialize page coloring variables based on the l2 cache parameters. 1181 * Calculate and return memory needed for page coloring data structures. 1182 */ 1183 size_t 1184 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 1185 { 1186 size_t colorsz = 0; 1187 int i; 1188 int colors; 1189 1190 /* 1191 * Reduce the memory ranges lists if we don't have large amounts 1192 * of memory. This avoids searching known empty free lists. 1193 */ 1194 i = memrange_num(physmax); 1195 memranges += i; 1196 nranges -= i; 1197 #if defined(__i386) 1198 if (i > 0) 1199 restricted_kmemalloc = 0; 1200 #endif 1201 /* physmax greater than 4g */ 1202 if (i == 0) 1203 physmax4g = 1; 1204 1205 ASSERT(ISP2(l2_sz)); 1206 ASSERT(ISP2(l2_linesz)); 1207 ASSERT(l2_sz > MMU_PAGESIZE); 1208 1209 /* l2_assoc is 0 for fully associative l2 cache */ 1210 if (l2_assoc) 1211 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 1212 else 1213 l2_colors = 1; 1214 1215 /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 1216 page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 1217 1218 /* 1219 * cpu_page_colors is non-zero when a page color may be spread across 1220 * multiple bins. 
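	 * For example, a 256k 8-way L2 with 4k pages gives l2_colors =
	 * 256k / (8 * 4k) = 8; page_colors is then raised to PAGE_COLORS_MIN
	 * (16) and cpu_page_colors becomes 8, so each hardware color is
	 * spread across two bins.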
1221 */ 1222 if (l2_colors < page_colors) 1223 cpu_page_colors = l2_colors; 1224 1225 ASSERT(ISP2(page_colors)); 1226 1227 page_colors_mask = page_colors - 1; 1228 1229 ASSERT(ISP2(CPUSETSIZE())); 1230 page_coloring_shift = lowbit(CPUSETSIZE()); 1231 1232 /* initialize number of colors per page size */ 1233 for (i = 0; i <= mmu.max_page_level; i++) { 1234 hw_page_array[i].hp_size = LEVEL_SIZE(i); 1235 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 1236 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 1237 hw_page_array[i].hp_colors = (page_colors_mask >> 1238 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 1239 + 1; 1240 } 1241 1242 /* 1243 * The value of cpu_page_colors determines if additional color bins 1244 * need to be checked for a particular color in the page_get routines. 1245 */ 1246 if (cpu_page_colors != 0) { 1247 1248 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 1249 ASSERT(a > 0); 1250 ASSERT(a < 16); 1251 1252 for (i = 0; i <= mmu.max_page_level; i++) { 1253 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1254 colorequivszc[i] = 0; 1255 continue; 1256 } 1257 while ((colors >> a) == 0) 1258 a--; 1259 ASSERT(a >= 0); 1260 1261 /* higher 4 bits encodes color equiv mask */ 1262 colorequivszc[i] = (a << 4); 1263 } 1264 } 1265 1266 /* factor in colorequiv to check additional 'equivalent' bins. */ 1267 if (colorequiv > 1) { 1268 1269 int a = lowbit(colorequiv) - 1; 1270 if (a > 15) 1271 a = 15; 1272 1273 for (i = 0; i <= mmu.max_page_level; i++) { 1274 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1275 continue; 1276 } 1277 while ((colors >> a) == 0) 1278 a--; 1279 if ((a << 4) > colorequivszc[i]) { 1280 colorequivszc[i] = (a << 4); 1281 } 1282 } 1283 } 1284 1285 /* size for mnoderanges */ 1286 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 1287 mnoderangecnt += mnode_range_cnt(i); 1288 colorsz = mnoderangecnt * sizeof (mnoderange_t); 1289 1290 /* size for fpc_mutex and cpc_mutex */ 1291 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 1292 1293 /* size of page_freelists */ 1294 colorsz += mnoderangecnt * sizeof (page_t ***); 1295 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 1296 1297 for (i = 0; i < mmu_page_sizes; i++) { 1298 colors = page_get_pagecolors(i); 1299 colorsz += mnoderangecnt * colors * sizeof (page_t *); 1300 } 1301 1302 /* size of page_cachelists */ 1303 colorsz += mnoderangecnt * sizeof (page_t **); 1304 colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 1305 1306 return (colorsz); 1307 } 1308 1309 /* 1310 * Called once at startup to configure page_coloring data structures and 1311 * does the 1st page_free()/page_freelist_add(). 
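 * pcmemaddr points at the single contiguous buffer sized by
 * page_coloring_init(); the mnoderange array, the fpc_mutex and cpc_mutex
 * arrays, and the freelist/cachelist head arrays are carved out of it, in
 * that order.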
1312 */ 1313 void 1314 page_coloring_setup(caddr_t pcmemaddr) 1315 { 1316 int i; 1317 int j; 1318 int k; 1319 caddr_t addr; 1320 int colors; 1321 1322 /* 1323 * do page coloring setup 1324 */ 1325 addr = pcmemaddr; 1326 1327 mnoderanges = (mnoderange_t *)addr; 1328 addr += (mnoderangecnt * sizeof (mnoderange_t)); 1329 1330 mnode_range_setup(mnoderanges); 1331 1332 if (physmax4g) 1333 mtype4g = pfn_2_mtype(0xfffff); 1334 1335 for (k = 0; k < NPC_MUTEX; k++) { 1336 fpc_mutex[k] = (kmutex_t *)addr; 1337 addr += (max_mem_nodes * sizeof (kmutex_t)); 1338 } 1339 for (k = 0; k < NPC_MUTEX; k++) { 1340 cpc_mutex[k] = (kmutex_t *)addr; 1341 addr += (max_mem_nodes * sizeof (kmutex_t)); 1342 } 1343 page_freelists = (page_t ****)addr; 1344 addr += (mnoderangecnt * sizeof (page_t ***)); 1345 1346 page_cachelists = (page_t ***)addr; 1347 addr += (mnoderangecnt * sizeof (page_t **)); 1348 1349 for (i = 0; i < mnoderangecnt; i++) { 1350 page_freelists[i] = (page_t ***)addr; 1351 addr += (mmu_page_sizes * sizeof (page_t **)); 1352 1353 for (j = 0; j < mmu_page_sizes; j++) { 1354 colors = page_get_pagecolors(j); 1355 page_freelists[i][j] = (page_t **)addr; 1356 addr += (colors * sizeof (page_t *)); 1357 } 1358 page_cachelists[i] = (page_t **)addr; 1359 addr += (page_colors * sizeof (page_t *)); 1360 } 1361 } 1362 1363 /*ARGSUSED*/ 1364 int 1365 bp_color(struct buf *bp) 1366 { 1367 return (0); 1368 } 1369 1370 /* 1371 * get a page from any list with the given mnode 1372 */ 1373 page_t * 1374 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 1375 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 1376 { 1377 kmutex_t *pcm; 1378 int i; 1379 page_t *pp; 1380 page_t *first_pp; 1381 uint64_t pgaddr; 1382 ulong_t bin; 1383 int mtypestart; 1384 int plw_initialized; 1385 page_list_walker_t plw; 1386 1387 VM_STAT_ADD(pga_vmstats.pgma_alloc); 1388 1389 ASSERT((flags & PG_MATCH_COLOR) == 0); 1390 ASSERT(szc == 0); 1391 ASSERT(dma_attr != NULL); 1392 1393 MTYPE_START(mnode, mtype, flags); 1394 if (mtype < 0) { 1395 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 1396 return (NULL); 1397 } 1398 1399 mtypestart = mtype; 1400 1401 bin = origbin; 1402 1403 /* 1404 * check up to page_colors + 1 bins - origbin may be checked twice 1405 * because of BIN_STEP skip 1406 */ 1407 do { 1408 plw_initialized = 0; 1409 1410 for (plw.plw_count = 0; 1411 plw.plw_count < page_colors; plw.plw_count++) { 1412 1413 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 1414 goto nextfreebin; 1415 1416 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1417 mutex_enter(pcm); 1418 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 1419 first_pp = pp; 1420 while (pp != NULL) { 1421 if (page_trylock(pp, SE_EXCL) == 0) { 1422 pp = pp->p_next; 1423 if (pp == first_pp) { 1424 pp = NULL; 1425 } 1426 continue; 1427 } 1428 1429 ASSERT(PP_ISFREE(pp)); 1430 ASSERT(PP_ISAGED(pp)); 1431 ASSERT(pp->p_vnode == NULL); 1432 ASSERT(pp->p_hash == NULL); 1433 ASSERT(pp->p_offset == (u_offset_t)-1); 1434 ASSERT(pp->p_szc == szc); 1435 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 1436 /* check if page within DMA attributes */ 1437 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 1438 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 1439 (pgaddr + MMU_PAGESIZE - 1 <= 1440 dma_attr->dma_attr_addr_hi)) { 1441 break; 1442 } 1443 1444 /* continue looking */ 1445 page_unlock(pp); 1446 pp = pp->p_next; 1447 if (pp == first_pp) 1448 pp = NULL; 1449 1450 } 1451 if (pp != NULL) { 1452 ASSERT(mtype == PP_2_MTYPE(pp)); 1453 ASSERT(pp->p_szc == 0); 1454 1455 /* found a page with 
specified DMA attributes */ 1456 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 1457 mtype), pp); 1458 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1459 1460 if ((PP_ISFREE(pp) == 0) || 1461 (PP_ISAGED(pp) == 0)) { 1462 cmn_err(CE_PANIC, "page %p is not free", 1463 (void *)pp); 1464 } 1465 1466 mutex_exit(pcm); 1467 check_dma(dma_attr, pp, 1); 1468 VM_STAT_ADD(pga_vmstats.pgma_allocok); 1469 return (pp); 1470 } 1471 mutex_exit(pcm); 1472 nextfreebin: 1473 if (plw_initialized == 0) { 1474 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 1475 ASSERT(plw.plw_ceq_dif == page_colors); 1476 plw_initialized = 1; 1477 } 1478 1479 if (plw.plw_do_split) { 1480 pp = page_freelist_split(szc, bin, mnode, 1481 mtype, 1482 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 1483 &plw); 1484 if (pp != NULL) 1485 return (pp); 1486 } 1487 1488 bin = page_list_walk_next_bin(szc, bin, &plw); 1489 } 1490 1491 MTYPE_NEXT(mnode, mtype, flags); 1492 } while (mtype >= 0); 1493 1494 /* failed to find a page in the freelist; try it in the cachelist */ 1495 1496 /* reset mtype start for cachelist search */ 1497 mtype = mtypestart; 1498 ASSERT(mtype >= 0); 1499 1500 /* start with the bin of matching color */ 1501 bin = origbin; 1502 1503 do { 1504 for (i = 0; i <= page_colors; i++) { 1505 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 1506 goto nextcachebin; 1507 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 1508 mutex_enter(pcm); 1509 pp = PAGE_CACHELISTS(mnode, bin, mtype); 1510 first_pp = pp; 1511 while (pp != NULL) { 1512 if (page_trylock(pp, SE_EXCL) == 0) { 1513 pp = pp->p_next; 1514 if (pp == first_pp) 1515 break; 1516 continue; 1517 } 1518 ASSERT(pp->p_vnode); 1519 ASSERT(PP_ISAGED(pp) == 0); 1520 ASSERT(pp->p_szc == 0); 1521 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 1522 1523 /* check if page within DMA attributes */ 1524 1525 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 1526 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 1527 (pgaddr + MMU_PAGESIZE - 1 <= 1528 dma_attr->dma_attr_addr_hi)) { 1529 break; 1530 } 1531 1532 /* continue looking */ 1533 page_unlock(pp); 1534 pp = pp->p_next; 1535 if (pp == first_pp) 1536 pp = NULL; 1537 } 1538 1539 if (pp != NULL) { 1540 ASSERT(mtype == PP_2_MTYPE(pp)); 1541 ASSERT(pp->p_szc == 0); 1542 1543 /* found a page with specified DMA attributes */ 1544 page_sub(&PAGE_CACHELISTS(mnode, bin, 1545 mtype), pp); 1546 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 1547 1548 mutex_exit(pcm); 1549 ASSERT(pp->p_vnode); 1550 ASSERT(PP_ISAGED(pp) == 0); 1551 check_dma(dma_attr, pp, 1); 1552 VM_STAT_ADD(pga_vmstats.pgma_allocok); 1553 return (pp); 1554 } 1555 mutex_exit(pcm); 1556 nextcachebin: 1557 bin += (i == 0) ? BIN_STEP : 1; 1558 bin &= page_colors_mask; 1559 } 1560 MTYPE_NEXT(mnode, mtype, flags); 1561 } while (mtype >= 0); 1562 1563 VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 1564 return (NULL); 1565 } 1566 1567 /* 1568 * This function is similar to page_get_freelist()/page_get_cachelist() 1569 * but it searches both the lists to find a page with the specified 1570 * color (or no color) and DMA attributes. The search is done in the 1571 * freelist first and then in the cache list within the highest memory 1572 * range (based on DMA attributes) before searching in the lower 1573 * memory ranges. 1574 * 1575 * Note: This function is called only by page_create_io(). 
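 * For example, a dma_attr limited to 32-bit addresses starts the search in
 * the range just below 4g and only falls back to lower ranges if that fails.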
1576 */ 1577 /*ARGSUSED*/ 1578 page_t * 1579 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 1580 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 1581 { 1582 uint_t bin; 1583 int mtype; 1584 page_t *pp; 1585 int n; 1586 int m; 1587 int szc; 1588 int fullrange; 1589 int mnode; 1590 int local_failed_stat = 0; 1591 lgrp_mnode_cookie_t lgrp_cookie; 1592 1593 VM_STAT_ADD(pga_vmstats.pga_alloc); 1594 1595 /* only base pagesize currently supported */ 1596 if (size != MMU_PAGESIZE) 1597 return (NULL); 1598 1599 /* 1600 * If we're passed a specific lgroup, we use it. Otherwise, 1601 * assume first-touch placement is desired. 1602 */ 1603 if (!LGRP_EXISTS(lgrp)) 1604 lgrp = lgrp_home_lgrp(); 1605 1606 /* LINTED */ 1607 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 1608 1609 /* 1610 * Only hold one freelist or cachelist lock at a time, that way we 1611 * can start anywhere and not have to worry about lock 1612 * ordering. 1613 */ 1614 if (dma_attr == NULL) { 1615 n = 0; 1616 m = mnoderangecnt - 1; 1617 fullrange = 1; 1618 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 1619 } else { 1620 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 1621 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 1622 1623 /* 1624 * We can guarantee alignment only for page boundary. 1625 */ 1626 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 1627 return (NULL); 1628 1629 n = pfn_2_mtype(pfnlo); 1630 m = pfn_2_mtype(pfnhi); 1631 1632 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 1633 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 1634 } 1635 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 1636 1637 if (n > m) 1638 return (NULL); 1639 1640 szc = 0; 1641 1642 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 1643 if (n == 0) { 1644 flags |= PGI_MT_RANGE0; 1645 n = m; 1646 } 1647 1648 /* 1649 * Try local memory node first, but try remote if we can't 1650 * get a page of the right color. 1651 */ 1652 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 1653 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 1654 /* 1655 * allocate pages from high pfn to low. 1656 */ 1657 for (mtype = m; mtype >= n; mtype--) { 1658 if (fullrange != 0) { 1659 pp = page_get_mnode_freelist(mnode, 1660 bin, mtype, szc, flags); 1661 if (pp == NULL) { 1662 pp = page_get_mnode_cachelist( 1663 bin, flags, mnode, mtype); 1664 } 1665 } else { 1666 pp = page_get_mnode_anylist(bin, szc, 1667 flags, mnode, mtype, dma_attr); 1668 } 1669 if (pp != NULL) { 1670 VM_STAT_ADD(pga_vmstats.pga_allocok); 1671 check_dma(dma_attr, pp, 1); 1672 return (pp); 1673 } 1674 } 1675 if (!local_failed_stat) { 1676 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 1677 local_failed_stat = 1; 1678 } 1679 } 1680 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 1681 1682 return (NULL); 1683 } 1684 1685 /* 1686 * page_create_io() 1687 * 1688 * This function is a copy of page_create_va() with an additional 1689 * argument 'mattr' that specifies DMA memory requirements to 1690 * the page list functions. This function is used by the segkmem 1691 * allocator so it is only to create new pages (i.e PG_EXCL is 1692 * set). 1693 * 1694 * Note: This interface is currently used by x86 PSM only and is 1695 * not fully specified so the commitment level is only for 1696 * private interface specific to x86. This interface uses PSM 1697 * specific page_get_anylist() interface. 
1698 */ 1699 1700 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 1701 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 1702 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 1703 break; \ 1704 } \ 1705 } 1706 1707 1708 page_t * 1709 page_create_io( 1710 struct vnode *vp, 1711 u_offset_t off, 1712 uint_t bytes, 1713 uint_t flags, 1714 struct as *as, 1715 caddr_t vaddr, 1716 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 1717 { 1718 page_t *plist = NULL; 1719 uint_t plist_len = 0; 1720 pgcnt_t npages; 1721 page_t *npp = NULL; 1722 uint_t pages_req; 1723 page_t *pp; 1724 kmutex_t *phm = NULL; 1725 uint_t index; 1726 1727 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 1728 "page_create_start:vp %p off %llx bytes %u flags %x", 1729 vp, off, bytes, flags); 1730 1731 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 1732 1733 pages_req = npages = mmu_btopr(bytes); 1734 1735 /* 1736 * Do the freemem and pcf accounting. 1737 */ 1738 if (!page_create_wait(npages, flags)) { 1739 return (NULL); 1740 } 1741 1742 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 1743 "page_create_success:vp %p off %llx", 1744 vp, off); 1745 1746 /* 1747 * If satisfying this request has left us with too little 1748 * memory, start the wheels turning to get some back. The 1749 * first clause of the test prevents waking up the pageout 1750 * daemon in situations where it would decide that there's 1751 * nothing to do. 1752 */ 1753 if (nscan < desscan && freemem < minfree) { 1754 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 1755 "pageout_cv_signal:freemem %ld", freemem); 1756 cv_signal(&proc_pageout->p_cv); 1757 } 1758 1759 if (flags & PG_PHYSCONTIG) { 1760 1761 plist = page_get_contigpage(&npages, mattr, 1); 1762 if (plist == NULL) { 1763 page_create_putback(npages); 1764 return (NULL); 1765 } 1766 1767 pp = plist; 1768 1769 do { 1770 if (!page_hashin(pp, vp, off, NULL)) { 1771 panic("pg_creat_io: hashin failed %p %p %llx", 1772 (void *)pp, (void *)vp, off); 1773 } 1774 VM_STAT_ADD(page_create_new); 1775 off += MMU_PAGESIZE; 1776 PP_CLRFREE(pp); 1777 PP_CLRAGED(pp); 1778 page_set_props(pp, P_REF); 1779 pp = pp->p_next; 1780 } while (pp != plist); 1781 1782 if (!npages) { 1783 check_dma(mattr, plist, pages_req); 1784 return (plist); 1785 } else { 1786 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 1787 } 1788 1789 /* 1790 * fall-thru: 1791 * 1792 * page_get_contigpage returns when npages <= sgllen. 1793 * Grab the rest of the non-contig pages below from anylist. 1794 */ 1795 } 1796 1797 /* 1798 * Loop around collecting the requested number of pages. 1799 * Most of the time, we have to `create' a new page. With 1800 * this in mind, pull the page off the free list before 1801 * getting the hash lock. This will minimize the hash 1802 * lock hold time, nesting, and the like. If it turns 1803 * out we don't need the page, we put it back at the end. 1804 */ 1805 while (npages--) { 1806 phm = NULL; 1807 1808 index = PAGE_HASH_FUNC(vp, off); 1809 top: 1810 ASSERT(phm == NULL); 1811 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 1812 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1813 1814 if (npp == NULL) { 1815 /* 1816 * Try to get the page of any color either from 1817 * the freelist or from the cache list. 1818 */ 1819 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 1820 flags & ~PG_MATCH_COLOR, mattr, NULL); 1821 if (npp == NULL) { 1822 if (mattr == NULL) { 1823 /* 1824 * Not looking for a special page; 1825 * panic! 
1826 */ 1827 panic("no page found %d", (int)npages); 1828 } 1829 /* 1830 * No page found! This can happen 1831 * if we are looking for a page 1832 * within a specific memory range 1833 * for DMA purposes. If PG_WAIT is 1834 * specified then we wait for a 1835 * while and then try again. The 1836 * wait could be forever if we 1837 * don't get the page(s) we need. 1838 * 1839 * Note: XXX We really need a mechanism 1840 * to wait for pages in the desired 1841 * range. For now, we wait for any 1842 * pages and see if we can use it. 1843 */ 1844 1845 if ((mattr != NULL) && (flags & PG_WAIT)) { 1846 delay(10); 1847 goto top; 1848 } 1849 goto fail; /* undo accounting stuff */ 1850 } 1851 1852 if (PP_ISAGED(npp) == 0) { 1853 /* 1854 * Since this page came from the 1855 * cachelist, we must destroy the 1856 * old vnode association. 1857 */ 1858 page_hashout(npp, (kmutex_t *)NULL); 1859 } 1860 } 1861 1862 /* 1863 * We own this page! 1864 */ 1865 ASSERT(PAGE_EXCL(npp)); 1866 ASSERT(npp->p_vnode == NULL); 1867 ASSERT(!hat_page_is_mapped(npp)); 1868 PP_CLRFREE(npp); 1869 PP_CLRAGED(npp); 1870 1871 /* 1872 * Here we have a page in our hot little mits and are 1873 * just waiting to stuff it on the appropriate lists. 1874 * Get the mutex and check to see if it really does 1875 * not exist. 1876 */ 1877 phm = PAGE_HASH_MUTEX(index); 1878 mutex_enter(phm); 1879 PAGE_HASH_SEARCH(index, pp, vp, off); 1880 if (pp == NULL) { 1881 VM_STAT_ADD(page_create_new); 1882 pp = npp; 1883 npp = NULL; 1884 if (!page_hashin(pp, vp, off, phm)) { 1885 /* 1886 * Since we hold the page hash mutex and 1887 * just searched for this page, page_hashin 1888 * had better not fail. If it does, that 1889 * means somethread did not follow the 1890 * page hash mutex rules. Panic now and 1891 * get it over with. As usual, go down 1892 * holding all the locks. 1893 */ 1894 ASSERT(MUTEX_HELD(phm)); 1895 panic("page_create: hashin fail %p %p %llx %p", 1896 (void *)pp, (void *)vp, off, (void *)phm); 1897 1898 } 1899 ASSERT(MUTEX_HELD(phm)); 1900 mutex_exit(phm); 1901 phm = NULL; 1902 1903 /* 1904 * Hat layer locking need not be done to set 1905 * the following bits since the page is not hashed 1906 * and was on the free list (i.e., had no mappings). 1907 * 1908 * Set the reference bit to protect 1909 * against immediate pageout 1910 * 1911 * XXXmh modify freelist code to set reference 1912 * bit so we don't have to do it here. 1913 */ 1914 page_set_props(pp, P_REF); 1915 } else { 1916 ASSERT(MUTEX_HELD(phm)); 1917 mutex_exit(phm); 1918 phm = NULL; 1919 /* 1920 * NOTE: This should not happen for pages associated 1921 * with kernel vnode 'kvp'. 1922 */ 1923 /* XX64 - to debug why this happens! */ 1924 ASSERT(!VN_ISKAS(vp)); 1925 if (VN_ISKAS(vp)) 1926 cmn_err(CE_NOTE, 1927 "page_create: page not expected " 1928 "in hash list for kernel vnode - pp 0x%p", 1929 (void *)pp); 1930 VM_STAT_ADD(page_create_exists); 1931 goto fail; 1932 } 1933 1934 /* 1935 * Got a page! It is locked. Acquire the i/o 1936 * lock since we are going to use the p_next and 1937 * p_prev fields to link the requested pages together. 1938 */ 1939 page_io_lock(pp); 1940 page_add(&plist, pp); 1941 plist = plist->p_next; 1942 off += MMU_PAGESIZE; 1943 vaddr += MMU_PAGESIZE; 1944 } 1945 1946 check_dma(mattr, plist, pages_req); 1947 return (plist); 1948 1949 fail: 1950 if (npp != NULL) { 1951 /* 1952 * Did not need this page after all. 1953 * Put it back on the free list. 
1954 */ 1955 VM_STAT_ADD(page_create_putbacks); 1956 PP_SETFREE(npp); 1957 PP_SETAGED(npp); 1958 npp->p_offset = (u_offset_t)-1; 1959 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 1960 page_unlock(npp); 1961 } 1962 1963 /* 1964 * Give up the pages we already got. 1965 */ 1966 while (plist != NULL) { 1967 pp = plist; 1968 page_sub(&plist, pp); 1969 page_io_unlock(pp); 1970 plist_len++; 1971 /*LINTED: constant in conditional ctx*/ 1972 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1973 } 1974 1975 /* 1976 * VN_DISPOSE does freemem accounting for the pages in plist 1977 * by calling page_free. So, we need to undo the pcf accounting 1978 * for only the remaining pages. 1979 */ 1980 VM_STAT_ADD(page_create_putbacks); 1981 page_create_putback(pages_req - plist_len); 1982 1983 return (NULL); 1984 } 1985 1986 1987 /* 1988 * Copy the data from the physical page represented by "frompp" to 1989 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 1990 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 1991 * level and no one sleeps with an active mapping there. 1992 * 1993 * Note that the ref/mod bits in the page_t's are not affected by 1994 * this operation, hence it is up to the caller to update them appropriately. 1995 */ 1996 int 1997 ppcopy(page_t *frompp, page_t *topp) 1998 { 1999 caddr_t pp_addr1; 2000 caddr_t pp_addr2; 2001 hat_mempte_t pte1; 2002 hat_mempte_t pte2; 2003 kmutex_t *ppaddr_mutex; 2004 label_t ljb; 2005 int ret = 1; 2006 2007 ASSERT_STACK_ALIGNED(); 2008 ASSERT(PAGE_LOCKED(frompp)); 2009 ASSERT(PAGE_LOCKED(topp)); 2010 2011 if (kpm_enable) { 2012 pp_addr1 = hat_kpm_page2va(frompp, 0); 2013 pp_addr2 = hat_kpm_page2va(topp, 0); 2014 kpreempt_disable(); 2015 } else { 2016 /* 2017 * disable pre-emption so that CPU can't change 2018 */ 2019 kpreempt_disable(); 2020 2021 pp_addr1 = CPU->cpu_caddr1; 2022 pp_addr2 = CPU->cpu_caddr2; 2023 pte1 = CPU->cpu_caddr1pte; 2024 pte2 = CPU->cpu_caddr2pte; 2025 2026 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 2027 mutex_enter(ppaddr_mutex); 2028 2029 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 2030 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 2031 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 2032 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 2033 HAT_LOAD_NOCONSIST); 2034 } 2035 2036 if (on_fault(&ljb)) { 2037 ret = 0; 2038 goto faulted; 2039 } 2040 if (use_sse_pagecopy) 2041 hwblkpagecopy(pp_addr1, pp_addr2); 2042 else 2043 bcopy(pp_addr1, pp_addr2, PAGESIZE); 2044 2045 no_fault(); 2046 faulted: 2047 if (!kpm_enable) { 2048 mutex_exit(ppaddr_mutex); 2049 } 2050 kpreempt_enable(); 2051 return (ret); 2052 } 2053 2054 /* 2055 * Zero the physical page from off to off + len given by `pp' 2056 * without changing the reference and modified bits of page. 2057 * 2058 * We use this using CPU private page address #2, see ppcopy() for more info. 2059 * pagezero() must not be called at interrupt level. 
2060 */ 2061 void 2062 pagezero(page_t *pp, uint_t off, uint_t len) 2063 { 2064 caddr_t pp_addr2; 2065 hat_mempte_t pte2; 2066 kmutex_t *ppaddr_mutex; 2067 2068 ASSERT_STACK_ALIGNED(); 2069 ASSERT(len <= MMU_PAGESIZE); 2070 ASSERT(off <= MMU_PAGESIZE); 2071 ASSERT(off + len <= MMU_PAGESIZE); 2072 ASSERT(PAGE_LOCKED(pp)); 2073 2074 if (kpm_enable) { 2075 pp_addr2 = hat_kpm_page2va(pp, 0); 2076 kpreempt_disable(); 2077 } else { 2078 kpreempt_disable(); 2079 2080 pp_addr2 = CPU->cpu_caddr2; 2081 pte2 = CPU->cpu_caddr2pte; 2082 2083 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 2084 mutex_enter(ppaddr_mutex); 2085 2086 hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2, 2087 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 2088 HAT_LOAD_NOCONSIST); 2089 } 2090 2091 if (use_sse_pagezero) { 2092 hwblkclr(pp_addr2 + off, len); 2093 } else { 2094 bzero(pp_addr2 + off, len); 2095 } 2096 2097 if (!kpm_enable) 2098 mutex_exit(ppaddr_mutex); 2099 kpreempt_enable(); 2100 } 2101 2102 /* 2103 * Platform-dependent page scrub call. 2104 */ 2105 void 2106 pagescrub(page_t *pp, uint_t off, uint_t len) 2107 { 2108 /* 2109 * For now, we rely on the fact that pagezero() will 2110 * always clear UEs. 2111 */ 2112 pagezero(pp, off, len); 2113 } 2114 2115 /* 2116 * set up two private addresses for use on a given CPU for use in ppcopy() 2117 */ 2118 void 2119 setup_vaddr_for_ppcopy(struct cpu *cpup) 2120 { 2121 void *addr; 2122 hat_mempte_t pte_pa; 2123 2124 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 2125 pte_pa = hat_mempte_setup(addr); 2126 cpup->cpu_caddr1 = addr; 2127 cpup->cpu_caddr1pte = pte_pa; 2128 2129 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 2130 pte_pa = hat_mempte_setup(addr); 2131 cpup->cpu_caddr2 = addr; 2132 cpup->cpu_caddr2pte = pte_pa; 2133 2134 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 2135 } 2136 2137 /* 2138 * Undo setup_vaddr_for_ppcopy 2139 */ 2140 void 2141 teardown_vaddr_for_ppcopy(struct cpu *cpup) 2142 { 2143 mutex_destroy(&cpup->cpu_ppaddr_mutex); 2144 2145 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 2146 cpup->cpu_caddr2pte = 0; 2147 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 2148 cpup->cpu_caddr2 = 0; 2149 2150 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 2151 cpup->cpu_caddr1pte = 0; 2152 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 2153 cpup->cpu_caddr1 = 0; 2154 } 2155 2156 /* 2157 * Create the pageout scanner thread. The thread has to 2158 * start at procedure with process pp and priority pri. 2159 */ 2160 void 2161 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) 2162 { 2163 (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); 2164 } 2165 2166 /* 2167 * Function for flushing D-cache when performing module relocations 2168 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 2169 */ 2170 void 2171 dcache_flushall() 2172 {} 2173 2174 size_t 2175 exec_get_spslew(void) 2176 { 2177 return (0); 2178 } 2179 2180 /* 2181 * Allocate a memory page. The argument 'seed' can be any pseudo-random 2182 * number to vary where the pages come from. This is quite a hacked up 2183 * method -- it works for now, but really needs to be fixed up a bit. 2184 * 2185 * We currently use page_create_va() on the kvp with fake offsets, 2186 * segments and virt address. This is pretty bogus, but was copied from the 2187 * old hat_i86.c code. 
A better approach would be to specify either mnode
 * random or mnode local and take a page from whatever color has the MOST
 * available - this would have a minimal impact on page coloring.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	/*
	 * This code is gross, we really need a simpler page allocator.
	 *
	 * We need to assign an offset for the page to call page_create_va().
	 * To avoid conflicts with other pages, we get creative with the offset.
	 * For 32 bits, we pick an offset > 4Gig
	 * For 64 bits, pick an offset somewhere in the VA hole.
	 */
	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
	offset += mmu.hole_start;	/* something in VA hole */
#else
	offset += 1ULL << 40;		/* something > 4 Gig */
#endif

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", pp);
#endif

	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL | PG_NORELOC,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
	if (pp == NULL)
		return (NULL);
	page_io_unlock(pp);
	page_hashout(pp, NULL);
	return (pp);
}