1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 /* 37 * UNIX machine dependent virtual memory support. 
38 */ 39 40 #include <sys/types.h> 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/user.h> 44 #include <sys/proc.h> 45 #include <sys/kmem.h> 46 #include <sys/vmem.h> 47 #include <sys/buf.h> 48 #include <sys/cpuvar.h> 49 #include <sys/lgrp.h> 50 #include <sys/disp.h> 51 #include <sys/vm.h> 52 #include <sys/mman.h> 53 #include <sys/vnode.h> 54 #include <sys/cred.h> 55 #include <sys/exec.h> 56 #include <sys/exechdr.h> 57 #include <sys/debug.h> 58 #include <sys/vmsystm.h> 59 60 #include <vm/hat.h> 61 #include <vm/as.h> 62 #include <vm/seg.h> 63 #include <vm/seg_kp.h> 64 #include <vm/seg_vn.h> 65 #include <vm/page.h> 66 #include <vm/seg_kmem.h> 67 #include <vm/seg_kpm.h> 68 #include <vm/vm_dep.h> 69 70 #include <sys/cpu.h> 71 #include <sys/vm_machparam.h> 72 #include <sys/memlist.h> 73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */ 74 #include <vm/hat_i86.h> 75 #include <sys/x86_archext.h> 76 #include <sys/elf_386.h> 77 #include <sys/cmn_err.h> 78 #include <sys/archsystm.h> 79 #include <sys/machsystm.h> 80 81 #include <sys/vtrace.h> 82 #include <sys/ddidmareq.h> 83 #include <sys/promif.h> 84 #include <sys/memnode.h> 85 #include <sys/stack.h> 86 #include <util/qsort.h> 87 #include <sys/taskq.h> 88 89 #ifdef __xpv 90 91 #include <sys/hypervisor.h> 92 #include <sys/xen_mmu.h> 93 #include <sys/balloon_impl.h> 94 95 /* 96 * domain 0 pages usable for DMA are kept pre-allocated and kept in 97 * distinct lists, ordered by increasing mfn. 
98 */ 99 static kmutex_t io_pool_lock; 100 static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */ 101 static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */ 102 static long io_pool_cnt; 103 static long io_pool_cnt_max = 0; 104 #define DEFAULT_IO_POOL_MIN 128 105 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN; 106 static long io_pool_cnt_lowater = 0; 107 static long io_pool_shrink_attempts; /* how many times did we try to shrink */ 108 static long io_pool_shrinks; /* how many times did we really shrink */ 109 static long io_pool_grows; /* how many times did we grow */ 110 static mfn_t start_mfn = 1; 111 static caddr_t io_pool_kva; /* use to alloc pages when needed */ 112 113 static int create_contig_pfnlist(uint_t); 114 115 /* 116 * percentage of phys mem to hold in the i/o pool 117 */ 118 #define DEFAULT_IO_POOL_PCT 2 119 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT; 120 static void page_io_pool_sub(page_t **, page_t *, page_t *); 121 122 #endif /* __xpv */ 123 124 uint_t vac_colors = 1; 125 126 int largepagesupport = 0; 127 extern uint_t page_create_new; 128 extern uint_t page_create_exists; 129 extern uint_t page_create_putbacks; 130 extern uint_t page_create_putbacks; 131 /* 132 * Allow users to disable the kernel's use of SSE. 133 */ 134 extern int use_sse_pagecopy, use_sse_pagezero; 135 136 /* 137 * combined memory ranges from mnode and memranges[] to manage single 138 * mnode/mtype dimension in the page lists. 
139 */ 140 typedef struct { 141 pfn_t mnr_pfnlo; 142 pfn_t mnr_pfnhi; 143 int mnr_mnode; 144 int mnr_memrange; /* index into memranges[] */ 145 /* maintain page list stats */ 146 pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */ 147 pgcnt_t mnr_mt_flpgcnt; /* free list cnt - small pages */ 148 pgcnt_t mnr_mt_lgpgcnt; /* free list cnt - large pages */ 149 #ifdef DEBUG 150 struct mnr_mts { /* mnode/mtype szc stats */ 151 pgcnt_t mnr_mts_pgcnt; 152 int mnr_mts_colors; 153 pgcnt_t *mnr_mtsc_pgcnt; 154 } *mnr_mts; 155 #endif 156 } mnoderange_t; 157 158 #define MEMRANGEHI(mtype) \ 159 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax) 160 #define MEMRANGELO(mtype) (memranges[mtype]) 161 162 #define MTYPE_FREEMEM(mt) \ 163 (mnoderanges[mt].mnr_mt_clpgcnt + \ 164 mnoderanges[mt].mnr_mt_flpgcnt + \ 165 mnoderanges[mt].mnr_mt_lgpgcnt) 166 167 /* 168 * As the PC architecture evolved memory up was clumped into several 169 * ranges for various historical I/O devices to do DMA. 170 * < 16Meg - ISA bus 171 * < 2Gig - ??? 172 * < 4Gig - PCI bus or drivers that don't understand PAE mode 173 * 174 * These are listed in reverse order, so that we can skip over unused 175 * ranges on machines with small memories. 176 * 177 * For now under the Hypervisor, we'll only ever have one memrange. 178 */ 179 #define PFN_4GIG 0x100000 180 #define PFN_16MEG 0x1000 181 static pfn_t arch_memranges[NUM_MEM_RANGES] = { 182 PFN_4GIG, /* pfn range for 4G and above */ 183 0x80000, /* pfn range for 2G-4G */ 184 PFN_16MEG, /* pfn range for 16M-2G */ 185 0x00000, /* pfn range for 0-16M */ 186 }; 187 pfn_t *memranges = &arch_memranges[0]; 188 int nranges = NUM_MEM_RANGES; 189 190 /* 191 * This combines mem_node_config and memranges into one data 192 * structure to be used for page list management. 
193 */ 194 mnoderange_t *mnoderanges; 195 int mnoderangecnt; 196 int mtype4g; 197 198 /* 199 * 4g memory management variables for systems with more than 4g of memory: 200 * 201 * physical memory below 4g is required for 32bit dma devices and, currently, 202 * for kmem memory. On systems with more than 4g of memory, the pool of memory 203 * below 4g can be depleted without any paging activity given that there is 204 * likely to be sufficient memory above 4g. 205 * 206 * physmax4g is set true if the largest pfn is over 4g. The rest of the 207 * 4g memory management code is enabled only when physmax4g is true. 208 * 209 * maxmem4g is the count of the maximum number of pages on the page lists 210 * with physical addresses below 4g. It can be a lot less then 4g given that 211 * BIOS may reserve large chunks of space below 4g for hot plug pci devices, 212 * agp aperture etc. 213 * 214 * freemem4g maintains the count of the number of available pages on the 215 * page lists with physical addresses below 4g. 216 * 217 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to 218 * 6% (desfree4gshift = 4) of maxmem4g. 219 * 220 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G 221 * and the amount of physical memory above 4g is greater than freemem4g. 222 * In this case, page_get_* routines will restrict below 4g allocations 223 * for requests that don't specifically require it. 224 */ 225 226 #define LOTSFREE4G (maxmem4g >> lotsfree4gshift) 227 #define DESFREE4G (maxmem4g >> desfree4gshift) 228 229 #define RESTRICT4G_ALLOC \ 230 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem)) 231 232 static pgcnt_t maxmem4g; 233 static pgcnt_t freemem4g; 234 static int physmax4g; 235 static int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */ 236 static int lotsfree4gshift = 3; 237 238 /* 239 * 16m memory management: 240 * 241 * reserve some amount of physical memory below 16m for legacy devices. 
242 * 243 * RESTRICT16M_ALLOC returns true if an there are sufficient free pages above 244 * 16m or if the 16m pool drops below DESFREE16M. 245 * 246 * In this case, general page allocations via page_get_{free,cache}list 247 * routines will be restricted from allocating from the 16m pool. Allocations 248 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations 249 * are not restricted. 250 */ 251 252 #define FREEMEM16M MTYPE_FREEMEM(0) 253 #define DESFREE16M desfree16m 254 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \ 255 ((freemem != 0) && ((flags & PG_PANIC) == 0) && \ 256 ((freemem >= (FREEMEM16M)) || \ 257 (FREEMEM16M < (DESFREE16M + pgcnt)))) 258 259 static pgcnt_t desfree16m = 0x380; 260 261 /* 262 * This can be patched via /etc/system to allow old non-PAE aware device 263 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM. 264 */ 265 int restricted_kmemalloc = 0; 266 267 #ifdef VM_STATS 268 struct { 269 ulong_t pga_alloc; 270 ulong_t pga_notfullrange; 271 ulong_t pga_nulldmaattr; 272 ulong_t pga_allocok; 273 ulong_t pga_allocfailed; 274 ulong_t pgma_alloc; 275 ulong_t pgma_allocok; 276 ulong_t pgma_allocfailed; 277 ulong_t pgma_allocempty; 278 } pga_vmstats; 279 #endif 280 281 uint_t mmu_page_sizes; 282 283 /* How many page sizes the users can see */ 284 uint_t mmu_exported_page_sizes; 285 286 /* 287 * Number of pages in 1 GB. Don't enable automatic large pages if we have 288 * fewer than this many pages. 289 */ 290 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 291 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 292 293 /* 294 * Maximum and default segment size tunables for user private 295 * and shared anon memory, and user text and initialized data. 296 * These can be patched via /etc/system to allow large pages 297 * to be used for mapping application private and shared anon memory. 
298 */ 299 size_t mcntl0_lpsize = MMU_PAGESIZE; 300 size_t max_uheap_lpsize = MMU_PAGESIZE; 301 size_t default_uheap_lpsize = MMU_PAGESIZE; 302 size_t max_ustack_lpsize = MMU_PAGESIZE; 303 size_t default_ustack_lpsize = MMU_PAGESIZE; 304 size_t max_privmap_lpsize = MMU_PAGESIZE; 305 size_t max_uidata_lpsize = MMU_PAGESIZE; 306 size_t max_utext_lpsize = MMU_PAGESIZE; 307 size_t max_shm_lpsize = MMU_PAGESIZE; 308 309 310 /* 311 * initialized by page_coloring_init(). 312 */ 313 uint_t page_colors; 314 uint_t page_colors_mask; 315 uint_t page_coloring_shift; 316 int cpu_page_colors; 317 static uint_t l2_colors; 318 319 /* 320 * Page freelists and cachelists are dynamically allocated once mnoderangecnt 321 * and page_colors are calculated from the l2 cache n-way set size. Within a 322 * mnode range, the page freelist and cachelist are hashed into bins based on 323 * color. This makes it easier to search for a page within a specific memory 324 * range. 325 */ 326 #define PAGE_COLORS_MIN 16 327 328 page_t ****page_freelists; 329 page_t ***page_cachelists; 330 331 332 /* 333 * Used by page layer to know about page sizes 334 */ 335 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; 336 337 kmutex_t *fpc_mutex[NPC_MUTEX]; 338 kmutex_t *cpc_mutex[NPC_MUTEX]; 339 340 /* 341 * Only let one thread at a time try to coalesce large pages, to 342 * prevent them from working against each other. 
343 */ 344 static kmutex_t contig_lock; 345 #define CONTIG_LOCK() mutex_enter(&contig_lock); 346 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 347 348 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 349 350 /* 351 * Return the optimum page size for a given mapping 352 */ 353 /*ARGSUSED*/ 354 size_t 355 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 356 { 357 level_t l = 0; 358 size_t pgsz = MMU_PAGESIZE; 359 size_t max_lpsize; 360 uint_t mszc; 361 362 ASSERT(maptype != MAPPGSZ_VA); 363 364 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 365 return (MMU_PAGESIZE); 366 } 367 368 switch (maptype) { 369 case MAPPGSZ_HEAP: 370 case MAPPGSZ_STK: 371 max_lpsize = memcntl ? mcntl0_lpsize : (maptype == 372 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize); 373 if (max_lpsize == MMU_PAGESIZE) { 374 return (MMU_PAGESIZE); 375 } 376 if (len == 0) { 377 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase + 378 p->p_brksize - p->p_bssbase : p->p_stksize; 379 } 380 len = (maptype == MAPPGSZ_HEAP) ? MAX(len, 381 default_uheap_lpsize) : MAX(len, default_ustack_lpsize); 382 383 /* 384 * use the pages size that best fits len 385 */ 386 for (l = mmu.max_page_level; l > 0; --l) { 387 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) { 388 continue; 389 } else { 390 pgsz = LEVEL_SIZE(l); 391 } 392 break; 393 } 394 395 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc : 396 p->p_stkpageszc); 397 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) { 398 pgsz = hw_page_array[mszc].hp_size; 399 } 400 return (pgsz); 401 402 /* 403 * for ISM use the 1st large page size. 
404 */ 405 case MAPPGSZ_ISM: 406 if (mmu.max_page_level == 0) 407 return (MMU_PAGESIZE); 408 return (LEVEL_SIZE(1)); 409 } 410 return (pgsz); 411 } 412 413 static uint_t 414 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize, 415 size_t min_physmem) 416 { 417 caddr_t eaddr = addr + size; 418 uint_t szcvec = 0; 419 caddr_t raddr; 420 caddr_t readdr; 421 size_t pgsz; 422 int i; 423 424 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 425 return (0); 426 } 427 428 for (i = mmu_page_sizes - 1; i > 0; i--) { 429 pgsz = page_get_pagesize(i); 430 if (pgsz > max_lpsize) { 431 continue; 432 } 433 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 434 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 435 if (raddr < addr || raddr >= readdr) { 436 continue; 437 } 438 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 439 continue; 440 } 441 /* 442 * Set szcvec to the remaining page sizes. 443 */ 444 szcvec = ((1 << (i + 1)) - 1) & ~1; 445 break; 446 } 447 return (szcvec); 448 } 449 450 /* 451 * Return a bit vector of large page size codes that 452 * can be used to map [addr, addr + len) region. 
453 */ 454 /*ARGSUSED*/ 455 uint_t 456 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 457 int memcntl) 458 { 459 size_t max_lpsize = mcntl0_lpsize; 460 461 if (mmu.max_page_level == 0) 462 return (0); 463 464 if (flags & MAP_TEXT) { 465 if (!memcntl) 466 max_lpsize = max_utext_lpsize; 467 return (map_szcvec(addr, size, off, max_lpsize, 468 shm_lpg_min_physmem)); 469 470 } else if (flags & MAP_INITDATA) { 471 if (!memcntl) 472 max_lpsize = max_uidata_lpsize; 473 return (map_szcvec(addr, size, off, max_lpsize, 474 privm_lpg_min_physmem)); 475 476 } else if (type == MAPPGSZC_SHM) { 477 if (!memcntl) 478 max_lpsize = max_shm_lpsize; 479 return (map_szcvec(addr, size, off, max_lpsize, 480 shm_lpg_min_physmem)); 481 482 } else if (type == MAPPGSZC_HEAP) { 483 if (!memcntl) 484 max_lpsize = max_uheap_lpsize; 485 return (map_szcvec(addr, size, off, max_lpsize, 486 privm_lpg_min_physmem)); 487 488 } else if (type == MAPPGSZC_STACK) { 489 if (!memcntl) 490 max_lpsize = max_ustack_lpsize; 491 return (map_szcvec(addr, size, off, max_lpsize, 492 privm_lpg_min_physmem)); 493 494 } else { 495 if (!memcntl) 496 max_lpsize = max_privmap_lpsize; 497 return (map_szcvec(addr, size, off, max_lpsize, 498 privm_lpg_min_physmem)); 499 } 500 } 501 502 /* 503 * Handle a pagefault. 504 */ 505 faultcode_t 506 pagefault( 507 caddr_t addr, 508 enum fault_type type, 509 enum seg_rw rw, 510 int iskernel) 511 { 512 struct as *as; 513 struct hat *hat; 514 struct proc *p; 515 kthread_t *t; 516 faultcode_t res; 517 caddr_t base; 518 size_t len; 519 int err; 520 int mapped_red; 521 uintptr_t ea; 522 523 ASSERT_STACK_ALIGNED(); 524 525 if (INVALID_VADDR(addr)) 526 return (FC_NOMAP); 527 528 mapped_red = segkp_map_red(); 529 530 if (iskernel) { 531 as = &kas; 532 hat = as->a_hat; 533 } else { 534 t = curthread; 535 p = ttoproc(t); 536 as = p->p_as; 537 hat = as->a_hat; 538 } 539 540 /* 541 * Dispatch pagefault. 
542 */ 543 res = as_fault(hat, as, addr, 1, type, rw); 544 545 /* 546 * If this isn't a potential unmapped hole in the user's 547 * UNIX data or stack segments, just return status info. 548 */ 549 if (res != FC_NOMAP || iskernel) 550 goto out; 551 552 /* 553 * Check to see if we happened to faulted on a currently unmapped 554 * part of the UNIX data or stack segments. If so, create a zfod 555 * mapping there and then try calling the fault routine again. 556 */ 557 base = p->p_brkbase; 558 len = p->p_brksize; 559 560 if (addr < base || addr >= base + len) { /* data seg? */ 561 base = (caddr_t)p->p_usrstack - p->p_stksize; 562 len = p->p_stksize; 563 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */ 564 /* not in either UNIX data or stack segments */ 565 res = FC_NOMAP; 566 goto out; 567 } 568 } 569 570 /* 571 * the rest of this function implements a 3.X 4.X 5.X compatibility 572 * This code is probably not needed anymore 573 */ 574 if (p->p_model == DATAMODEL_ILP32) { 575 576 /* expand the gap to the page boundaries on each side */ 577 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE); 578 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE); 579 len = ea - (uintptr_t)base; 580 581 as_rangelock(as); 582 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) == 583 0) { 584 err = as_map(as, base, len, segvn_create, zfod_argsp); 585 as_rangeunlock(as); 586 if (err) { 587 res = FC_MAKE_ERR(err); 588 goto out; 589 } 590 } else { 591 /* 592 * This page is already mapped by another thread after 593 * we returned from as_fault() above. We just fall 594 * through as_fault() below. 595 */ 596 as_rangeunlock(as); 597 } 598 599 res = as_fault(hat, as, addr, 1, F_INVAL, rw); 600 } 601 602 out: 603 if (mapped_red) 604 segkp_unmap_red(); 605 606 return (res); 607 } 608 609 void 610 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) 611 { 612 struct proc *p = curproc; 613 caddr_t userlimit = (flags & _MAP_LOW32) ? 
614 (caddr_t)_userlimit32 : p->p_as->a_userlimit; 615 616 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); 617 } 618 619 /*ARGSUSED*/ 620 int 621 map_addr_vacalign_check(caddr_t addr, u_offset_t off) 622 { 623 return (0); 624 } 625 626 /* 627 * map_addr_proc() is the routine called when the system is to 628 * choose an address for the user. We will pick an address 629 * range which is the highest available below userlimit. 630 * 631 * addrp is a value/result parameter. 632 * On input it is a hint from the user to be used in a completely 633 * machine dependent fashion. We decide to completely ignore this hint. 634 * 635 * On output it is NULL if no address can be found in the current 636 * processes address space or else an address that is currently 637 * not mapped for len bytes with a page of red zone on either side. 638 * 639 * align is not needed on x86 (it's for viturally addressed caches) 640 */ 641 /*ARGSUSED*/ 642 void 643 map_addr_proc( 644 caddr_t *addrp, 645 size_t len, 646 offset_t off, 647 int vacalign, 648 caddr_t userlimit, 649 struct proc *p, 650 uint_t flags) 651 { 652 struct as *as = p->p_as; 653 caddr_t addr; 654 caddr_t base; 655 size_t slen; 656 size_t align_amount; 657 658 ASSERT32(userlimit == as->a_userlimit); 659 660 base = p->p_brkbase; 661 #if defined(__amd64) 662 /* 663 * XX64 Yes, this needs more work. 664 */ 665 if (p->p_model == DATAMODEL_NATIVE) { 666 if (userlimit < as->a_userlimit) { 667 /* 668 * This happens when a program wants to map 669 * something in a range that's accessible to a 670 * program in a smaller address space. For example, 671 * a 64-bit program calling mmap32(2) to guarantee 672 * that the returned address is below 4Gbytes. 673 */ 674 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff)); 675 676 if (userlimit > base) 677 slen = userlimit - base; 678 else { 679 *addrp = NULL; 680 return; 681 } 682 } else { 683 /* 684 * XX64 This layout is probably wrong .. 
but in 685 * the event we make the amd64 address space look 686 * like sparcv9 i.e. with the stack -above- the 687 * heap, this bit of code might even be correct. 688 */ 689 slen = p->p_usrstack - base - 690 (((size_t)rctl_enforced_value( 691 rctlproc_legacy[RLIMIT_STACK], 692 p->p_rctls, p) + PAGEOFFSET) & PAGEMASK); 693 } 694 } else 695 #endif 696 slen = userlimit - base; 697 698 len = (len + PAGEOFFSET) & PAGEMASK; 699 700 /* 701 * Redzone for each side of the request. This is done to leave 702 * one page unmapped between segments. This is not required, but 703 * it's useful for the user because if their program strays across 704 * a segment boundary, it will catch a fault immediately making 705 * debugging a little easier. 706 */ 707 len += 2 * MMU_PAGESIZE; 708 709 /* 710 * figure out what the alignment should be 711 * 712 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same???? 713 */ 714 if (len <= ELF_386_MAXPGSZ) { 715 /* 716 * Align virtual addresses to ensure that ELF shared libraries 717 * are mapped with the appropriate alignment constraints by 718 * the run-time linker. 719 */ 720 align_amount = ELF_386_MAXPGSZ; 721 } else { 722 int l = mmu.max_page_level; 723 724 while (l && len < LEVEL_SIZE(l)) 725 --l; 726 727 align_amount = LEVEL_SIZE(l); 728 } 729 730 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount)) 731 align_amount = (uintptr_t)*addrp; 732 733 len += align_amount; 734 735 /* 736 * Look for a large enough hole starting below userlimit. 737 * After finding it, use the upper part. Addition of PAGESIZE 738 * is for the redzone as described above. 739 */ 740 if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) { 741 caddr_t as_addr; 742 743 addr = base + slen - len + MMU_PAGESIZE; 744 as_addr = addr; 745 /* 746 * Round address DOWN to the alignment amount, 747 * add the offset, and if this address is less 748 * than the original address, add alignment amount. 
749 */ 750 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 751 addr += (uintptr_t)(off & (align_amount - 1)); 752 if (addr < as_addr) 753 addr += align_amount; 754 755 ASSERT(addr <= (as_addr + align_amount)); 756 ASSERT(((uintptr_t)addr & (align_amount - 1)) == 757 ((uintptr_t)(off & (align_amount - 1)))); 758 *addrp = addr; 759 } else { 760 *addrp = NULL; /* no more virtual space */ 761 } 762 } 763 764 /* 765 * Determine whether [base, base+len] contains a valid range of 766 * addresses at least minlen long. base and len are adjusted if 767 * required to provide a valid range. 768 */ 769 /*ARGSUSED3*/ 770 int 771 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 772 { 773 uintptr_t hi, lo; 774 775 lo = (uintptr_t)*basep; 776 hi = lo + *lenp; 777 778 /* 779 * If hi rolled over the top, try cutting back. 780 */ 781 if (hi < lo) { 782 if (0 - lo + hi < minlen) 783 return (0); 784 if (0 - lo < minlen) 785 return (0); 786 *lenp = 0 - lo; 787 } else if (hi - lo < minlen) { 788 return (0); 789 } 790 #if defined(__amd64) 791 /* 792 * Deal with a possible hole in the address range between 793 * hole_start and hole_end that should never be mapped. 
794 */ 795 if (lo < hole_start) { 796 if (hi > hole_start) { 797 if (hi < hole_end) { 798 hi = hole_start; 799 } else { 800 /* lo < hole_start && hi >= hole_end */ 801 if (dir == AH_LO) { 802 /* 803 * prefer lowest range 804 */ 805 if (hole_start - lo >= minlen) 806 hi = hole_start; 807 else if (hi - hole_end >= minlen) 808 lo = hole_end; 809 else 810 return (0); 811 } else { 812 /* 813 * prefer highest range 814 */ 815 if (hi - hole_end >= minlen) 816 lo = hole_end; 817 else if (hole_start - lo >= minlen) 818 hi = hole_start; 819 else 820 return (0); 821 } 822 } 823 } 824 } else { 825 /* lo >= hole_start */ 826 if (hi < hole_end) 827 return (0); 828 if (lo < hole_end) 829 lo = hole_end; 830 } 831 832 if (hi - lo < minlen) 833 return (0); 834 835 *basep = (caddr_t)lo; 836 *lenp = hi - lo; 837 #endif 838 return (1); 839 } 840 841 /* 842 * Determine whether [addr, addr+len] are valid user addresses. 843 */ 844 /*ARGSUSED*/ 845 int 846 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 847 caddr_t userlimit) 848 { 849 caddr_t eaddr = addr + len; 850 851 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 852 return (RANGE_BADADDR); 853 854 #if defined(__amd64) 855 /* 856 * Check for the VA hole 857 */ 858 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 859 return (RANGE_BADADDR); 860 #endif 861 862 return (RANGE_OKAY); 863 } 864 865 /* 866 * Return 1 if the page frame is onboard memory, else 0. 
867 */ 868 int 869 pf_is_memory(pfn_t pf) 870 { 871 if (pfn_is_foreign(pf)) 872 return (0); 873 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1)); 874 } 875 876 /* 877 * return the memrange containing pfn 878 */ 879 int 880 memrange_num(pfn_t pfn) 881 { 882 int n; 883 884 for (n = 0; n < nranges - 1; ++n) { 885 if (pfn >= memranges[n]) 886 break; 887 } 888 return (n); 889 } 890 891 /* 892 * return the mnoderange containing pfn 893 */ 894 /*ARGSUSED*/ 895 int 896 pfn_2_mtype(pfn_t pfn) 897 { 898 #if defined(__xpv) 899 return (0); 900 #else 901 int n; 902 903 for (n = mnoderangecnt - 1; n >= 0; n--) { 904 if (pfn >= mnoderanges[n].mnr_pfnlo) { 905 break; 906 } 907 } 908 return (n); 909 #endif 910 } 911 912 #if !defined(__xpv) 913 /* 914 * is_contigpage_free: 915 * returns a page list of contiguous pages. It minimally has to return 916 * minctg pages. Caller determines minctg based on the scatter-gather 917 * list length. 918 * 919 * pfnp is set to the next page frame to search on return. 920 */ 921 static page_t * 922 is_contigpage_free( 923 pfn_t *pfnp, 924 pgcnt_t *pgcnt, 925 pgcnt_t minctg, 926 uint64_t pfnseg, 927 int iolock) 928 { 929 int i = 0; 930 pfn_t pfn = *pfnp; 931 page_t *pp; 932 page_t *plist = NULL; 933 934 /* 935 * fail if pfn + minctg crosses a segment boundary. 936 * Adjust for next starting pfn to begin at segment boundary. 
937 */ 938 939 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 940 *pfnp = roundup(*pfnp, pfnseg + 1); 941 return (NULL); 942 } 943 944 do { 945 retry: 946 pp = page_numtopp_nolock(pfn + i); 947 if ((pp == NULL) || 948 (page_trylock(pp, SE_EXCL) == 0)) { 949 (*pfnp)++; 950 break; 951 } 952 if (page_pptonum(pp) != pfn + i) { 953 page_unlock(pp); 954 goto retry; 955 } 956 957 if (!(PP_ISFREE(pp))) { 958 page_unlock(pp); 959 (*pfnp)++; 960 break; 961 } 962 963 if (!PP_ISAGED(pp)) { 964 page_list_sub(pp, PG_CACHE_LIST); 965 page_hashout(pp, (kmutex_t *)NULL); 966 } else { 967 page_list_sub(pp, PG_FREE_LIST); 968 } 969 970 if (iolock) 971 page_io_lock(pp); 972 page_list_concat(&plist, &pp); 973 974 /* 975 * exit loop when pgcnt satisfied or segment boundary reached. 976 */ 977 978 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 979 980 *pfnp += i; /* set to next pfn to search */ 981 982 if (i >= minctg) { 983 *pgcnt -= i; 984 return (plist); 985 } 986 987 /* 988 * failure: minctg not satisfied. 989 * 990 * if next request crosses segment boundary, set next pfn 991 * to search from the segment boundary. 
992 */ 993 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 994 *pfnp = roundup(*pfnp, pfnseg + 1); 995 996 /* clean up any pages already allocated */ 997 998 while (plist) { 999 pp = plist; 1000 page_sub(&plist, pp); 1001 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 1002 if (iolock) 1003 page_io_unlock(pp); 1004 page_unlock(pp); 1005 } 1006 1007 return (NULL); 1008 } 1009 #endif /* !__xpv */ 1010 1011 /* 1012 * verify that pages being returned from allocator have correct DMA attribute 1013 */ 1014 #ifndef DEBUG 1015 #define check_dma(a, b, c) (0) 1016 #else 1017 static void 1018 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 1019 { 1020 if (dma_attr == NULL) 1021 return; 1022 1023 while (cnt-- > 0) { 1024 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) < 1025 dma_attr->dma_attr_addr_lo) 1026 panic("PFN (pp=%p) below dma_attr_addr_lo", pp); 1027 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >= 1028 dma_attr->dma_attr_addr_hi) 1029 panic("PFN (pp=%p) above dma_attr_addr_hi", pp); 1030 pp = pp->p_next; 1031 } 1032 } 1033 #endif 1034 1035 #if !defined(__xpv) 1036 static page_t * 1037 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 1038 { 1039 pfn_t pfn; 1040 int sgllen; 1041 uint64_t pfnseg; 1042 pgcnt_t minctg; 1043 page_t *pplist = NULL, *plist; 1044 uint64_t lo, hi; 1045 pgcnt_t pfnalign = 0; 1046 static pfn_t startpfn; 1047 static pgcnt_t lastctgcnt; 1048 uintptr_t align; 1049 1050 CONTIG_LOCK(); 1051 1052 if (mattr) { 1053 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 1054 hi = mmu_btop(mattr->dma_attr_addr_hi); 1055 if (hi >= physmax) 1056 hi = physmax - 1; 1057 sgllen = mattr->dma_attr_sgllen; 1058 pfnseg = mmu_btop(mattr->dma_attr_seg); 1059 1060 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 1061 if (align > MMU_PAGESIZE) 1062 pfnalign = mmu_btop(align); 1063 1064 /* 1065 * in order to satisfy the request, must minimally 1066 * acquire minctg contiguous pages 1067 */ 1068 minctg = howmany(*pgcnt, 
sgllen); 1069 1070 ASSERT(hi >= lo); 1071 1072 /* 1073 * start from where last searched if the minctg >= lastctgcnt 1074 */ 1075 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 1076 startpfn = lo; 1077 } else { 1078 hi = physmax - 1; 1079 lo = 0; 1080 sgllen = 1; 1081 pfnseg = mmu.highest_pfn; 1082 minctg = *pgcnt; 1083 1084 if (minctg < lastctgcnt) 1085 startpfn = lo; 1086 } 1087 lastctgcnt = minctg; 1088 1089 ASSERT(pfnseg + 1 >= (uint64_t)minctg); 1090 1091 /* conserve 16m memory - start search above 16m when possible */ 1092 if (hi > PFN_16M && startpfn < PFN_16M) 1093 startpfn = PFN_16M; 1094 1095 pfn = startpfn; 1096 if (pfnalign) 1097 pfn = P2ROUNDUP(pfn, pfnalign); 1098 1099 while (pfn + minctg - 1 <= hi) { 1100 1101 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1102 if (plist) { 1103 page_list_concat(&pplist, &plist); 1104 sgllen--; 1105 /* 1106 * return when contig pages no longer needed 1107 */ 1108 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1109 startpfn = pfn; 1110 CONTIG_UNLOCK(); 1111 check_dma(mattr, pplist, *pgcnt); 1112 return (pplist); 1113 } 1114 minctg = howmany(*pgcnt, sgllen); 1115 } 1116 if (pfnalign) 1117 pfn = P2ROUNDUP(pfn, pfnalign); 1118 } 1119 1120 /* cannot find contig pages in specified range */ 1121 if (startpfn == lo) { 1122 CONTIG_UNLOCK(); 1123 return (NULL); 1124 } 1125 1126 /* did not start with lo previously */ 1127 pfn = lo; 1128 if (pfnalign) 1129 pfn = P2ROUNDUP(pfn, pfnalign); 1130 1131 /* allow search to go above startpfn */ 1132 while (pfn < startpfn) { 1133 1134 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1135 if (plist != NULL) { 1136 1137 page_list_concat(&pplist, &plist); 1138 sgllen--; 1139 1140 /* 1141 * return when contig pages no longer needed 1142 */ 1143 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1144 startpfn = pfn; 1145 CONTIG_UNLOCK(); 1146 check_dma(mattr, pplist, *pgcnt); 1147 return (pplist); 1148 } 1149 minctg = howmany(*pgcnt, sgllen); 
1150 } 1151 if (pfnalign) 1152 pfn = P2ROUNDUP(pfn, pfnalign); 1153 } 1154 CONTIG_UNLOCK(); 1155 return (NULL); 1156 } 1157 #endif /* !__xpv */ 1158 1159 /* 1160 * mnode_range_cnt() calculates the number of memory ranges for mnode and 1161 * memranges[]. Used to determine the size of page lists and mnoderanges. 1162 */ 1163 int 1164 mnode_range_cnt(int mnode) 1165 { 1166 #if defined(__xpv) 1167 ASSERT(mnode == 0); 1168 return (1); 1169 #else /* __xpv */ 1170 int mri; 1171 int mnrcnt = 0; 1172 1173 if (mem_node_config[mnode].exists != 0) { 1174 mri = nranges - 1; 1175 1176 /* find the memranges index below contained in mnode range */ 1177 1178 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1179 mri--; 1180 1181 /* 1182 * increment mnode range counter when memranges or mnode 1183 * boundary is reached. 1184 */ 1185 while (mri >= 0 && 1186 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1187 mnrcnt++; 1188 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1189 mri--; 1190 else 1191 break; 1192 } 1193 } 1194 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 1195 return (mnrcnt); 1196 #endif /* __xpv */ 1197 } 1198 1199 /* 1200 * mnode_range_setup() initializes mnoderanges. 
1201 */ 1202 void 1203 mnode_range_setup(mnoderange_t *mnoderanges) 1204 { 1205 int mnode, mri; 1206 1207 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 1208 if (mem_node_config[mnode].exists == 0) 1209 continue; 1210 1211 mri = nranges - 1; 1212 1213 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1214 mri--; 1215 1216 while (mri >= 0 && mem_node_config[mnode].physmax >= 1217 MEMRANGELO(mri)) { 1218 mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri), 1219 mem_node_config[mnode].physbase); 1220 mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri), 1221 mem_node_config[mnode].physmax); 1222 mnoderanges->mnr_mnode = mnode; 1223 mnoderanges->mnr_memrange = mri; 1224 mnoderanges++; 1225 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1226 mri--; 1227 else 1228 break; 1229 } 1230 } 1231 } 1232 1233 /*ARGSUSED*/ 1234 int 1235 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) 1236 { 1237 int mtype = mnoderangecnt - 1; 1238 1239 #if !defined(__xpv) 1240 #if defined(__i386) 1241 /* 1242 * set the mtype range 1243 * - kmem requests needs to be below 4g if restricted_kmemalloc is set. 1244 * - for non kmem requests, set range to above 4g if memory below 4g 1245 * runs low. 
1246 */ 1247 if (restricted_kmemalloc && VN_ISKAS(vp) && 1248 (caddr_t)(vaddr) >= kernelheap && 1249 (caddr_t)(vaddr) < ekernelheap) { 1250 ASSERT(physmax4g); 1251 mtype = mtype4g; 1252 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz), 1253 btop(pgsz), *flags)) { 1254 *flags |= PGI_MT_RANGE16M; 1255 } else { 1256 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1257 VM_STAT_COND_ADD((*flags & PG_PANIC), 1258 vmm_vmstats.pgpanicalloc); 1259 *flags |= PGI_MT_RANGE0; 1260 } 1261 return (mtype); 1262 } 1263 #endif /* __i386 */ 1264 1265 if (RESTRICT4G_ALLOC) { 1266 VM_STAT_ADD(vmm_vmstats.restrict4gcnt); 1267 /* here only for > 4g systems */ 1268 *flags |= PGI_MT_RANGE4G; 1269 } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) { 1270 *flags |= PGI_MT_RANGE16M; 1271 } else { 1272 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1273 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc); 1274 *flags |= PGI_MT_RANGE0; 1275 } 1276 #endif /* !__xpv */ 1277 return (mtype); 1278 } 1279 1280 1281 /* mtype init for page_get_replacement_page */ 1282 /*ARGSUSED*/ 1283 int 1284 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt) 1285 { 1286 int mtype = mnoderangecnt - 1; 1287 #if !defined(__ixpv) 1288 if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) { 1289 *flags |= PGI_MT_RANGE16M; 1290 } else { 1291 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1292 *flags |= PGI_MT_RANGE0; 1293 } 1294 #endif 1295 return (mtype); 1296 } 1297 1298 /* 1299 * Determine if the mnode range specified in mtype contains memory belonging 1300 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1301 * the range of indices from high pfn to 0, 16m or 4g. 1302 * 1303 * Return first mnode range type index found otherwise return -1 if none found. 
1304 */ 1305 int 1306 mtype_func(int mnode, int mtype, uint_t flags) 1307 { 1308 if (flags & PGI_MT_RANGE) { 1309 int mtlim = 0; 1310 1311 if (flags & PGI_MT_NEXT) 1312 mtype--; 1313 if (flags & PGI_MT_RANGE4G) 1314 mtlim = mtype4g + 1; /* exclude 0-4g range */ 1315 else if (flags & PGI_MT_RANGE16M) 1316 mtlim = 1; /* exclude 0-16m range */ 1317 while (mtype >= mtlim) { 1318 if (mnoderanges[mtype].mnr_mnode == mnode) 1319 return (mtype); 1320 mtype--; 1321 } 1322 } else if (mnoderanges[mtype].mnr_mnode == mnode) { 1323 return (mtype); 1324 } 1325 return (-1); 1326 } 1327 1328 /* 1329 * Update the page list max counts with the pfn range specified by the 1330 * input parameters. Called from add_physmem() when physical memory with 1331 * page_t's are initially added to the page lists. 1332 */ 1333 void 1334 mtype_modify_max(pfn_t startpfn, long cnt) 1335 { 1336 int mtype = 0; 1337 pfn_t endpfn = startpfn + cnt, pfn; 1338 pgcnt_t inc; 1339 1340 ASSERT(cnt > 0); 1341 1342 if (!physmax4g) 1343 return; 1344 1345 for (pfn = startpfn; pfn < endpfn; ) { 1346 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1347 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1348 inc = endpfn - pfn; 1349 } else { 1350 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1351 } 1352 if (mtype <= mtype4g) 1353 maxmem4g += inc; 1354 pfn += inc; 1355 } 1356 mtype++; 1357 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1358 } 1359 } 1360 1361 int 1362 mtype_2_mrange(int mtype) 1363 { 1364 return (mnoderanges[mtype].mnr_memrange); 1365 } 1366 1367 void 1368 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 1369 { 1370 ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 1371 *pfnlo = mnoderanges[mtype].mnr_pfnlo; 1372 *pfnhi = mnoderanges[mtype].mnr_pfnhi; 1373 } 1374 1375 size_t 1376 plcnt_sz(size_t ctrs_sz) 1377 { 1378 #ifdef DEBUG 1379 int szc, colors; 1380 1381 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 1382 for (szc = 0; szc < mmu_page_sizes; szc++) { 1383 colors = 
page_get_pagecolors(szc); 1384 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 1385 } 1386 #endif 1387 return (ctrs_sz); 1388 } 1389 1390 caddr_t 1391 plcnt_init(caddr_t addr) 1392 { 1393 #ifdef DEBUG 1394 int mt, szc, colors; 1395 1396 for (mt = 0; mt < mnoderangecnt; mt++) { 1397 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 1398 addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 1399 for (szc = 0; szc < mmu_page_sizes; szc++) { 1400 colors = page_get_pagecolors(szc); 1401 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 1402 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 1403 (pgcnt_t *)addr; 1404 addr += (sizeof (pgcnt_t) * colors); 1405 } 1406 } 1407 #endif 1408 return (addr); 1409 } 1410 1411 void 1412 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 1413 { 1414 #ifdef DEBUG 1415 int bin = PP_2_BIN(pp); 1416 1417 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 1418 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 1419 cnt); 1420 #endif 1421 ASSERT(mtype == PP_2_MTYPE(pp)); 1422 if (physmax4g && mtype <= mtype4g) 1423 atomic_add_long(&freemem4g, cnt); 1424 if (flags & PG_CACHE_LIST) 1425 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 1426 else if (szc) 1427 atomic_add_long(&mnoderanges[mtype].mnr_mt_lgpgcnt, cnt); 1428 else 1429 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt, cnt); 1430 } 1431 1432 /* 1433 * Returns the free page count for mnode 1434 */ 1435 int 1436 mnode_pgcnt(int mnode) 1437 { 1438 int mtype = mnoderangecnt - 1; 1439 int flags = PGI_MT_RANGE0; 1440 pgcnt_t pgcnt = 0; 1441 1442 mtype = mtype_func(mnode, mtype, flags); 1443 1444 while (mtype != -1) { 1445 pgcnt += MTYPE_FREEMEM(mtype); 1446 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1447 } 1448 return (pgcnt); 1449 } 1450 1451 /* 1452 * Initialize page coloring variables based on the l2 cache parameters. 1453 * Calculate and return memory needed for page coloring data structures. 
*/
/*
 * l2_sz and l2_linesz must be powers of two and l2_sz must exceed
 * MMU_PAGESIZE (asserted below); l2_assoc is 0 for a fully associative
 * cache.
 *
 * Besides returning the size, this routine trims memranges/nranges, sets
 * physmax4g, l2_colors, page_colors, page_colors_mask, cpu_page_colors,
 * page_coloring_shift and mnoderangecnt, and fills in hw_page_array[] and
 * colorequivszc[].  The returned size matches what page_coloring_setup()
 * carves out of the buffer it is handed.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

#if defined(__xpv)
	/*
	 * Hypervisor domains currently don't have any concept of NUMA.
	 * Hence we'll act like there is only 1 memrange.
	 */
	i = memrange_num(1);
#else	/* !__xpv */
	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory. This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;
#endif	/* !__xpv */
	/* drop the first i entries of memranges[] */
	memranges += i;
	nranges -= i;

	ASSERT(ISP2(l2_sz));
	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* initialize number of colors per page size */
	for (i = 0; i <= mmu.max_page_level; i++) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
		/* colors shrink by the size ratio to the base page */
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
		colorequivszc[i] = 0;
	}

	/*
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_page_colors != 0) {

		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		/*
		 * 'a' is shared across page sizes and is only ever reduced,
		 * so a larger page size never gets a bigger shift than a
		 * smaller one processed before it.
		 */
		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				colorequivszc[i] = 0;
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encodes color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* factor in colorequiv to check additional 'equivalent' bins. */
	if (colorequiv > 1) {

		int a = lowbit(colorequiv) - 1;
		/* the encoded value must fit in 4 bits */
		if (a > 15)
			a = 15;

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			/* keep whichever of the two encodings is larger */
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}

	/* size for mnoderanges */
	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
		mnoderangecnt += mnode_range_cnt(i);
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
*/
/*
 * pcmemaddr is carved up, in order, into: mnoderanges[], the fpc_mutex and
 * cpc_mutex arrays, the page_freelists and page_cachelists top level
 * arrays, and then per mnoderange the per page size free list arrays, the
 * per color free list heads and the per color cache list heads.  The amount
 * consumed equals the size computed by page_coloring_init().
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	/* presumably 0xfffff is the last pfn below 4g with 4k pages */
	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

#if defined(__xpv)
/*
 * Give back 10% of the io_pool pages to the free list.
 * Don't shrink the pool below some absolute minimum.
 *
 * Pages are taken from the tail of io_pool_4g first and, if that does not
 * yield enough, from the tail of io_pool_16m.  The head page of a pool is
 * never released by this routine.  The whole operation runs under
 * io_pool_lock.
 */
static void
page_io_pool_shrink()
{
	int	retcnt;
	page_t	*pp, *pp_first, *pp_last, **curpool;
	mfn_t	mfn;
	int	bothpools = 0;

	mutex_enter(&io_pool_lock);
	io_pool_shrink_attempts++;	/* should be a kstat? */
	retcnt = io_pool_cnt / 10;
	/* clamp so the pool keeps at least io_pool_cnt_min pages */
	if (io_pool_cnt - retcnt < io_pool_cnt_min)
		retcnt = io_pool_cnt - io_pool_cnt_min;
	if (retcnt <= 0)
		goto done;
	io_pool_shrinks++;	/* should be a kstat? */
	curpool = &io_pool_4g;
domore:
	/*
	 * Loop through taking pages from the end of the list
	 * (highest mfns) till amount to return reached.
	 */
	for (pp = *curpool; pp && retcnt > 0; ) {
		pp_first = pp_last = pp->p_prev;
		/* only the head is left; leave it in the pool */
		if (pp_first == *curpool)
			break;
		retcnt--;
		io_pool_cnt--;
		page_io_pool_sub(curpool, pp_first, pp_last);
		/*
		 * NOTE(review): the mfn examined here is that of the pool
		 * head (pp), not of the page being released (pp_first).
		 * Since the pool is ordered by increasing mfn this lowers
		 * start_mfn at least as far as the released page would;
		 * confirm whether pp_first was intended.
		 */
		if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
			start_mfn = mfn;
		page_free(pp_first, 1);
		pp = *curpool;
	}
	if (retcnt != 0 && !bothpools) {
		/*
		 * If not enough found in less constrained pool try the
		 * more constrained one.
		 */
		curpool = &io_pool_16m;
		bothpools = 1;
		goto domore;
	}
done:
	mutex_exit(&io_pool_lock);
}

#endif	/* __xpv */

/*
 * Platform hook that may adjust page creation flags; returns the flags to
 * use.  Under __xpv the flags come back unchanged, but the io_pool is shrunk
 * as a side effect when the caller cannot wait and freemem is below desfree.
 */
uint_t
page_create_update_flags_x86(uint_t flags)
{
#if defined(__xpv)
	/*
	 * Check this is an urgent allocation and free pages are depleted.
	 */
	if (!(flags & PG_WAIT) && freemem < desfree)
		page_io_pool_shrink();
#else	/* !__xpv */
	/*
	 * page_create_get_something may call this because 4g memory may be
	 * depleted. Set flags to allow for relocation of base page below
	 * 4g if necessary.
	 */
	if (physmax4g)
		flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
#endif	/* __xpv */
	return (flags);
}

/*
 * Always reports color 0 for a buf on this platform.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

#if defined(__xpv)

/*
 * Take pages out of an io_pool
 *
 * pp_first..pp_last is a run of consecutive pages on the circular list
 * *poolp.  On return the run has been unlinked and closed into its own
 * circular list, and *poolp has been advanced past the run (or set to NULL
 * if the run was the whole pool) when it pointed at pp_first.
 */
static void
page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
{
	if (*poolp == pp_first) {
		*poolp = pp_last->p_next;
		/* wrapped back to the run itself: nothing else in pool */
		if (*poolp == pp_first)
			*poolp = NULL;
	}
	/* splice the neighbours together ... */
	pp_first->p_prev->p_next = pp_last->p_next;
	pp_last->p_next->p_prev = pp_first->p_prev;
	/* ... and close the removed run into a ring of its own */
	pp_first->p_prev = pp_last;
	pp_last->p_next = pp_first;
}

/*
 * Put a page on the io_pool list. The list is ordered by increasing MFN.
 */
static void
page_io_pool_add(page_t **poolp, page_t *pp)
{
	page_t	*look;
	mfn_t	mfn = mfn_list[pp->p_pagenum];

	/* empty pool: pp becomes a one element ring */
	if (*poolp == NULL) {
		*poolp = pp;
		pp->p_next = pp;
		pp->p_prev = pp;
		return;
	}

	/*
	 * Since we try to take pages from the high end of the pool
	 * chances are good that the pages to be put on the list will
	 * go at or near the end of the list. so start at the end and
	 * work backwards.
	 */
	look = (*poolp)->p_prev;
	while (mfn < mfn_list[look->p_pagenum]) {
		look = look->p_prev;
		if (look == (*poolp)->p_prev)
			break; /* backed all the way to front of list */
	}

	/* insert after look */
	pp->p_prev = look;
	pp->p_next = look->p_next;
	pp->p_next->p_prev = pp;
	look->p_next = pp;
	if (mfn < mfn_list[(*poolp)->p_pagenum]) {
		/*
		 * we inserted a new first list element
		 * adjust pool pointer to newly inserted element
		 */
		*poolp = pp;
	}
}

/*
 * Add a page to the io_pool.  Setting the force flag will force the page
 * into the io_pool no matter what.
1791 */ 1792 static void 1793 add_page_to_pool(page_t *pp, int force) 1794 { 1795 page_t *highest; 1796 page_t *freep = NULL; 1797 1798 mutex_enter(&io_pool_lock); 1799 /* 1800 * Always keep the scarce low memory pages 1801 */ 1802 if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 1803 ++io_pool_cnt; 1804 page_io_pool_add(&io_pool_16m, pp); 1805 goto done; 1806 } 1807 if (io_pool_cnt < io_pool_cnt_max || force) { 1808 ++io_pool_cnt; 1809 page_io_pool_add(&io_pool_4g, pp); 1810 } else { 1811 highest = io_pool_4g->p_prev; 1812 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 1813 page_io_pool_sub(&io_pool_4g, highest, highest); 1814 page_io_pool_add(&io_pool_4g, pp); 1815 freep = highest; 1816 } else { 1817 freep = pp; 1818 } 1819 } 1820 done: 1821 mutex_exit(&io_pool_lock); 1822 if (freep) 1823 page_free(freep, 1); 1824 } 1825 1826 1827 int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 1828 int contig_pfn_max; /* capacity of the contig pfn list */ 1829 int next_alloc_pfn; /* next position in list to start a contig search */ 1830 int contig_pfnlist_updates; /* pfn list update count */ 1831 int contig_pfnlist_locked; /* contig pfn list locked against use */ 1832 int contig_pfnlist_builds; /* how many times have we (re)built list */ 1833 int contig_pfnlist_buildfailed; /* how many times has list build failed */ 1834 int create_contig_pending; /* nonzero means taskq creating contig list */ 1835 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 1836 1837 /* 1838 * Function to use in sorting a list of pfns by their underlying mfns. 1839 */ 1840 static int 1841 mfn_compare(const void *pfnp1, const void *pfnp2) 1842 { 1843 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 1844 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 1845 1846 if (mfn1 > mfn2) 1847 return (1); 1848 if (mfn1 < mfn2) 1849 return (-1); 1850 return (0); 1851 } 1852 1853 /* 1854 * Compact the contig_pfn_list by tossing all the non-contiguous 1855 * elements from the list. 
1856 */ 1857 static void 1858 compact_contig_pfn_list(void) 1859 { 1860 pfn_t pfn, lapfn, prev_lapfn; 1861 mfn_t mfn; 1862 int i, newcnt = 0; 1863 1864 prev_lapfn = 0; 1865 for (i = 0; i < contig_pfn_cnt - 1; i++) { 1866 pfn = contig_pfn_list[i]; 1867 lapfn = contig_pfn_list[i + 1]; 1868 mfn = mfn_list[pfn]; 1869 /* 1870 * See if next pfn is for a contig mfn 1871 */ 1872 if (mfn_list[lapfn] != mfn + 1) 1873 continue; 1874 /* 1875 * pfn and lookahead are both put in list 1876 * unless pfn is the previous lookahead. 1877 */ 1878 if (pfn != prev_lapfn) 1879 contig_pfn_list[newcnt++] = pfn; 1880 contig_pfn_list[newcnt++] = lapfn; 1881 prev_lapfn = lapfn; 1882 } 1883 for (i = newcnt; i < contig_pfn_cnt; i++) 1884 contig_pfn_list[i] = 0; 1885 contig_pfn_cnt = newcnt; 1886 } 1887 1888 /*ARGSUSED*/ 1889 static void 1890 call_create_contiglist(void *arg) 1891 { 1892 mutex_enter(&io_pool_lock); 1893 (void) create_contig_pfnlist(PG_WAIT); 1894 create_contig_pending = 0; 1895 mutex_exit(&io_pool_lock); 1896 } 1897 1898 /* 1899 * Create list of freelist pfns that have underlying 1900 * contiguous mfns. The list is kept in ascending mfn order. 1901 * returns 1 if list created else 0. 1902 */ 1903 static int 1904 create_contig_pfnlist(uint_t flags) 1905 { 1906 pfn_t pfn; 1907 page_t *pp; 1908 1909 if (contig_pfn_list != NULL) 1910 return (1); 1911 ASSERT(!contig_pfnlist_locked); 1912 contig_pfn_max = freemem + (freemem / 10); 1913 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 1914 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 1915 if (contig_pfn_list == NULL) { 1916 /* 1917 * If we could not create the contig list (because 1918 * we could not sleep for memory). Dispatch a taskq that can 1919 * sleep to get the memory. 
1920 */ 1921 if (!create_contig_pending) { 1922 if (taskq_dispatch(system_taskq, call_create_contiglist, 1923 NULL, TQ_NOSLEEP) != NULL) 1924 create_contig_pending = 1; 1925 } 1926 contig_pfnlist_buildfailed++; /* count list build failures */ 1927 return (0); 1928 } 1929 ASSERT(contig_pfn_cnt == 0); 1930 for (pfn = 0; pfn < mfn_count; pfn++) { 1931 pp = page_numtopp_nolock(pfn); 1932 if (pp == NULL || !PP_ISFREE(pp)) 1933 continue; 1934 contig_pfn_list[contig_pfn_cnt] = pfn; 1935 if (++contig_pfn_cnt == contig_pfn_max) 1936 break; 1937 } 1938 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 1939 compact_contig_pfn_list(); 1940 /* 1941 * Make sure next search of the newly created contiguous pfn 1942 * list starts at the beginning of the list. 1943 */ 1944 next_alloc_pfn = 0; 1945 contig_pfnlist_builds++; /* count list builds */ 1946 return (1); 1947 } 1948 1949 1950 /* 1951 * Toss the current contig pfnlist. Someone is about to do a massive 1952 * update to pfn<->mfn mappings. So we have them destroy the list and lock 1953 * it till they are done with their update. 1954 */ 1955 void 1956 clear_and_lock_contig_pfnlist() 1957 { 1958 pfn_t *listp = NULL; 1959 size_t listsize; 1960 1961 mutex_enter(&io_pool_lock); 1962 ASSERT(!contig_pfnlist_locked); 1963 if (contig_pfn_list != NULL) { 1964 listp = contig_pfn_list; 1965 listsize = contig_pfn_max * sizeof (pfn_t); 1966 contig_pfn_list = NULL; 1967 contig_pfn_max = contig_pfn_cnt = 0; 1968 } 1969 contig_pfnlist_locked = 1; 1970 mutex_exit(&io_pool_lock); 1971 if (listp != NULL) 1972 kmem_free(listp, listsize); 1973 } 1974 1975 /* 1976 * Unlock the contig_pfn_list. The next attempted use of it will cause 1977 * it to be re-created. 
1978 */ 1979 void 1980 unlock_contig_pfnlist() 1981 { 1982 mutex_enter(&io_pool_lock); 1983 ASSERT(contig_pfnlist_locked); 1984 contig_pfnlist_locked = 0; 1985 mutex_exit(&io_pool_lock); 1986 } 1987 1988 /* 1989 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 1990 */ 1991 void 1992 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 1993 { 1994 int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 1995 pfn_t probe_pfn; 1996 mfn_t probe_mfn; 1997 1998 if (contig_pfn_list == NULL) 1999 return; 2000 mutex_enter(&io_pool_lock); 2001 contig_pfnlist_updates++; 2002 /* 2003 * Find the pfn in the current list. Use a binary chop to locate it. 2004 */ 2005 probe_hi = contig_pfn_cnt - 1; 2006 probe_lo = 0; 2007 probe_pos = (probe_hi + probe_lo) / 2; 2008 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 2009 if (probe_pos == probe_lo) { /* pfn not in list */ 2010 probe_pos = -1; 2011 break; 2012 } 2013 if (pfn_to_mfn(probe_pfn) <= oldmfn) 2014 probe_lo = probe_pos; 2015 else 2016 probe_hi = probe_pos; 2017 probe_pos = (probe_hi + probe_lo) / 2; 2018 } 2019 if (probe_pos >= 0) { /* remove pfn fom list */ 2020 contig_pfn_cnt--; 2021 ovbcopy(&contig_pfn_list[probe_pos + 1], 2022 &contig_pfn_list[probe_pos], 2023 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 2024 } 2025 if (newmfn == MFN_INVALID) 2026 goto done; 2027 /* 2028 * Check if new mfn has adjacent mfns in the list 2029 */ 2030 probe_hi = contig_pfn_cnt - 1; 2031 probe_lo = 0; 2032 insert_after = -2; 2033 do { 2034 probe_pos = (probe_hi + probe_lo) / 2; 2035 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 2036 if (newmfn == probe_mfn + 1) 2037 insert_after = probe_pos; 2038 else if (newmfn == probe_mfn - 1) 2039 insert_after = probe_pos - 1; 2040 if (probe_pos == probe_lo) 2041 break; 2042 if (probe_mfn <= newmfn) 2043 probe_lo = probe_pos; 2044 else 2045 probe_hi = probe_pos; 2046 } while (insert_after == -2); 2047 /* 2048 * If there is space in the list and 
there are adjacent mfns 2049 * insert the pfn in to its proper place in the list. 2050 */ 2051 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 2052 insert_point = insert_after + 1; 2053 ovbcopy(&contig_pfn_list[insert_point], 2054 &contig_pfn_list[insert_point + 1], 2055 (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 2056 contig_pfn_list[insert_point] = pfn; 2057 contig_pfn_cnt++; 2058 } 2059 done: 2060 mutex_exit(&io_pool_lock); 2061 } 2062 2063 /* 2064 * Called to (re-)populate the io_pool from the free page lists. 2065 */ 2066 long 2067 populate_io_pool(void) 2068 { 2069 pfn_t pfn; 2070 mfn_t mfn, max_mfn; 2071 page_t *pp; 2072 2073 /* 2074 * Figure out the bounds of the pool on first invocation. 2075 * We use a percentage of memory for the io pool size. 2076 * we allow that to shrink, but not to less than a fixed minimum 2077 */ 2078 if (io_pool_cnt_max == 0) { 2079 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 2080 io_pool_cnt_lowater = io_pool_cnt_max; 2081 /* 2082 * This is the first time in populate_io_pool, grab a va to use 2083 * when we need to allocate pages. 2084 */ 2085 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 2086 } 2087 /* 2088 * If we are out of pages in the pool, then grow the size of the pool 2089 */ 2090 if (io_pool_cnt == 0) 2091 io_pool_cnt_max += io_pool_cnt_max / 20; /* grow by 5% */ 2092 io_pool_grows++; /* should be a kstat? */ 2093 2094 /* 2095 * Get highest mfn on this platform, but limit to the 32 bit DMA max. 
2096 */ 2097 (void) mfn_to_pfn(start_mfn); 2098 max_mfn = MIN(cached_max_mfn, PFN_4GIG); 2099 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 2100 pfn = mfn_to_pfn(mfn); 2101 if (pfn & PFN_IS_FOREIGN_MFN) 2102 continue; 2103 /* 2104 * try to allocate it from free pages 2105 */ 2106 pp = page_numtopp_alloc(pfn); 2107 if (pp == NULL) 2108 continue; 2109 PP_CLRFREE(pp); 2110 add_page_to_pool(pp, 1); 2111 if (io_pool_cnt >= io_pool_cnt_max) 2112 break; 2113 } 2114 2115 return (io_pool_cnt); 2116 } 2117 2118 /* 2119 * Destroy a page that was being used for DMA I/O. It may or 2120 * may not actually go back to the io_pool. 2121 */ 2122 void 2123 page_destroy_io(page_t *pp) 2124 { 2125 mfn_t mfn = mfn_list[pp->p_pagenum]; 2126 2127 /* 2128 * When the page was alloc'd a reservation was made, release it now 2129 */ 2130 page_unresv(1); 2131 /* 2132 * Unload translations, if any, then hash out the 2133 * page to erase its identity. 2134 */ 2135 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2136 page_hashout(pp, NULL); 2137 2138 /* 2139 * If the page came from the free lists, just put it back to them. 2140 * DomU pages always go on the free lists as well. 2141 */ 2142 if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 2143 page_free(pp, 1); 2144 return; 2145 } 2146 2147 add_page_to_pool(pp, 0); 2148 } 2149 2150 2151 long contig_searches; /* count of times contig pages requested */ 2152 long contig_search_restarts; /* count of contig ranges tried */ 2153 long contig_search_failed; /* count of contig alloc failures */ 2154 2155 /* 2156 * Look thru the contiguous pfns that are not part of the io_pool for 2157 * contiguous free pages. Return a list of the found pages or NULL. 
2158 */ 2159 page_t * 2160 find_contig_free(uint_t bytes, uint_t flags) 2161 { 2162 page_t *pp, *plist = NULL; 2163 mfn_t mfn, prev_mfn; 2164 pfn_t pfn; 2165 int pages_needed, pages_requested; 2166 int search_start; 2167 2168 /* 2169 * create the contig pfn list if not already done 2170 */ 2171 if (contig_pfn_list == NULL) { 2172 if (contig_pfnlist_locked) { 2173 return (NULL); 2174 } else { 2175 if (!create_contig_pfnlist(flags)) 2176 return (NULL); 2177 } 2178 } 2179 contig_searches++; 2180 /* 2181 * Search contiguous pfn list for physically contiguous pages not in 2182 * the io_pool. Start the search where the last search left off. 2183 */ 2184 pages_requested = pages_needed = mmu_btop(bytes); 2185 search_start = next_alloc_pfn; 2186 prev_mfn = 0; 2187 while (pages_needed) { 2188 pfn = contig_pfn_list[next_alloc_pfn]; 2189 mfn = pfn_to_mfn(pfn); 2190 if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 2191 (pp = page_numtopp_alloc(pfn)) != NULL) { 2192 PP_CLRFREE(pp); 2193 page_io_pool_add(&plist, pp); 2194 pages_needed--; 2195 prev_mfn = mfn; 2196 } else { 2197 contig_search_restarts++; 2198 /* 2199 * free partial page list 2200 */ 2201 while (plist != NULL) { 2202 pp = plist; 2203 page_io_pool_sub(&plist, pp, pp); 2204 page_free(pp, 1); 2205 } 2206 pages_needed = pages_requested; 2207 prev_mfn = 0; 2208 } 2209 if (++next_alloc_pfn == contig_pfn_cnt) 2210 next_alloc_pfn = 0; 2211 if (next_alloc_pfn == search_start) 2212 break; /* all pfns searched */ 2213 } 2214 if (pages_needed) { 2215 contig_search_failed++; 2216 /* 2217 * Failed to find enough contig pages. 2218 * free partial page list 2219 */ 2220 while (plist != NULL) { 2221 pp = plist; 2222 page_io_pool_sub(&plist, pp, pp); 2223 page_free(pp, 1); 2224 } 2225 } 2226 return (plist); 2227 } 2228 2229 /* 2230 * Allocator for domain 0 I/O pages. We match the required 2231 * DMA attributes and contiguity constraints. 
2232 */ 2233 /*ARGSUSED*/ 2234 page_t * 2235 page_create_io( 2236 struct vnode *vp, 2237 u_offset_t off, 2238 uint_t bytes, 2239 uint_t flags, 2240 struct as *as, 2241 caddr_t vaddr, 2242 ddi_dma_attr_t *mattr) 2243 { 2244 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2245 page_t *pp_first; /* list to return */ 2246 page_t *pp_last; /* last in list to return */ 2247 page_t *pp, **poolp, **pplist = NULL, *expp; 2248 int i, extpages = 0, npages = 0, contig, anyaddr, extra; 2249 mfn_t lo_mfn; 2250 mfn_t hi_mfn; 2251 mfn_t mfn, tmfn; 2252 mfn_t *mfnlist = 0; 2253 pgcnt_t pfnalign = 0; 2254 int align, order, nbits, extents; 2255 uint64_t pfnseg; 2256 int attempt = 0, is_domu = 0; 2257 int asked_hypervisor = 0; 2258 uint_t kflags; 2259 2260 ASSERT(mattr != NULL); 2261 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2262 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2263 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2264 if (align > MMU_PAGESIZE) 2265 pfnalign = mmu_btop(align); 2266 pfnseg = mmu_btop(mattr->dma_attr_seg); 2267 2268 /* 2269 * Clear the contig flag if only one page is needed. 2270 */ 2271 contig = (flags & PG_PHYSCONTIG); 2272 flags &= ~PG_PHYSCONTIG; 2273 bytes = P2ROUNDUP(bytes, MMU_PAGESIZE); 2274 if (bytes == MMU_PAGESIZE) 2275 contig = 0; 2276 2277 /* 2278 * Check if any old page in the system is fine. 2279 * DomU should always go down this path. 2280 */ 2281 is_domu = !DOMAIN_IS_INITDOMAIN(xen_info); 2282 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 2283 if ((!contig && anyaddr) || is_domu) { 2284 pp = page_create_va(vp, off, bytes, flags, &kvseg, vaddr); 2285 if (pp) 2286 return (pp); 2287 else if (is_domu) 2288 return (NULL); /* no memory available */ 2289 } 2290 /* 2291 * DomU should never reach here 2292 */ 2293 try_again: 2294 /* 2295 * We could just want unconstrained but contig pages. 
2296 */ 2297 if (anyaddr && contig && pfnseg >= max_mfn) { 2298 /* 2299 * Look for free contig pages to satisfy the request. 2300 */ 2301 mutex_enter(&io_pool_lock); 2302 pp_first = find_contig_free(bytes, flags); 2303 mutex_exit(&io_pool_lock); 2304 if (pp_first != NULL) 2305 goto done; 2306 } 2307 /* 2308 * See if we want pages for a legacy device 2309 */ 2310 if (hi_mfn < PFN_16MEG) 2311 poolp = &io_pool_16m; 2312 else 2313 poolp = &io_pool_4g; 2314 try_smaller: 2315 /* 2316 * Take pages from I/O pool. We'll use pages from the highest MFN 2317 * range possible. 2318 */ 2319 pp_first = pp_last = NULL; 2320 npages = mmu_btop(bytes); 2321 mutex_enter(&io_pool_lock); 2322 for (pp = *poolp; pp && npages > 0; ) { 2323 pp = pp->p_prev; 2324 2325 /* 2326 * skip pages above allowable range 2327 */ 2328 mfn = mfn_list[pp->p_pagenum]; 2329 if (hi_mfn < mfn) 2330 goto skip; 2331 2332 /* 2333 * stop at pages below allowable range 2334 */ 2335 if (lo_mfn > mfn) 2336 break; 2337 restart: 2338 if (pp_last == NULL) { 2339 /* 2340 * Check alignment 2341 */ 2342 tmfn = mfn - (npages - 1); 2343 if (pfnalign) { 2344 if (tmfn != P2ROUNDUP(tmfn, pfnalign)) 2345 goto skip; /* not properly aligned */ 2346 } 2347 /* 2348 * Check segment 2349 */ 2350 if ((mfn & pfnseg) < (tmfn & pfnseg)) 2351 goto skip; /* crosses segment boundary */ 2352 /* 2353 * Start building page list 2354 */ 2355 pp_first = pp_last = pp; 2356 npages--; 2357 } else { 2358 /* 2359 * check physical contiguity if required 2360 */ 2361 if (contig && 2362 mfn_list[pp_first->p_pagenum] != mfn + 1) { 2363 /* 2364 * not a contiguous page, restart list. 2365 */ 2366 pp_last = NULL; 2367 npages = mmu_btop(bytes); 2368 goto restart; 2369 } else { /* add page to list */ 2370 pp_first = pp; 2371 --npages; 2372 } 2373 } 2374 skip: 2375 if (pp == *poolp) 2376 break; 2377 } 2378 2379 /* 2380 * If we didn't find memory. Try the more constrained pool, then 2381 * sweep free pages into the DMA pool and try again. 
If we fail 2382 * repeatedly, ask the Hypervisor for help. 2383 */ 2384 if (npages != 0) { 2385 mutex_exit(&io_pool_lock); 2386 /* 2387 * If we were looking in the less constrained pool and didn't 2388 * find pages, try the more constrained pool. 2389 */ 2390 if (poolp == &io_pool_4g) { 2391 poolp = &io_pool_16m; 2392 goto try_smaller; 2393 } 2394 kmem_reap(); 2395 if (++attempt < 4) { 2396 /* 2397 * Grab some more io_pool pages 2398 */ 2399 (void) populate_io_pool(); 2400 goto try_again; 2401 } 2402 2403 if (asked_hypervisor++) 2404 return (NULL); /* really out of luck */ 2405 /* 2406 * Hypervisor exchange doesn't handle segment or alignment 2407 * constraints 2408 */ 2409 if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || pfnalign) 2410 return (NULL); 2411 /* 2412 * Try exchanging pages with the hypervisor. 2413 */ 2414 npages = mmu_btop(bytes); 2415 kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP; 2416 /* 2417 * Hypervisor will allocate extents, if we want contig pages 2418 * extent must be >= npages 2419 */ 2420 if (contig) { 2421 order = highbit(npages) - 1; 2422 if (npages & ((1 << order) - 1)) 2423 order++; 2424 extpages = 1 << order; 2425 } else { 2426 order = 0; 2427 extpages = npages; 2428 } 2429 if (extpages > npages) { 2430 extra = extpages - npages; 2431 if (!page_resv(extra, kflags)) 2432 return (NULL); 2433 } 2434 pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 2435 if (pplist == NULL) 2436 goto fail; 2437 mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 2438 if (mfnlist == NULL) 2439 goto fail; 2440 pp = page_create_va(vp, off, npages * PAGESIZE, flags, 2441 &kvseg, vaddr); 2442 if (pp == NULL) 2443 goto fail; 2444 pp_first = pp; 2445 if (extpages > npages) { 2446 /* 2447 * fill out the rest of extent pages to swap with the 2448 * hypervisor 2449 */ 2450 for (i = 0; i < extra; i++) { 2451 expp = page_create_va(vp, 2452 (u_offset_t)(uintptr_t)io_pool_kva, 2453 PAGESIZE, flags, &kvseg, io_pool_kva); 2454 if (expp == NULL) 2455 
goto balloon_fail; 2456 (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 2457 page_io_unlock(expp); 2458 page_hashout(expp, NULL); 2459 page_io_lock(expp); 2460 /* 2461 * add page to end of list 2462 */ 2463 expp->p_prev = pp_first->p_prev; 2464 expp->p_next = pp_first; 2465 expp->p_prev->p_next = expp; 2466 pp_first->p_prev = expp; 2467 } 2468 2469 } 2470 for (i = 0; i < extpages; i++) { 2471 pplist[i] = pp; 2472 pp = pp->p_next; 2473 } 2474 nbits = highbit(mattr->dma_attr_addr_hi); 2475 extents = contig ? 1 : npages; 2476 if (balloon_replace_pages(extents, pplist, nbits, order, 2477 mfnlist) != extents) 2478 goto balloon_fail; 2479 2480 kmem_free(pplist, extpages * sizeof (page_t *)); 2481 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2482 /* 2483 * Return any excess pages to free list 2484 */ 2485 if (extpages > npages) { 2486 for (i = 0; i < extra; i++) { 2487 pp = pp_first->p_prev; 2488 page_sub(&pp_first, pp); 2489 page_io_unlock(pp); 2490 page_unresv(1); 2491 page_free(pp, 1); 2492 } 2493 } 2494 check_dma(mattr, pp_first, mmu_btop(bytes)); 2495 return (pp_first); 2496 } 2497 2498 /* 2499 * Found the pages, now snip them from the list 2500 */ 2501 page_io_pool_sub(poolp, pp_first, pp_last); 2502 io_pool_cnt -= mmu_btop(bytes); 2503 if (io_pool_cnt < io_pool_cnt_lowater) 2504 io_pool_cnt_lowater = io_pool_cnt; /* io pool low water mark */ 2505 mutex_exit(&io_pool_lock); 2506 done: 2507 check_dma(mattr, pp_first, mmu_btop(bytes)); 2508 pp = pp_first; 2509 do { 2510 if (!page_hashin(pp, vp, off, NULL)) { 2511 panic("pg_create_io: hashin failed pp %p, vp %p," 2512 " off %llx", 2513 (void *)pp, (void *)vp, off); 2514 } 2515 off += MMU_PAGESIZE; 2516 PP_CLRFREE(pp); 2517 PP_CLRAGED(pp); 2518 page_set_props(pp, P_REF); 2519 page_io_lock(pp); 2520 pp = pp->p_next; 2521 } while (pp != pp_first); 2522 return (pp_first); 2523 balloon_fail: 2524 /* 2525 * Return pages to free list and return failure 2526 */ 2527 while (pp_first != NULL) { 2528 pp = pp_first; 2529 
page_sub(&pp_first, pp); 2530 page_io_unlock(pp); 2531 if (pp->p_vnode != NULL) 2532 page_hashout(pp, NULL); 2533 page_free(pp, 1); 2534 } 2535 fail: 2536 if (pplist) 2537 kmem_free(pplist, extpages * sizeof (page_t *)); 2538 if (mfnlist) 2539 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2540 page_unresv(extpages - npages); 2541 return (NULL); 2542 } 2543 2544 /* 2545 * Lock and return the page with the highest mfn that we can find. last_mfn 2546 * holds the last one found, so the next search can start from there. We 2547 * also keep a counter so that we don't loop forever if the machine has no 2548 * free pages. 2549 * 2550 * This is called from the balloon thread to find pages to give away. new_high 2551 * is used when new mfn's have been added to the system - we will reset our 2552 * search if the new mfn's are higher than our current search position. 2553 */ 2554 page_t * 2555 page_get_high_mfn(mfn_t new_high) 2556 { 2557 static mfn_t last_mfn = 0; 2558 pfn_t pfn; 2559 page_t *pp; 2560 ulong_t loop_count = 0; 2561 2562 if (new_high > last_mfn) 2563 last_mfn = new_high; 2564 2565 for (; loop_count < mfn_count; loop_count++, last_mfn--) { 2566 if (last_mfn == 0) { 2567 last_mfn = cached_max_mfn; 2568 } 2569 2570 pfn = mfn_to_pfn(last_mfn); 2571 if (pfn & PFN_IS_FOREIGN_MFN) 2572 continue; 2573 2574 /* See if the page is free. If so, lock it. 
*/ 2575 pp = page_numtopp_alloc(pfn); 2576 if (pp == NULL) 2577 continue; 2578 PP_CLRFREE(pp); 2579 2580 ASSERT(PAGE_EXCL(pp)); 2581 ASSERT(pp->p_vnode == NULL); 2582 ASSERT(!hat_page_is_mapped(pp)); 2583 last_mfn--; 2584 return (pp); 2585 } 2586 return (NULL); 2587 } 2588 2589 #else /* !__xpv */ 2590 2591 /* 2592 * get a page from any list with the given mnode 2593 */ 2594 static page_t * 2595 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 2596 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 2597 { 2598 kmutex_t *pcm; 2599 int i; 2600 page_t *pp; 2601 page_t *first_pp; 2602 uint64_t pgaddr; 2603 ulong_t bin; 2604 int mtypestart; 2605 int plw_initialized; 2606 page_list_walker_t plw; 2607 2608 VM_STAT_ADD(pga_vmstats.pgma_alloc); 2609 2610 ASSERT((flags & PG_MATCH_COLOR) == 0); 2611 ASSERT(szc == 0); 2612 ASSERT(dma_attr != NULL); 2613 2614 MTYPE_START(mnode, mtype, flags); 2615 if (mtype < 0) { 2616 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 2617 return (NULL); 2618 } 2619 2620 mtypestart = mtype; 2621 2622 bin = origbin; 2623 2624 /* 2625 * check up to page_colors + 1 bins - origbin may be checked twice 2626 * because of BIN_STEP skip 2627 */ 2628 do { 2629 plw_initialized = 0; 2630 2631 for (plw.plw_count = 0; 2632 plw.plw_count < page_colors; plw.plw_count++) { 2633 2634 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 2635 goto nextfreebin; 2636 2637 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2638 mutex_enter(pcm); 2639 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2640 first_pp = pp; 2641 while (pp != NULL) { 2642 if (page_trylock(pp, SE_EXCL) == 0) { 2643 pp = pp->p_next; 2644 if (pp == first_pp) { 2645 pp = NULL; 2646 } 2647 continue; 2648 } 2649 2650 ASSERT(PP_ISFREE(pp)); 2651 ASSERT(PP_ISAGED(pp)); 2652 ASSERT(pp->p_vnode == NULL); 2653 ASSERT(pp->p_hash == NULL); 2654 ASSERT(pp->p_offset == (u_offset_t)-1); 2655 ASSERT(pp->p_szc == szc); 2656 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2657 /* check if page within DMA 
attributes */ 2658 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 2659 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 2660 (pgaddr + MMU_PAGESIZE - 1 <= 2661 dma_attr->dma_attr_addr_hi)) { 2662 break; 2663 } 2664 2665 /* continue looking */ 2666 page_unlock(pp); 2667 pp = pp->p_next; 2668 if (pp == first_pp) 2669 pp = NULL; 2670 2671 } 2672 if (pp != NULL) { 2673 ASSERT(mtype == PP_2_MTYPE(pp)); 2674 ASSERT(pp->p_szc == 0); 2675 2676 /* found a page with specified DMA attributes */ 2677 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 2678 mtype), pp); 2679 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2680 2681 if ((PP_ISFREE(pp) == 0) || 2682 (PP_ISAGED(pp) == 0)) { 2683 cmn_err(CE_PANIC, "page %p is not free", 2684 (void *)pp); 2685 } 2686 2687 mutex_exit(pcm); 2688 check_dma(dma_attr, pp, 1); 2689 VM_STAT_ADD(pga_vmstats.pgma_allocok); 2690 return (pp); 2691 } 2692 mutex_exit(pcm); 2693 nextfreebin: 2694 if (plw_initialized == 0) { 2695 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 2696 ASSERT(plw.plw_ceq_dif == page_colors); 2697 plw_initialized = 1; 2698 } 2699 2700 if (plw.plw_do_split) { 2701 pp = page_freelist_split(szc, bin, mnode, 2702 mtype, 2703 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 2704 &plw); 2705 if (pp != NULL) 2706 return (pp); 2707 } 2708 2709 bin = page_list_walk_next_bin(szc, bin, &plw); 2710 } 2711 2712 MTYPE_NEXT(mnode, mtype, flags); 2713 } while (mtype >= 0); 2714 2715 /* failed to find a page in the freelist; try it in the cachelist */ 2716 2717 /* reset mtype start for cachelist search */ 2718 mtype = mtypestart; 2719 ASSERT(mtype >= 0); 2720 2721 /* start with the bin of matching color */ 2722 bin = origbin; 2723 2724 do { 2725 for (i = 0; i <= page_colors; i++) { 2726 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 2727 goto nextcachebin; 2728 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 2729 mutex_enter(pcm); 2730 pp = PAGE_CACHELISTS(mnode, bin, mtype); 2731 first_pp = pp; 2732 while (pp != NULL) { 2733 if (page_trylock(pp, SE_EXCL) == 0) 
{ 2734 pp = pp->p_next; 2735 if (pp == first_pp) 2736 break; 2737 continue; 2738 } 2739 ASSERT(pp->p_vnode); 2740 ASSERT(PP_ISAGED(pp) == 0); 2741 ASSERT(pp->p_szc == 0); 2742 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2743 2744 /* check if page within DMA attributes */ 2745 2746 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 2747 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 2748 (pgaddr + MMU_PAGESIZE - 1 <= 2749 dma_attr->dma_attr_addr_hi)) { 2750 break; 2751 } 2752 2753 /* continue looking */ 2754 page_unlock(pp); 2755 pp = pp->p_next; 2756 if (pp == first_pp) 2757 pp = NULL; 2758 } 2759 2760 if (pp != NULL) { 2761 ASSERT(mtype == PP_2_MTYPE(pp)); 2762 ASSERT(pp->p_szc == 0); 2763 2764 /* found a page with specified DMA attributes */ 2765 page_sub(&PAGE_CACHELISTS(mnode, bin, 2766 mtype), pp); 2767 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 2768 2769 mutex_exit(pcm); 2770 ASSERT(pp->p_vnode); 2771 ASSERT(PP_ISAGED(pp) == 0); 2772 check_dma(dma_attr, pp, 1); 2773 VM_STAT_ADD(pga_vmstats.pgma_allocok); 2774 return (pp); 2775 } 2776 mutex_exit(pcm); 2777 nextcachebin: 2778 bin += (i == 0) ? BIN_STEP : 1; 2779 bin &= page_colors_mask; 2780 } 2781 MTYPE_NEXT(mnode, mtype, flags); 2782 } while (mtype >= 0); 2783 2784 VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 2785 return (NULL); 2786 } 2787 2788 /* 2789 * This function is similar to page_get_freelist()/page_get_cachelist() 2790 * but it searches both the lists to find a page with the specified 2791 * color (or no color) and DMA attributes. The search is done in the 2792 * freelist first and then in the cache list within the highest memory 2793 * range (based on DMA attributes) before searching in the lower 2794 * memory ranges. 2795 * 2796 * Note: This function is called only by page_create_io(). 
2797 */ 2798 /*ARGSUSED*/ 2799 static page_t * 2800 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 2801 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 2802 { 2803 uint_t bin; 2804 int mtype; 2805 page_t *pp; 2806 int n; 2807 int m; 2808 int szc; 2809 int fullrange; 2810 int mnode; 2811 int local_failed_stat = 0; 2812 lgrp_mnode_cookie_t lgrp_cookie; 2813 2814 VM_STAT_ADD(pga_vmstats.pga_alloc); 2815 2816 /* only base pagesize currently supported */ 2817 if (size != MMU_PAGESIZE) 2818 return (NULL); 2819 2820 /* 2821 * If we're passed a specific lgroup, we use it. Otherwise, 2822 * assume first-touch placement is desired. 2823 */ 2824 if (!LGRP_EXISTS(lgrp)) 2825 lgrp = lgrp_home_lgrp(); 2826 2827 /* LINTED */ 2828 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 2829 2830 /* 2831 * Only hold one freelist or cachelist lock at a time, that way we 2832 * can start anywhere and not have to worry about lock 2833 * ordering. 2834 */ 2835 if (dma_attr == NULL) { 2836 n = 0; 2837 m = mnoderangecnt - 1; 2838 fullrange = 1; 2839 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 2840 } else { 2841 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 2842 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 2843 2844 /* 2845 * We can guarantee alignment only for page boundary. 2846 */ 2847 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 2848 return (NULL); 2849 2850 n = pfn_2_mtype(pfnlo); 2851 m = pfn_2_mtype(pfnhi); 2852 2853 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 2854 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 2855 } 2856 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 2857 2858 if (n > m) 2859 return (NULL); 2860 2861 szc = 0; 2862 2863 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 2864 if (n == 0) { 2865 flags |= PGI_MT_RANGE0; 2866 n = m; 2867 } 2868 2869 /* 2870 * Try local memory node first, but try remote if we can't 2871 * get a page of the right color. 
2872 */ 2873 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 2874 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 2875 /* 2876 * allocate pages from high pfn to low. 2877 */ 2878 for (mtype = m; mtype >= n; mtype--) { 2879 if (fullrange != 0) { 2880 pp = page_get_mnode_freelist(mnode, 2881 bin, mtype, szc, flags); 2882 if (pp == NULL) { 2883 pp = page_get_mnode_cachelist( 2884 bin, flags, mnode, mtype); 2885 } 2886 } else { 2887 pp = page_get_mnode_anylist(bin, szc, 2888 flags, mnode, mtype, dma_attr); 2889 } 2890 if (pp != NULL) { 2891 VM_STAT_ADD(pga_vmstats.pga_allocok); 2892 check_dma(dma_attr, pp, 1); 2893 return (pp); 2894 } 2895 } 2896 if (!local_failed_stat) { 2897 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 2898 local_failed_stat = 1; 2899 } 2900 } 2901 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 2902 2903 return (NULL); 2904 } 2905 2906 /* 2907 * page_create_io() 2908 * 2909 * This function is a copy of page_create_va() with an additional 2910 * argument 'mattr' that specifies DMA memory requirements to 2911 * the page list functions. This function is used by the segkmem 2912 * allocator so it is only to create new pages (i.e PG_EXCL is 2913 * set). 2914 * 2915 * Note: This interface is currently used by x86 PSM only and is 2916 * not fully specified so the commitment level is only for 2917 * private interface specific to x86. This interface uses PSM 2918 * specific page_get_anylist() interface. 
2919 */ 2920 2921 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 2922 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 2923 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 2924 break; \ 2925 } \ 2926 } 2927 2928 2929 page_t * 2930 page_create_io( 2931 struct vnode *vp, 2932 u_offset_t off, 2933 uint_t bytes, 2934 uint_t flags, 2935 struct as *as, 2936 caddr_t vaddr, 2937 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 2938 { 2939 page_t *plist = NULL; 2940 uint_t plist_len = 0; 2941 pgcnt_t npages; 2942 page_t *npp = NULL; 2943 uint_t pages_req; 2944 page_t *pp; 2945 kmutex_t *phm = NULL; 2946 uint_t index; 2947 2948 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 2949 "page_create_start:vp %p off %llx bytes %u flags %x", 2950 vp, off, bytes, flags); 2951 2952 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 2953 2954 pages_req = npages = mmu_btopr(bytes); 2955 2956 /* 2957 * Do the freemem and pcf accounting. 2958 */ 2959 if (!page_create_wait(npages, flags)) { 2960 return (NULL); 2961 } 2962 2963 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 2964 "page_create_success:vp %p off %llx", vp, off); 2965 2966 /* 2967 * If satisfying this request has left us with too little 2968 * memory, start the wheels turning to get some back. The 2969 * first clause of the test prevents waking up the pageout 2970 * daemon in situations where it would decide that there's 2971 * nothing to do. 
2972 */ 2973 if (nscan < desscan && freemem < minfree) { 2974 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2975 "pageout_cv_signal:freemem %ld", freemem); 2976 cv_signal(&proc_pageout->p_cv); 2977 } 2978 2979 if (flags & PG_PHYSCONTIG) { 2980 2981 plist = page_get_contigpage(&npages, mattr, 1); 2982 if (plist == NULL) { 2983 page_create_putback(npages); 2984 return (NULL); 2985 } 2986 2987 pp = plist; 2988 2989 do { 2990 if (!page_hashin(pp, vp, off, NULL)) { 2991 panic("pg_creat_io: hashin failed %p %p %llx", 2992 (void *)pp, (void *)vp, off); 2993 } 2994 VM_STAT_ADD(page_create_new); 2995 off += MMU_PAGESIZE; 2996 PP_CLRFREE(pp); 2997 PP_CLRAGED(pp); 2998 page_set_props(pp, P_REF); 2999 pp = pp->p_next; 3000 } while (pp != plist); 3001 3002 if (!npages) { 3003 check_dma(mattr, plist, pages_req); 3004 return (plist); 3005 } else { 3006 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 3007 } 3008 3009 /* 3010 * fall-thru: 3011 * 3012 * page_get_contigpage returns when npages <= sgllen. 3013 * Grab the rest of the non-contig pages below from anylist. 3014 */ 3015 } 3016 3017 /* 3018 * Loop around collecting the requested number of pages. 3019 * Most of the time, we have to `create' a new page. With 3020 * this in mind, pull the page off the free list before 3021 * getting the hash lock. This will minimize the hash 3022 * lock hold time, nesting, and the like. If it turns 3023 * out we don't need the page, we put it back at the end. 3024 */ 3025 while (npages--) { 3026 phm = NULL; 3027 3028 index = PAGE_HASH_FUNC(vp, off); 3029 top: 3030 ASSERT(phm == NULL); 3031 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 3032 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3033 3034 if (npp == NULL) { 3035 /* 3036 * Try to get the page of any color either from 3037 * the freelist or from the cache list. 
3038 */ 3039 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 3040 flags & ~PG_MATCH_COLOR, mattr, NULL); 3041 if (npp == NULL) { 3042 if (mattr == NULL) { 3043 /* 3044 * Not looking for a special page; 3045 * panic! 3046 */ 3047 panic("no page found %d", (int)npages); 3048 } 3049 /* 3050 * No page found! This can happen 3051 * if we are looking for a page 3052 * within a specific memory range 3053 * for DMA purposes. If PG_WAIT is 3054 * specified then we wait for a 3055 * while and then try again. The 3056 * wait could be forever if we 3057 * don't get the page(s) we need. 3058 * 3059 * Note: XXX We really need a mechanism 3060 * to wait for pages in the desired 3061 * range. For now, we wait for any 3062 * pages and see if we can use it. 3063 */ 3064 3065 if ((mattr != NULL) && (flags & PG_WAIT)) { 3066 delay(10); 3067 goto top; 3068 } 3069 goto fail; /* undo accounting stuff */ 3070 } 3071 3072 if (PP_ISAGED(npp) == 0) { 3073 /* 3074 * Since this page came from the 3075 * cachelist, we must destroy the 3076 * old vnode association. 3077 */ 3078 page_hashout(npp, (kmutex_t *)NULL); 3079 } 3080 } 3081 3082 /* 3083 * We own this page! 3084 */ 3085 ASSERT(PAGE_EXCL(npp)); 3086 ASSERT(npp->p_vnode == NULL); 3087 ASSERT(!hat_page_is_mapped(npp)); 3088 PP_CLRFREE(npp); 3089 PP_CLRAGED(npp); 3090 3091 /* 3092 * Here we have a page in our hot little mits and are 3093 * just waiting to stuff it on the appropriate lists. 3094 * Get the mutex and check to see if it really does 3095 * not exist. 3096 */ 3097 phm = PAGE_HASH_MUTEX(index); 3098 mutex_enter(phm); 3099 PAGE_HASH_SEARCH(index, pp, vp, off); 3100 if (pp == NULL) { 3101 VM_STAT_ADD(page_create_new); 3102 pp = npp; 3103 npp = NULL; 3104 if (!page_hashin(pp, vp, off, phm)) { 3105 /* 3106 * Since we hold the page hash mutex and 3107 * just searched for this page, page_hashin 3108 * had better not fail. If it does, that 3109 * means somethread did not follow the 3110 * page hash mutex rules. 
Panic now and 3111 * get it over with. As usual, go down 3112 * holding all the locks. 3113 */ 3114 ASSERT(MUTEX_HELD(phm)); 3115 panic("page_create: hashin fail %p %p %llx %p", 3116 (void *)pp, (void *)vp, off, (void *)phm); 3117 3118 } 3119 ASSERT(MUTEX_HELD(phm)); 3120 mutex_exit(phm); 3121 phm = NULL; 3122 3123 /* 3124 * Hat layer locking need not be done to set 3125 * the following bits since the page is not hashed 3126 * and was on the free list (i.e., had no mappings). 3127 * 3128 * Set the reference bit to protect 3129 * against immediate pageout 3130 * 3131 * XXXmh modify freelist code to set reference 3132 * bit so we don't have to do it here. 3133 */ 3134 page_set_props(pp, P_REF); 3135 } else { 3136 ASSERT(MUTEX_HELD(phm)); 3137 mutex_exit(phm); 3138 phm = NULL; 3139 /* 3140 * NOTE: This should not happen for pages associated 3141 * with kernel vnode 'kvp'. 3142 */ 3143 /* XX64 - to debug why this happens! */ 3144 ASSERT(!VN_ISKAS(vp)); 3145 if (VN_ISKAS(vp)) 3146 cmn_err(CE_NOTE, 3147 "page_create: page not expected " 3148 "in hash list for kernel vnode - pp 0x%p", 3149 (void *)pp); 3150 VM_STAT_ADD(page_create_exists); 3151 goto fail; 3152 } 3153 3154 /* 3155 * Got a page! It is locked. Acquire the i/o 3156 * lock since we are going to use the p_next and 3157 * p_prev fields to link the requested pages together. 3158 */ 3159 page_io_lock(pp); 3160 page_add(&plist, pp); 3161 plist = plist->p_next; 3162 off += MMU_PAGESIZE; 3163 vaddr += MMU_PAGESIZE; 3164 } 3165 3166 check_dma(mattr, plist, pages_req); 3167 return (plist); 3168 3169 fail: 3170 if (npp != NULL) { 3171 /* 3172 * Did not need this page after all. 3173 * Put it back on the free list. 3174 */ 3175 VM_STAT_ADD(page_create_putbacks); 3176 PP_SETFREE(npp); 3177 PP_SETAGED(npp); 3178 npp->p_offset = (u_offset_t)-1; 3179 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 3180 page_unlock(npp); 3181 } 3182 3183 /* 3184 * Give up the pages we already got. 
3185 */ 3186 while (plist != NULL) { 3187 pp = plist; 3188 page_sub(&plist, pp); 3189 page_io_unlock(pp); 3190 plist_len++; 3191 /*LINTED: constant in conditional ctx*/ 3192 VN_DISPOSE(pp, B_INVAL, 0, kcred); 3193 } 3194 3195 /* 3196 * VN_DISPOSE does freemem accounting for the pages in plist 3197 * by calling page_free. So, we need to undo the pcf accounting 3198 * for only the remaining pages. 3199 */ 3200 VM_STAT_ADD(page_create_putbacks); 3201 page_create_putback(pages_req - plist_len); 3202 3203 return (NULL); 3204 } 3205 #endif /* !__xpv */ 3206 3207 3208 /* 3209 * Copy the data from the physical page represented by "frompp" to 3210 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 3211 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 3212 * level and no one sleeps with an active mapping there. 3213 * 3214 * Note that the ref/mod bits in the page_t's are not affected by 3215 * this operation, hence it is up to the caller to update them appropriately. 
3216 */ 3217 int 3218 ppcopy(page_t *frompp, page_t *topp) 3219 { 3220 caddr_t pp_addr1; 3221 caddr_t pp_addr2; 3222 hat_mempte_t pte1; 3223 hat_mempte_t pte2; 3224 kmutex_t *ppaddr_mutex; 3225 label_t ljb; 3226 int ret = 1; 3227 3228 ASSERT_STACK_ALIGNED(); 3229 ASSERT(PAGE_LOCKED(frompp)); 3230 ASSERT(PAGE_LOCKED(topp)); 3231 3232 if (kpm_enable) { 3233 pp_addr1 = hat_kpm_page2va(frompp, 0); 3234 pp_addr2 = hat_kpm_page2va(topp, 0); 3235 kpreempt_disable(); 3236 } else { 3237 /* 3238 * disable pre-emption so that CPU can't change 3239 */ 3240 kpreempt_disable(); 3241 3242 pp_addr1 = CPU->cpu_caddr1; 3243 pp_addr2 = CPU->cpu_caddr2; 3244 pte1 = CPU->cpu_caddr1pte; 3245 pte2 = CPU->cpu_caddr2pte; 3246 3247 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3248 mutex_enter(ppaddr_mutex); 3249 3250 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 3251 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 3252 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 3253 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3254 HAT_LOAD_NOCONSIST); 3255 } 3256 3257 if (on_fault(&ljb)) { 3258 ret = 0; 3259 goto faulted; 3260 } 3261 if (use_sse_pagecopy) 3262 #ifdef __xpv 3263 page_copy_no_xmm(pp_addr2, pp_addr1); 3264 #else 3265 hwblkpagecopy(pp_addr1, pp_addr2); 3266 #endif 3267 else 3268 bcopy(pp_addr1, pp_addr2, PAGESIZE); 3269 3270 no_fault(); 3271 faulted: 3272 if (!kpm_enable) { 3273 #ifdef __xpv 3274 /* 3275 * We can't leave unused mappings laying about under the 3276 * hypervisor, so blow them away. 
3277 */ 3278 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0, 3279 UVMF_INVLPG | UVMF_LOCAL) < 0) 3280 panic("HYPERVISOR_update_va_mapping() failed"); 3281 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3282 UVMF_INVLPG | UVMF_LOCAL) < 0) 3283 panic("HYPERVISOR_update_va_mapping() failed"); 3284 #endif 3285 mutex_exit(ppaddr_mutex); 3286 } 3287 kpreempt_enable(); 3288 return (ret); 3289 } 3290 3291 /* 3292 * Zero the physical page from off to off + len given by `pp' 3293 * without changing the reference and modified bits of page. 3294 * 3295 * We use this using CPU private page address #2, see ppcopy() for more info. 3296 * pagezero() must not be called at interrupt level. 3297 */ 3298 void 3299 pagezero(page_t *pp, uint_t off, uint_t len) 3300 { 3301 caddr_t pp_addr2; 3302 hat_mempte_t pte2; 3303 kmutex_t *ppaddr_mutex; 3304 3305 ASSERT_STACK_ALIGNED(); 3306 ASSERT(len <= MMU_PAGESIZE); 3307 ASSERT(off <= MMU_PAGESIZE); 3308 ASSERT(off + len <= MMU_PAGESIZE); 3309 ASSERT(PAGE_LOCKED(pp)); 3310 3311 if (kpm_enable) { 3312 pp_addr2 = hat_kpm_page2va(pp, 0); 3313 kpreempt_disable(); 3314 } else { 3315 kpreempt_disable(); 3316 3317 pp_addr2 = CPU->cpu_caddr2; 3318 pte2 = CPU->cpu_caddr2pte; 3319 3320 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3321 mutex_enter(ppaddr_mutex); 3322 3323 hat_mempte_remap(page_pptonum(pp), pp_addr2, pte2, 3324 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3325 HAT_LOAD_NOCONSIST); 3326 } 3327 3328 if (use_sse_pagezero) { 3329 #ifdef __xpv 3330 uint_t rem; 3331 3332 /* 3333 * zero a byte at a time until properly aligned for 3334 * block_zero_no_xmm(). 3335 */ 3336 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0) 3337 pp_addr2[off++] = 0; 3338 3339 /* 3340 * Now use faster block_zero_no_xmm() for any range 3341 * that is properly aligned and sized. 
3342 */ 3343 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN)); 3344 len -= rem; 3345 if (len != 0) { 3346 block_zero_no_xmm(pp_addr2 + off, len); 3347 off += len; 3348 } 3349 3350 /* 3351 * zero remainder with byte stores. 3352 */ 3353 while (rem-- > 0) 3354 pp_addr2[off++] = 0; 3355 #else 3356 hwblkclr(pp_addr2 + off, len); 3357 #endif 3358 } else { 3359 bzero(pp_addr2 + off, len); 3360 } 3361 3362 #ifdef __xpv 3363 /* 3364 * On the hypervisor this page might get used for a page table before 3365 * any intervening change to this mapping, so blow it away. 3366 */ 3367 if (!kpm_enable && HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3368 UVMF_INVLPG) < 0) 3369 panic("HYPERVISOR_update_va_mapping() failed"); 3370 #endif 3371 3372 if (!kpm_enable) 3373 mutex_exit(ppaddr_mutex); 3374 kpreempt_enable(); 3375 } 3376 3377 /* 3378 * Platform-dependent page scrub call. 3379 */ 3380 void 3381 pagescrub(page_t *pp, uint_t off, uint_t len) 3382 { 3383 /* 3384 * For now, we rely on the fact that pagezero() will 3385 * always clear UEs. 
3386 */ 3387 pagezero(pp, off, len); 3388 } 3389 3390 /* 3391 * set up two private addresses for use on a given CPU for use in ppcopy() 3392 */ 3393 void 3394 setup_vaddr_for_ppcopy(struct cpu *cpup) 3395 { 3396 void *addr; 3397 hat_mempte_t pte_pa; 3398 3399 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3400 pte_pa = hat_mempte_setup(addr); 3401 cpup->cpu_caddr1 = addr; 3402 cpup->cpu_caddr1pte = pte_pa; 3403 3404 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3405 pte_pa = hat_mempte_setup(addr); 3406 cpup->cpu_caddr2 = addr; 3407 cpup->cpu_caddr2pte = pte_pa; 3408 3409 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 3410 } 3411 3412 /* 3413 * Undo setup_vaddr_for_ppcopy 3414 */ 3415 void 3416 teardown_vaddr_for_ppcopy(struct cpu *cpup) 3417 { 3418 mutex_destroy(&cpup->cpu_ppaddr_mutex); 3419 3420 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 3421 cpup->cpu_caddr2pte = 0; 3422 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 3423 cpup->cpu_caddr2 = 0; 3424 3425 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 3426 cpup->cpu_caddr1pte = 0; 3427 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 3428 cpup->cpu_caddr1 = 0; 3429 } 3430 3431 /* 3432 * Create the pageout scanner thread. The thread has to 3433 * start at procedure with process pp and priority pri. 3434 */ 3435 void 3436 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) 3437 { 3438 (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); 3439 } 3440 3441 /* 3442 * Function for flushing D-cache when performing module relocations 3443 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 3444 */ 3445 void 3446 dcache_flushall() 3447 {} 3448 3449 size_t 3450 exec_get_spslew(void) 3451 { 3452 return (0); 3453 } 3454 3455 /* 3456 * Allocate a memory page. The argument 'seed' can be any pseudo-random 3457 * number to vary where the pages come from. 
This is quite a hacked up 3458 * method -- it works for now, but really needs to be fixed up a bit. 3459 * 3460 * We currently use page_create_va() on the kvp with fake offsets, 3461 * segments and virt address. This is pretty bogus, but was copied from the 3462 * old hat_i86.c code. A better approach would be to specify either mnode 3463 * random or mnode local and takes a page from whatever color has the MOST 3464 * available - this would have a minimal impact on page coloring. 3465 */ 3466 page_t * 3467 page_get_physical(uintptr_t seed) 3468 { 3469 page_t *pp; 3470 u_offset_t offset; 3471 static struct seg tmpseg; 3472 static uintptr_t ctr = 0; 3473 3474 /* 3475 * This code is gross, we really need a simpler page allocator. 3476 * 3477 * We need assign an offset for the page to call page_create_va(). 3478 * To avoid conflicts with other pages, we get creative with the offset. 3479 * For 32 bits, we pick an offset > 4Gig 3480 * For 64 bits, pick an offset somewhere in the VA hole. 3481 */ 3482 offset = seed; 3483 if (offset > kernelbase) 3484 offset -= kernelbase; 3485 offset <<= MMU_PAGESHIFT; 3486 #if defined(__amd64) 3487 offset += mmu.hole_start; /* something in VA hole */ 3488 #else 3489 offset += 1ULL << 40; /* something > 4 Gig */ 3490 #endif 3491 3492 if (page_resv(1, KM_NOSLEEP) == 0) 3493 return (NULL); 3494 3495 #ifdef DEBUG 3496 pp = page_exists(&kvp, offset); 3497 if (pp != NULL) 3498 panic("page already exists %p", pp); 3499 #endif 3500 3501 pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL, 3502 &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */ 3503 if (pp == NULL) 3504 return (NULL); 3505 page_io_unlock(pp); 3506 page_hashout(pp, NULL); 3507 return (pp); 3508 } 3509