1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 /* 35 * UNIX machine dependent virtual memory support. 36 */ 37 38 #include <sys/types.h> 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/user.h> 42 #include <sys/proc.h> 43 #include <sys/kmem.h> 44 #include <sys/vmem.h> 45 #include <sys/buf.h> 46 #include <sys/cpuvar.h> 47 #include <sys/lgrp.h> 48 #include <sys/disp.h> 49 #include <sys/vm.h> 50 #include <sys/mman.h> 51 #include <sys/vnode.h> 52 #include <sys/cred.h> 53 #include <sys/exec.h> 54 #include <sys/exechdr.h> 55 #include <sys/debug.h> 56 #include <sys/vmsystm.h> 57 58 #include <vm/hat.h> 59 #include <vm/as.h> 60 #include <vm/seg.h> 61 #include <vm/seg_kp.h> 62 #include <vm/seg_vn.h> 63 #include <vm/page.h> 64 #include <vm/seg_kmem.h> 65 #include <vm/seg_kpm.h> 66 #include <vm/vm_dep.h> 67 68 #include <sys/cpu.h> 69 #include <sys/vm_machparam.h> 70 #include <sys/memlist.h> 71 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */ 72 #include <vm/hat_i86.h> 73 #include <sys/x86_archext.h> 74 #include <sys/elf_386.h> 75 #include <sys/cmn_err.h> 76 #include <sys/archsystm.h> 77 #include <sys/machsystm.h> 78 79 #include <sys/vtrace.h> 80 #include <sys/ddidmareq.h> 81 #include <sys/promif.h> 82 #include <sys/memnode.h> 83 #include <sys/stack.h> 84 #include <util/qsort.h> 85 #include <sys/taskq.h> 86 87 #ifdef __xpv 88 89 #include <sys/hypervisor.h> 90 #include <sys/xen_mmu.h> 91 #include <sys/balloon_impl.h> 92 93 /* 94 * domain 0 pages usable for DMA are kept pre-allocated and kept in 95 * distinct lists, ordered by increasing mfn. 
96 */ 97 static kmutex_t io_pool_lock; 98 static kmutex_t contig_list_lock; 99 static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */ 100 static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */ 101 static long io_pool_cnt; 102 static long io_pool_cnt_max = 0; 103 #define DEFAULT_IO_POOL_MIN 128 104 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN; 105 static long io_pool_cnt_lowater = 0; 106 static long io_pool_shrink_attempts; /* how many times did we try to shrink */ 107 static long io_pool_shrinks; /* how many times did we really shrink */ 108 static long io_pool_grows; /* how many times did we grow */ 109 static mfn_t start_mfn = 1; 110 static caddr_t io_pool_kva; /* use to alloc pages when needed */ 111 112 static int create_contig_pfnlist(uint_t); 113 114 /* 115 * percentage of phys mem to hold in the i/o pool 116 */ 117 #define DEFAULT_IO_POOL_PCT 2 118 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT; 119 static void page_io_pool_sub(page_t **, page_t *, page_t *); 120 int ioalloc_dbg = 0; 121 122 #endif /* __xpv */ 123 124 uint_t vac_colors = 1; 125 126 int largepagesupport = 0; 127 extern uint_t page_create_new; 128 extern uint_t page_create_exists; 129 extern uint_t page_create_putbacks; 130 extern uint_t page_create_putbacks; 131 /* 132 * Allow users to disable the kernel's use of SSE. 133 */ 134 extern int use_sse_pagecopy, use_sse_pagezero; 135 136 /* 137 * combined memory ranges from mnode and memranges[] to manage single 138 * mnode/mtype dimension in the page lists. 139 */ 140 typedef struct { 141 pfn_t mnr_pfnlo; 142 pfn_t mnr_pfnhi; 143 int mnr_mnode; 144 int mnr_memrange; /* index into memranges[] */ 145 /* maintain page list stats */ 146 pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */ 147 pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */ 148 pgcnt_t mnr_mt_totcnt; /* sum of cache and free lists */ 149 #ifdef DEBUG 150 struct mnr_mts { /* mnode/mtype szc stats */ 151 pgcnt_t mnr_mts_pgcnt; 152 int mnr_mts_colors; 153 pgcnt_t *mnr_mtsc_pgcnt; 154 } *mnr_mts; 155 #endif 156 } mnoderange_t; 157 158 #define MEMRANGEHI(mtype) \ 159 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax) 160 #define MEMRANGELO(mtype) (memranges[mtype]) 161 162 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt) 163 164 /* 165 * As the PC architecture evolved memory up was clumped into several 166 * ranges for various historical I/O devices to do DMA. 167 * < 16Meg - ISA bus 168 * < 2Gig - ??? 169 * < 4Gig - PCI bus or drivers that don't understand PAE mode 170 * 171 * These are listed in reverse order, so that we can skip over unused 172 * ranges on machines with small memories. 173 * 174 * For now under the Hypervisor, we'll only ever have one memrange. 175 */ 176 #define PFN_4GIG 0x100000 177 #define PFN_16MEG 0x1000 178 static pfn_t arch_memranges[NUM_MEM_RANGES] = { 179 PFN_4GIG, /* pfn range for 4G and above */ 180 0x80000, /* pfn range for 2G-4G */ 181 PFN_16MEG, /* pfn range for 16M-2G */ 182 0x00000, /* pfn range for 0-16M */ 183 }; 184 pfn_t *memranges = &arch_memranges[0]; 185 int nranges = NUM_MEM_RANGES; 186 187 /* 188 * This combines mem_node_config and memranges into one data 189 * structure to be used for page list management. 190 */ 191 mnoderange_t *mnoderanges; 192 int mnoderangecnt; 193 int mtype4g; 194 195 /* 196 * 4g memory management variables for systems with more than 4g of memory: 197 * 198 * physical memory below 4g is required for 32bit dma devices and, currently, 199 * for kmem memory. 
 * On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
 */

#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
static int	lotsfree4gshift = 3;

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool. Allocations
 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 * are not restricted.
 */

#define	FREEMEM16M	MTYPE_FREEMEM(0)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
	((freemem != 0) && ((flags & PG_PANIC) == 0) &&		\
	    ((freemem >= (FREEMEM16M)) ||			\
	    (FREEMEM16M < (DESFREE16M + pgcnt))))

static pgcnt_t	desfree16m = 0x380;

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
int restricted_kmemalloc = 0;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/* page sizes that legacy applications can see */
uint_t mmu_legacy_page_sizes;

/*
 * Number of pages in 1 GB. Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
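 *
 * (Editorial illustration, not part of the original comment and using a
 * hypothetical value: the usual way to raise one of these limits is an
 * /etc/system line such as
 *
 *	set max_uheap_lpsize = 0x200000
 *
 * which assumes the processor supports 2M pages.)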
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;


/*
 * initialized by page_coloring_init().
 */
uint_t page_colors;
uint_t page_colors_mask;
uint_t page_coloring_shift;
int cpu_page_colors;
static uint_t l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size. Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;


/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

/*
 * Only let one thread at a time try to coalesce large pages, to
 * prevent them from working against each other.
 */
static kmutex_t	contig_lock;
#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.umax_page_level; l > 0; --l) {
			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
				continue;
			} else {
				pgsz = LEVEL_SIZE(l);
			}
			break;
		}

		mszc = (maptype == MAPPGSZ_HEAP ?
p->p_brkpageszc : 396 p->p_stkpageszc); 397 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) { 398 pgsz = hw_page_array[mszc].hp_size; 399 } 400 return (pgsz); 401 402 case MAPPGSZ_ISM: 403 for (l = mmu.umax_page_level; l > 0; --l) { 404 if (len >= LEVEL_SIZE(l)) 405 return (LEVEL_SIZE(l)); 406 } 407 return (LEVEL_SIZE(0)); 408 } 409 return (pgsz); 410 } 411 412 static uint_t 413 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize, 414 size_t min_physmem) 415 { 416 caddr_t eaddr = addr + size; 417 uint_t szcvec = 0; 418 caddr_t raddr; 419 caddr_t readdr; 420 size_t pgsz; 421 int i; 422 423 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 424 return (0); 425 } 426 427 for (i = mmu_exported_page_sizes - 1; i > 0; i--) { 428 pgsz = page_get_pagesize(i); 429 if (pgsz > max_lpsize) { 430 continue; 431 } 432 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 433 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 434 if (raddr < addr || raddr >= readdr) { 435 continue; 436 } 437 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 438 continue; 439 } 440 /* 441 * Set szcvec to the remaining page sizes. 442 */ 443 szcvec = ((1 << (i + 1)) - 1) & ~1; 444 break; 445 } 446 return (szcvec); 447 } 448 449 /* 450 * Return a bit vector of large page size codes that 451 * can be used to map [addr, addr + len) region. 452 */ 453 /*ARGSUSED*/ 454 uint_t 455 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 456 int memcntl) 457 { 458 size_t max_lpsize = mcntl0_lpsize; 459 460 if (mmu.max_page_level == 0) 461 return (0); 462 463 if (flags & MAP_TEXT) { 464 if (!memcntl) 465 max_lpsize = max_utext_lpsize; 466 return (map_szcvec(addr, size, off, max_lpsize, 467 shm_lpg_min_physmem)); 468 469 } else if (flags & MAP_INITDATA) { 470 if (!memcntl) 471 max_lpsize = max_uidata_lpsize; 472 return (map_szcvec(addr, size, off, max_lpsize, 473 privm_lpg_min_physmem)); 474 475 } else if (type == MAPPGSZC_SHM) { 476 if (!memcntl) 477 max_lpsize = max_shm_lpsize; 478 return (map_szcvec(addr, size, off, max_lpsize, 479 shm_lpg_min_physmem)); 480 481 } else if (type == MAPPGSZC_HEAP) { 482 if (!memcntl) 483 max_lpsize = max_uheap_lpsize; 484 return (map_szcvec(addr, size, off, max_lpsize, 485 privm_lpg_min_physmem)); 486 487 } else if (type == MAPPGSZC_STACK) { 488 if (!memcntl) 489 max_lpsize = max_ustack_lpsize; 490 return (map_szcvec(addr, size, off, max_lpsize, 491 privm_lpg_min_physmem)); 492 493 } else { 494 if (!memcntl) 495 max_lpsize = max_privmap_lpsize; 496 return (map_szcvec(addr, size, off, max_lpsize, 497 privm_lpg_min_physmem)); 498 } 499 } 500 501 /* 502 * Handle a pagefault. 503 */ 504 faultcode_t 505 pagefault( 506 caddr_t addr, 507 enum fault_type type, 508 enum seg_rw rw, 509 int iskernel) 510 { 511 struct as *as; 512 struct hat *hat; 513 struct proc *p; 514 kthread_t *t; 515 faultcode_t res; 516 caddr_t base; 517 size_t len; 518 int err; 519 int mapped_red; 520 uintptr_t ea; 521 522 ASSERT_STACK_ALIGNED(); 523 524 if (INVALID_VADDR(addr)) 525 return (FC_NOMAP); 526 527 mapped_red = segkp_map_red(); 528 529 if (iskernel) { 530 as = &kas; 531 hat = as->a_hat; 532 } else { 533 t = curthread; 534 p = ttoproc(t); 535 as = p->p_as; 536 hat = as->a_hat; 537 } 538 539 /* 540 * Dispatch pagefault. 541 */ 542 res = as_fault(hat, as, addr, 1, type, rw); 543 544 /* 545 * If this isn't a potential unmapped hole in the user's 546 * UNIX data or stack segments, just return status info. 
 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments. If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * the rest of this function implements 3.X, 4.X, and 5.X compatibility.
	 * This code is probably not needed anymore
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above. We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user. We will pick an address
 * range which is the highest available below userlimit.
 *
 * Every mapping will have a redzone of a single page on either side of
 * the request. This is done to leave one page unmapped between segments.
 * This is not required, but it's useful for the user because if their
 * program strays across a segment boundary, it will catch a fault
 * immediately, making debugging a little easier. Currently the redzone
 * is mandatory.
 *
 * addrp is a value/result parameter.
 * On input it is a hint from the user to be used in a completely
 * machine dependent fashion. We decide to completely ignore this hint.
 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 * must be some "power of two" multiple of pagesize.
 *
 * On output it is NULL if no address can be found in the current
 * process's address space or else an address that is currently
 * not mapped for len bytes with a page of red zone on either side.
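 *
 * (Editorial sketch added for clarity, not part of the original comment:
 * a successful result therefore looks like
 *
 *	base .. [ redzone page | len bytes returned in *addrp | redzone page ] .. base + slen
 *
 * which is roughly what the ASSERTs near the end of the routine verify.)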
 *
 * vacalign is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space. For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	/* Make len be a multiple of PAGESIZE */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		/*
		 * For 32-bit processes, only those which have specified
		 * MAP_ALIGN and an addr will be aligned on a larger page size.
		 * Not doing so can potentially waste up to 1G of process
		 * address space.
		 */
		int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
		    mmu.umax_page_level;

		while (lvl && len < LEVEL_SIZE(lvl))
			--lvl;

		align_amount = LEVEL_SIZE(lvl);
	}
	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	ASSERT(ISP2(align_amount));
	ASSERT(align_amount == 0 || align_amount >= PAGESIZE);

	off = off & (align_amount - 1);
	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.
	 */
	if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
	    PAGESIZE, off) == 0) {
		caddr_t as_addr;

		/*
		 * addr is the highest possible address to use since we have
		 * a PAGESIZE redzone at the beginning and end.
		 */
		addr = base + slen - (PAGESIZE + len);
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount and
		 * add the offset in.
		 * If addr is greater than as_addr, len would not be large
		 * enough to include the redzone, so we must adjust down
		 * by the alignment amount.
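 *
 * (Worked example added for illustration; the numbers are hypothetical.
 * With align_amount == 0x400000, off == 0x1000 and as_addr == 0x7fff86543000,
 * masking gives 0x7fff86400000 and adding off gives 0x7fff86401000, which is
 * still below as_addr, so no adjustment is needed. Had the sum exceeded
 * as_addr, align_amount would be subtracted once so that the result plus len
 * still leaves room for the trailing redzone.)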
761 */ 762 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 763 addr += (uintptr_t)off; 764 if (addr > as_addr) { 765 addr -= align_amount; 766 } 767 768 ASSERT(addr > base); 769 ASSERT(addr + len < base + slen); 770 ASSERT(((uintptr_t)addr & (align_amount - 1)) == 771 ((uintptr_t)(off))); 772 *addrp = addr; 773 } else { 774 *addrp = NULL; /* no more virtual space */ 775 } 776 } 777 778 int valid_va_range_aligned_wraparound; 779 780 /* 781 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 782 * addresses at least "minlen" long, where the base of the range is at "off" 783 * phase from an "align" boundary and there is space for a "redzone"-sized 784 * redzone on either side of the range. On success, 1 is returned and *basep 785 * and *lenp are adjusted to describe the acceptable range (including 786 * the redzone). On failure, 0 is returned. 787 */ 788 /*ARGSUSED3*/ 789 int 790 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir, 791 size_t align, size_t redzone, size_t off) 792 { 793 uintptr_t hi, lo; 794 size_t tot_len; 795 796 ASSERT(align == 0 ? off == 0 : off < align); 797 ASSERT(ISP2(align)); 798 ASSERT(align == 0 || align >= PAGESIZE); 799 800 lo = (uintptr_t)*basep; 801 hi = lo + *lenp; 802 tot_len = minlen + 2 * redzone; /* need at least this much space */ 803 804 /* 805 * If hi rolled over the top, try cutting back. 806 */ 807 if (hi < lo) { 808 *lenp = 0UL - lo - 1UL; 809 /* See if this really happens. If so, then we figure out why */ 810 valid_va_range_aligned_wraparound++; 811 hi = lo + *lenp; 812 } 813 if (*lenp < tot_len) { 814 return (0); 815 } 816 817 #if defined(__amd64) 818 /* 819 * Deal with a possible hole in the address range between 820 * hole_start and hole_end that should never be mapped. 821 */ 822 if (lo < hole_start) { 823 if (hi > hole_start) { 824 if (hi < hole_end) { 825 hi = hole_start; 826 } else { 827 /* lo < hole_start && hi >= hole_end */ 828 if (dir == AH_LO) { 829 /* 830 * prefer lowest range 831 */ 832 if (hole_start - lo >= tot_len) 833 hi = hole_start; 834 else if (hi - hole_end >= tot_len) 835 lo = hole_end; 836 else 837 return (0); 838 } else { 839 /* 840 * prefer highest range 841 */ 842 if (hi - hole_end >= tot_len) 843 lo = hole_end; 844 else if (hole_start - lo >= tot_len) 845 hi = hole_start; 846 else 847 return (0); 848 } 849 } 850 } 851 } else { 852 /* lo >= hole_start */ 853 if (hi < hole_end) 854 return (0); 855 if (lo < hole_end) 856 lo = hole_end; 857 } 858 #endif 859 860 if (hi - lo < tot_len) 861 return (0); 862 863 if (align > 1) { 864 uintptr_t tlo = lo + redzone; 865 uintptr_t thi = hi - redzone; 866 tlo = (uintptr_t)P2PHASEUP(tlo, align, off); 867 if (tlo < lo + redzone) { 868 return (0); 869 } 870 if (thi < tlo || thi - tlo < minlen) { 871 return (0); 872 } 873 } 874 875 *basep = (caddr_t)lo; 876 *lenp = hi - lo; 877 return (1); 878 } 879 880 /* 881 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 882 * addresses at least "minlen" long. On success, 1 is returned and *basep 883 * and *lenp are adjusted to describe the acceptable range. On failure, 0 884 * is returned. 885 */ 886 int 887 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 888 { 889 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0)); 890 } 891 892 /* 893 * Determine whether [addr, addr+len] are valid user addresses. 
894 */ 895 /*ARGSUSED*/ 896 int 897 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 898 caddr_t userlimit) 899 { 900 caddr_t eaddr = addr + len; 901 902 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 903 return (RANGE_BADADDR); 904 905 #if defined(__amd64) 906 /* 907 * Check for the VA hole 908 */ 909 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 910 return (RANGE_BADADDR); 911 #endif 912 913 return (RANGE_OKAY); 914 } 915 916 /* 917 * Return 1 if the page frame is onboard memory, else 0. 918 */ 919 int 920 pf_is_memory(pfn_t pf) 921 { 922 if (pfn_is_foreign(pf)) 923 return (0); 924 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1)); 925 } 926 927 /* 928 * return the memrange containing pfn 929 */ 930 int 931 memrange_num(pfn_t pfn) 932 { 933 int n; 934 935 for (n = 0; n < nranges - 1; ++n) { 936 if (pfn >= memranges[n]) 937 break; 938 } 939 return (n); 940 } 941 942 /* 943 * return the mnoderange containing pfn 944 */ 945 /*ARGSUSED*/ 946 int 947 pfn_2_mtype(pfn_t pfn) 948 { 949 #if defined(__xpv) 950 return (0); 951 #else 952 int n; 953 954 for (n = mnoderangecnt - 1; n >= 0; n--) { 955 if (pfn >= mnoderanges[n].mnr_pfnlo) { 956 break; 957 } 958 } 959 return (n); 960 #endif 961 } 962 963 #if !defined(__xpv) 964 /* 965 * is_contigpage_free: 966 * returns a page list of contiguous pages. It minimally has to return 967 * minctg pages. Caller determines minctg based on the scatter-gather 968 * list length. 969 * 970 * pfnp is set to the next page frame to search on return. 971 */ 972 static page_t * 973 is_contigpage_free( 974 pfn_t *pfnp, 975 pgcnt_t *pgcnt, 976 pgcnt_t minctg, 977 uint64_t pfnseg, 978 int iolock) 979 { 980 int i = 0; 981 pfn_t pfn = *pfnp; 982 page_t *pp; 983 page_t *plist = NULL; 984 985 /* 986 * fail if pfn + minctg crosses a segment boundary. 987 * Adjust for next starting pfn to begin at segment boundary. 988 */ 989 990 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 991 *pfnp = roundup(*pfnp, pfnseg + 1); 992 return (NULL); 993 } 994 995 do { 996 retry: 997 pp = page_numtopp_nolock(pfn + i); 998 if ((pp == NULL) || 999 (page_trylock(pp, SE_EXCL) == 0)) { 1000 (*pfnp)++; 1001 break; 1002 } 1003 if (page_pptonum(pp) != pfn + i) { 1004 page_unlock(pp); 1005 goto retry; 1006 } 1007 1008 if (!(PP_ISFREE(pp))) { 1009 page_unlock(pp); 1010 (*pfnp)++; 1011 break; 1012 } 1013 1014 if (!PP_ISAGED(pp)) { 1015 page_list_sub(pp, PG_CACHE_LIST); 1016 page_hashout(pp, (kmutex_t *)NULL); 1017 } else { 1018 page_list_sub(pp, PG_FREE_LIST); 1019 } 1020 1021 if (iolock) 1022 page_io_lock(pp); 1023 page_list_concat(&plist, &pp); 1024 1025 /* 1026 * exit loop when pgcnt satisfied or segment boundary reached. 1027 */ 1028 1029 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 1030 1031 *pfnp += i; /* set to next pfn to search */ 1032 1033 if (i >= minctg) { 1034 *pgcnt -= i; 1035 return (plist); 1036 } 1037 1038 /* 1039 * failure: minctg not satisfied. 1040 * 1041 * if next request crosses segment boundary, set next pfn 1042 * to search from the segment boundary. 
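 *
 * (Worked example added for illustration, hypothetical values: with
 * pfnseg == 0xfffff and *pfnp == 0xffffe, a request with minctg == 4 gives
 * (*pfnp + minctg - 1) & pfnseg == 0x1, which is less than
 * *pfnp & pfnseg == 0xffffe, so the range would cross a DMA segment
 * boundary and the next search starts at roundup(*pfnp, pfnseg + 1),
 * i.e. pfn 0x100000.)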
1043 */ 1044 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 1045 *pfnp = roundup(*pfnp, pfnseg + 1); 1046 1047 /* clean up any pages already allocated */ 1048 1049 while (plist) { 1050 pp = plist; 1051 page_sub(&plist, pp); 1052 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 1053 if (iolock) 1054 page_io_unlock(pp); 1055 page_unlock(pp); 1056 } 1057 1058 return (NULL); 1059 } 1060 #endif /* !__xpv */ 1061 1062 /* 1063 * verify that pages being returned from allocator have correct DMA attribute 1064 */ 1065 #ifndef DEBUG 1066 #define check_dma(a, b, c) (0) 1067 #else 1068 static void 1069 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 1070 { 1071 if (dma_attr == NULL) 1072 return; 1073 1074 while (cnt-- > 0) { 1075 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) < 1076 dma_attr->dma_attr_addr_lo) 1077 panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp); 1078 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >= 1079 dma_attr->dma_attr_addr_hi) 1080 panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp); 1081 pp = pp->p_next; 1082 } 1083 } 1084 #endif 1085 1086 #if !defined(__xpv) 1087 static page_t * 1088 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 1089 { 1090 pfn_t pfn; 1091 int sgllen; 1092 uint64_t pfnseg; 1093 pgcnt_t minctg; 1094 page_t *pplist = NULL, *plist; 1095 uint64_t lo, hi; 1096 pgcnt_t pfnalign = 0; 1097 static pfn_t startpfn; 1098 static pgcnt_t lastctgcnt; 1099 uintptr_t align; 1100 1101 CONTIG_LOCK(); 1102 1103 if (mattr) { 1104 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 1105 hi = mmu_btop(mattr->dma_attr_addr_hi); 1106 if (hi >= physmax) 1107 hi = physmax - 1; 1108 sgllen = mattr->dma_attr_sgllen; 1109 pfnseg = mmu_btop(mattr->dma_attr_seg); 1110 1111 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 1112 if (align > MMU_PAGESIZE) 1113 pfnalign = mmu_btop(align); 1114 1115 /* 1116 * in order to satisfy the request, must minimally 1117 * acquire minctg contiguous pages 1118 */ 1119 minctg = howmany(*pgcnt, sgllen); 1120 1121 ASSERT(hi >= lo); 1122 1123 /* 1124 * start from where last searched if the minctg >= lastctgcnt 1125 */ 1126 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 1127 startpfn = lo; 1128 } else { 1129 hi = physmax - 1; 1130 lo = 0; 1131 sgllen = 1; 1132 pfnseg = mmu.highest_pfn; 1133 minctg = *pgcnt; 1134 1135 if (minctg < lastctgcnt) 1136 startpfn = lo; 1137 } 1138 lastctgcnt = minctg; 1139 1140 ASSERT(pfnseg + 1 >= (uint64_t)minctg); 1141 1142 /* conserve 16m memory - start search above 16m when possible */ 1143 if (hi > PFN_16M && startpfn < PFN_16M) 1144 startpfn = PFN_16M; 1145 1146 pfn = startpfn; 1147 if (pfnalign) 1148 pfn = P2ROUNDUP(pfn, pfnalign); 1149 1150 while (pfn + minctg - 1 <= hi) { 1151 1152 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1153 if (plist) { 1154 page_list_concat(&pplist, &plist); 1155 sgllen--; 1156 /* 1157 * return when contig pages no longer needed 1158 */ 1159 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1160 startpfn = pfn; 1161 CONTIG_UNLOCK(); 1162 check_dma(mattr, pplist, *pgcnt); 1163 return (pplist); 1164 } 1165 minctg = howmany(*pgcnt, sgllen); 1166 } 1167 if (pfnalign) 1168 pfn = P2ROUNDUP(pfn, pfnalign); 1169 } 1170 1171 /* cannot find contig pages in specified range */ 1172 if (startpfn == lo) { 1173 CONTIG_UNLOCK(); 1174 return (NULL); 1175 } 1176 1177 /* did not start with lo previously */ 1178 pfn = lo; 1179 if (pfnalign) 1180 pfn = P2ROUNDUP(pfn, pfnalign); 1181 1182 /* allow search to go above startpfn */ 1183 
while (pfn < startpfn) { 1184 1185 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1186 if (plist != NULL) { 1187 1188 page_list_concat(&pplist, &plist); 1189 sgllen--; 1190 1191 /* 1192 * return when contig pages no longer needed 1193 */ 1194 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1195 startpfn = pfn; 1196 CONTIG_UNLOCK(); 1197 check_dma(mattr, pplist, *pgcnt); 1198 return (pplist); 1199 } 1200 minctg = howmany(*pgcnt, sgllen); 1201 } 1202 if (pfnalign) 1203 pfn = P2ROUNDUP(pfn, pfnalign); 1204 } 1205 CONTIG_UNLOCK(); 1206 return (NULL); 1207 } 1208 #endif /* !__xpv */ 1209 1210 /* 1211 * mnode_range_cnt() calculates the number of memory ranges for mnode and 1212 * memranges[]. Used to determine the size of page lists and mnoderanges. 1213 */ 1214 int 1215 mnode_range_cnt(int mnode) 1216 { 1217 #if defined(__xpv) 1218 ASSERT(mnode == 0); 1219 return (1); 1220 #else /* __xpv */ 1221 int mri; 1222 int mnrcnt = 0; 1223 1224 if (mem_node_config[mnode].exists != 0) { 1225 mri = nranges - 1; 1226 1227 /* find the memranges index below contained in mnode range */ 1228 1229 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1230 mri--; 1231 1232 /* 1233 * increment mnode range counter when memranges or mnode 1234 * boundary is reached. 1235 */ 1236 while (mri >= 0 && 1237 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1238 mnrcnt++; 1239 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1240 mri--; 1241 else 1242 break; 1243 } 1244 } 1245 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 1246 return (mnrcnt); 1247 #endif /* __xpv */ 1248 } 1249 1250 /* 1251 * mnode_range_setup() initializes mnoderanges. 1252 */ 1253 void 1254 mnode_range_setup(mnoderange_t *mnoderanges) 1255 { 1256 int mnode, mri; 1257 1258 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 1259 if (mem_node_config[mnode].exists == 0) 1260 continue; 1261 1262 mri = nranges - 1; 1263 1264 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1265 mri--; 1266 1267 while (mri >= 0 && mem_node_config[mnode].physmax >= 1268 MEMRANGELO(mri)) { 1269 mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri), 1270 mem_node_config[mnode].physbase); 1271 mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri), 1272 mem_node_config[mnode].physmax); 1273 mnoderanges->mnr_mnode = mnode; 1274 mnoderanges->mnr_memrange = mri; 1275 mnoderanges++; 1276 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1277 mri--; 1278 else 1279 break; 1280 } 1281 } 1282 } 1283 1284 /*ARGSUSED*/ 1285 int 1286 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) 1287 { 1288 int mtype = mnoderangecnt - 1; 1289 1290 #if !defined(__xpv) 1291 #if defined(__i386) 1292 /* 1293 * set the mtype range 1294 * - kmem requests needs to be below 4g if restricted_kmemalloc is set. 1295 * - for non kmem requests, set range to above 4g if memory below 4g 1296 * runs low. 
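 *
 * (Editorial illustration, hypothetical configuration and not part of the
 * original comment: a single mnode covering pfns 0 - 0x17ffff (6 GB)
 * combined with arch_memranges[] above yields four mnoderanges - 0-16M,
 * 16M-2G, 2G-4G and 4G-6G - so mtype4g names the 2G-4G entry and the
 * PGI_MT_RANGE* flag chosen below tells mtype_func() how far down that
 * list a page_get request may walk.)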
 */
	if (restricted_kmemalloc && VN_ISKAS(vp) &&
	    (caddr_t)(vaddr) >= kernelheap &&
	    (caddr_t)(vaddr) < ekernelheap) {
		ASSERT(physmax4g);
		mtype = mtype4g;
		if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
		    btop(pgsz), *flags)) {
			*flags |= PGI_MT_RANGE16M;
		} else {
			VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
			VM_STAT_COND_ADD((*flags & PG_PANIC),
			    vmm_vmstats.pgpanicalloc);
			*flags |= PGI_MT_RANGE0;
		}
		return (mtype);
	}
#endif /* __i386 */

	if (RESTRICT4G_ALLOC) {
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
		/* here only for > 4g systems */
		*flags |= PGI_MT_RANGE4G;
	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
		*flags |= PGI_MT_RANGE0;
	}
#endif /* !__xpv */
	return (mtype);
}


/* mtype init for page_get_replacement_page */
/*ARGSUSED*/
int
mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
{
	int mtype = mnoderangecnt - 1;
#if !defined(__xpv)
	if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		*flags |= PGI_MT_RANGE0;
	}
#endif
	return (mtype);
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return first mnode range type index found otherwise return -1 if none found.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	if (flags & PGI_MT_RANGE) {
		int mtlim = 0;

		if (flags & PGI_MT_NEXT)
			mtype--;
		if (flags & PGI_MT_RANGE4G)
			mtlim = mtype4g + 1;	/* exclude 0-4g range */
		else if (flags & PGI_MT_RANGE16M)
			mtlim = 1;	/* exclude 0-16m range */
		while (mtype >= mtlim) {
			if (mnoderanges[mtype].mnr_mnode == mnode)
				return (mtype);
			mtype--;
		}
	} else if (mnoderanges[mtype].mnr_mnode == mnode) {
		return (mtype);
	}
	return (-1);
}

/*
 * Update the page list max counts with the pfn range specified by the
 * input parameters. Called from add_physmem() when physical memory with
 * page_t's are initially added to the page lists.
1383 */ 1384 void 1385 mtype_modify_max(pfn_t startpfn, long cnt) 1386 { 1387 int mtype = 0; 1388 pfn_t endpfn = startpfn + cnt, pfn; 1389 pgcnt_t inc; 1390 1391 ASSERT(cnt > 0); 1392 1393 if (!physmax4g) 1394 return; 1395 1396 for (pfn = startpfn; pfn < endpfn; ) { 1397 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1398 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1399 inc = endpfn - pfn; 1400 } else { 1401 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1402 } 1403 if (mtype <= mtype4g) 1404 maxmem4g += inc; 1405 pfn += inc; 1406 } 1407 mtype++; 1408 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1409 } 1410 } 1411 1412 int 1413 mtype_2_mrange(int mtype) 1414 { 1415 return (mnoderanges[mtype].mnr_memrange); 1416 } 1417 1418 void 1419 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 1420 { 1421 ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 1422 *pfnlo = mnoderanges[mtype].mnr_pfnlo; 1423 *pfnhi = mnoderanges[mtype].mnr_pfnhi; 1424 } 1425 1426 size_t 1427 plcnt_sz(size_t ctrs_sz) 1428 { 1429 #ifdef DEBUG 1430 int szc, colors; 1431 1432 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 1433 for (szc = 0; szc < mmu_page_sizes; szc++) { 1434 colors = page_get_pagecolors(szc); 1435 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 1436 } 1437 #endif 1438 return (ctrs_sz); 1439 } 1440 1441 caddr_t 1442 plcnt_init(caddr_t addr) 1443 { 1444 #ifdef DEBUG 1445 int mt, szc, colors; 1446 1447 for (mt = 0; mt < mnoderangecnt; mt++) { 1448 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 1449 addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 1450 for (szc = 0; szc < mmu_page_sizes; szc++) { 1451 colors = page_get_pagecolors(szc); 1452 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 1453 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 1454 (pgcnt_t *)addr; 1455 addr += (sizeof (pgcnt_t) * colors); 1456 } 1457 } 1458 #endif 1459 return (addr); 1460 } 1461 1462 void 1463 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 1464 { 1465 #ifdef DEBUG 1466 int bin = PP_2_BIN(pp); 1467 1468 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 1469 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 1470 cnt); 1471 #endif 1472 ASSERT(mtype == PP_2_MTYPE(pp)); 1473 if (physmax4g && mtype <= mtype4g) 1474 atomic_add_long(&freemem4g, cnt); 1475 if (flags & PG_CACHE_LIST) 1476 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 1477 else 1478 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt); 1479 atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt); 1480 } 1481 1482 /* 1483 * Returns the free page count for mnode 1484 */ 1485 int 1486 mnode_pgcnt(int mnode) 1487 { 1488 int mtype = mnoderangecnt - 1; 1489 int flags = PGI_MT_RANGE0; 1490 pgcnt_t pgcnt = 0; 1491 1492 mtype = mtype_func(mnode, mtype, flags); 1493 1494 while (mtype != -1) { 1495 pgcnt += MTYPE_FREEMEM(mtype); 1496 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1497 } 1498 return (pgcnt); 1499 } 1500 1501 /* 1502 * Initialize page coloring variables based on the l2 cache parameters. 1503 * Calculate and return memory needed for page coloring data structures. 1504 */ 1505 size_t 1506 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 1507 { 1508 size_t colorsz = 0; 1509 int i; 1510 int colors; 1511 1512 #if defined(__xpv) 1513 /* 1514 * Hypervisor domains currently don't have any concept of NUMA. 1515 * Hence we'll act like there is only 1 memrange. 
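 *
 * (Editorial note added for clarity: memrange_num(1) below evaluates to the
 * index of the lowest, 0-16M, entry of memranges[], so after the
 * "memranges += i; nranges -= i;" adjustment further down a single range
 * covering all of physical memory remains.)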
1516 */ 1517 i = memrange_num(1); 1518 #else /* !__xpv */ 1519 /* 1520 * Reduce the memory ranges lists if we don't have large amounts 1521 * of memory. This avoids searching known empty free lists. 1522 */ 1523 i = memrange_num(physmax); 1524 #if defined(__i386) 1525 if (i > 0) 1526 restricted_kmemalloc = 0; 1527 #endif 1528 /* physmax greater than 4g */ 1529 if (i == 0) 1530 physmax4g = 1; 1531 #endif /* !__xpv */ 1532 memranges += i; 1533 nranges -= i; 1534 1535 ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES); 1536 1537 ASSERT(ISP2(l2_linesz)); 1538 ASSERT(l2_sz > MMU_PAGESIZE); 1539 1540 /* l2_assoc is 0 for fully associative l2 cache */ 1541 if (l2_assoc) 1542 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 1543 else 1544 l2_colors = 1; 1545 1546 ASSERT(ISP2(l2_colors)); 1547 1548 /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 1549 page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 1550 1551 /* 1552 * cpu_page_colors is non-zero when a page color may be spread across 1553 * multiple bins. 1554 */ 1555 if (l2_colors < page_colors) 1556 cpu_page_colors = l2_colors; 1557 1558 ASSERT(ISP2(page_colors)); 1559 1560 page_colors_mask = page_colors - 1; 1561 1562 ASSERT(ISP2(CPUSETSIZE())); 1563 page_coloring_shift = lowbit(CPUSETSIZE()); 1564 1565 /* initialize number of colors per page size */ 1566 for (i = 0; i <= mmu.max_page_level; i++) { 1567 hw_page_array[i].hp_size = LEVEL_SIZE(i); 1568 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 1569 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 1570 hw_page_array[i].hp_colors = (page_colors_mask >> 1571 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 1572 + 1; 1573 colorequivszc[i] = 0; 1574 } 1575 1576 /* 1577 * The value of cpu_page_colors determines if additional color bins 1578 * need to be checked for a particular color in the page_get routines. 1579 */ 1580 if (cpu_page_colors != 0) { 1581 1582 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 1583 ASSERT(a > 0); 1584 ASSERT(a < 16); 1585 1586 for (i = 0; i <= mmu.max_page_level; i++) { 1587 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1588 colorequivszc[i] = 0; 1589 continue; 1590 } 1591 while ((colors >> a) == 0) 1592 a--; 1593 ASSERT(a >= 0); 1594 1595 /* higher 4 bits encodes color equiv mask */ 1596 colorequivszc[i] = (a << 4); 1597 } 1598 } 1599 1600 /* factor in colorequiv to check additional 'equivalent' bins. 
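 * (Editorial example, hypothetical value: if colorequiv were set to 4, and
 * assuming lowbit() counts bit positions from 1, the loop below derives a
 * shift of 2, i.e. groups of four adjacent color bins are treated as
 * equivalent via the upper nibble of colorequivszc[].)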
*/ 1601 if (colorequiv > 1) { 1602 1603 int a = lowbit(colorequiv) - 1; 1604 if (a > 15) 1605 a = 15; 1606 1607 for (i = 0; i <= mmu.max_page_level; i++) { 1608 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1609 continue; 1610 } 1611 while ((colors >> a) == 0) 1612 a--; 1613 if ((a << 4) > colorequivszc[i]) { 1614 colorequivszc[i] = (a << 4); 1615 } 1616 } 1617 } 1618 1619 /* size for mnoderanges */ 1620 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 1621 mnoderangecnt += mnode_range_cnt(i); 1622 colorsz = mnoderangecnt * sizeof (mnoderange_t); 1623 1624 /* size for fpc_mutex and cpc_mutex */ 1625 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 1626 1627 /* size of page_freelists */ 1628 colorsz += mnoderangecnt * sizeof (page_t ***); 1629 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 1630 1631 for (i = 0; i < mmu_page_sizes; i++) { 1632 colors = page_get_pagecolors(i); 1633 colorsz += mnoderangecnt * colors * sizeof (page_t *); 1634 } 1635 1636 /* size of page_cachelists */ 1637 colorsz += mnoderangecnt * sizeof (page_t **); 1638 colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 1639 1640 return (colorsz); 1641 } 1642 1643 /* 1644 * Called once at startup to configure page_coloring data structures and 1645 * does the 1st page_free()/page_freelist_add(). 1646 */ 1647 void 1648 page_coloring_setup(caddr_t pcmemaddr) 1649 { 1650 int i; 1651 int j; 1652 int k; 1653 caddr_t addr; 1654 int colors; 1655 1656 /* 1657 * do page coloring setup 1658 */ 1659 addr = pcmemaddr; 1660 1661 mnoderanges = (mnoderange_t *)addr; 1662 addr += (mnoderangecnt * sizeof (mnoderange_t)); 1663 1664 mnode_range_setup(mnoderanges); 1665 1666 if (physmax4g) 1667 mtype4g = pfn_2_mtype(0xfffff); 1668 1669 for (k = 0; k < NPC_MUTEX; k++) { 1670 fpc_mutex[k] = (kmutex_t *)addr; 1671 addr += (max_mem_nodes * sizeof (kmutex_t)); 1672 } 1673 for (k = 0; k < NPC_MUTEX; k++) { 1674 cpc_mutex[k] = (kmutex_t *)addr; 1675 addr += (max_mem_nodes * sizeof (kmutex_t)); 1676 } 1677 page_freelists = (page_t ****)addr; 1678 addr += (mnoderangecnt * sizeof (page_t ***)); 1679 1680 page_cachelists = (page_t ***)addr; 1681 addr += (mnoderangecnt * sizeof (page_t **)); 1682 1683 for (i = 0; i < mnoderangecnt; i++) { 1684 page_freelists[i] = (page_t ***)addr; 1685 addr += (mmu_page_sizes * sizeof (page_t **)); 1686 1687 for (j = 0; j < mmu_page_sizes; j++) { 1688 colors = page_get_pagecolors(j); 1689 page_freelists[i][j] = (page_t **)addr; 1690 addr += (colors * sizeof (page_t *)); 1691 } 1692 page_cachelists[i] = (page_t **)addr; 1693 addr += (page_colors * sizeof (page_t *)); 1694 } 1695 } 1696 1697 #if defined(__xpv) 1698 /* 1699 * Give back 10% of the io_pool pages to the free list. 1700 * Don't shrink the pool below some absolute minimum. 1701 */ 1702 static void 1703 page_io_pool_shrink() 1704 { 1705 int retcnt; 1706 page_t *pp, *pp_first, *pp_last, **curpool; 1707 mfn_t mfn; 1708 int bothpools = 0; 1709 1710 mutex_enter(&io_pool_lock); 1711 io_pool_shrink_attempts++; /* should be a kstat? */ 1712 retcnt = io_pool_cnt / 10; 1713 if (io_pool_cnt - retcnt < io_pool_cnt_min) 1714 retcnt = io_pool_cnt - io_pool_cnt_min; 1715 if (retcnt <= 0) 1716 goto done; 1717 io_pool_shrinks++; /* should be a kstat? */ 1718 curpool = &io_pool_4g; 1719 domore: 1720 /* 1721 * Loop through taking pages from the end of the list 1722 * (highest mfns) till amount to return reached. 
1723 */ 1724 for (pp = *curpool; pp && retcnt > 0; ) { 1725 pp_first = pp_last = pp->p_prev; 1726 if (pp_first == *curpool) 1727 break; 1728 retcnt--; 1729 io_pool_cnt--; 1730 page_io_pool_sub(curpool, pp_first, pp_last); 1731 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn) 1732 start_mfn = mfn; 1733 page_free(pp_first, 1); 1734 pp = *curpool; 1735 } 1736 if (retcnt != 0 && !bothpools) { 1737 /* 1738 * If not enough found in less constrained pool try the 1739 * more constrained one. 1740 */ 1741 curpool = &io_pool_16m; 1742 bothpools = 1; 1743 goto domore; 1744 } 1745 done: 1746 mutex_exit(&io_pool_lock); 1747 } 1748 1749 #endif /* __xpv */ 1750 1751 uint_t 1752 page_create_update_flags_x86(uint_t flags) 1753 { 1754 #if defined(__xpv) 1755 /* 1756 * Check this is an urgent allocation and free pages are depleted. 1757 */ 1758 if (!(flags & PG_WAIT) && freemem < desfree) 1759 page_io_pool_shrink(); 1760 #else /* !__xpv */ 1761 /* 1762 * page_create_get_something may call this because 4g memory may be 1763 * depleted. Set flags to allow for relocation of base page below 1764 * 4g if necessary. 1765 */ 1766 if (physmax4g) 1767 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1768 #endif /* __xpv */ 1769 return (flags); 1770 } 1771 1772 /*ARGSUSED*/ 1773 int 1774 bp_color(struct buf *bp) 1775 { 1776 return (0); 1777 } 1778 1779 #if defined(__xpv) 1780 1781 /* 1782 * Take pages out of an io_pool 1783 */ 1784 static void 1785 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last) 1786 { 1787 if (*poolp == pp_first) { 1788 *poolp = pp_last->p_next; 1789 if (*poolp == pp_first) 1790 *poolp = NULL; 1791 } 1792 pp_first->p_prev->p_next = pp_last->p_next; 1793 pp_last->p_next->p_prev = pp_first->p_prev; 1794 pp_first->p_prev = pp_last; 1795 pp_last->p_next = pp_first; 1796 } 1797 1798 /* 1799 * Put a page on the io_pool list. The list is ordered by increasing MFN. 1800 */ 1801 static void 1802 page_io_pool_add(page_t **poolp, page_t *pp) 1803 { 1804 page_t *look; 1805 mfn_t mfn = mfn_list[pp->p_pagenum]; 1806 1807 if (*poolp == NULL) { 1808 *poolp = pp; 1809 pp->p_next = pp; 1810 pp->p_prev = pp; 1811 return; 1812 } 1813 1814 /* 1815 * Since we try to take pages from the high end of the pool 1816 * chances are good that the pages to be put on the list will 1817 * go at or near the end of the list. so start at the end and 1818 * work backwards. 1819 */ 1820 look = (*poolp)->p_prev; 1821 while (mfn < mfn_list[look->p_pagenum]) { 1822 look = look->p_prev; 1823 if (look == (*poolp)->p_prev) 1824 break; /* backed all the way to front of list */ 1825 } 1826 1827 /* insert after look */ 1828 pp->p_prev = look; 1829 pp->p_next = look->p_next; 1830 pp->p_next->p_prev = pp; 1831 look->p_next = pp; 1832 if (mfn < mfn_list[(*poolp)->p_pagenum]) { 1833 /* 1834 * we inserted a new first list element 1835 * adjust pool pointer to newly inserted element 1836 */ 1837 *poolp = pp; 1838 } 1839 } 1840 1841 /* 1842 * Add a page to the io_pool. Setting the force flag will force the page 1843 * into the io_pool no matter what. 
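 *
 * (Editorial note added for clarity: pages with mfns below 16M are always
 * kept; otherwise, when the pool is full and force is not set, the new page
 * only displaces the highest-mfn page of io_pool_4g if its own mfn is lower,
 * keeping the pool biased toward DMA-friendly low mfns, and whichever page
 * loses is handed back to page_free().)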
1844 */ 1845 static void 1846 add_page_to_pool(page_t *pp, int force) 1847 { 1848 page_t *highest; 1849 page_t *freep = NULL; 1850 1851 mutex_enter(&io_pool_lock); 1852 /* 1853 * Always keep the scarce low memory pages 1854 */ 1855 if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 1856 ++io_pool_cnt; 1857 page_io_pool_add(&io_pool_16m, pp); 1858 goto done; 1859 } 1860 if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) { 1861 ++io_pool_cnt; 1862 page_io_pool_add(&io_pool_4g, pp); 1863 } else { 1864 highest = io_pool_4g->p_prev; 1865 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 1866 page_io_pool_sub(&io_pool_4g, highest, highest); 1867 page_io_pool_add(&io_pool_4g, pp); 1868 freep = highest; 1869 } else { 1870 freep = pp; 1871 } 1872 } 1873 done: 1874 mutex_exit(&io_pool_lock); 1875 if (freep) 1876 page_free(freep, 1); 1877 } 1878 1879 1880 int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 1881 int contig_pfn_max; /* capacity of the contig pfn list */ 1882 int next_alloc_pfn; /* next position in list to start a contig search */ 1883 int contig_pfnlist_updates; /* pfn list update count */ 1884 int contig_pfnlist_builds; /* how many times have we (re)built list */ 1885 int contig_pfnlist_buildfailed; /* how many times has list build failed */ 1886 int create_contig_pending; /* nonzero means taskq creating contig list */ 1887 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 1888 1889 /* 1890 * Function to use in sorting a list of pfns by their underlying mfns. 1891 */ 1892 static int 1893 mfn_compare(const void *pfnp1, const void *pfnp2) 1894 { 1895 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 1896 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 1897 1898 if (mfn1 > mfn2) 1899 return (1); 1900 if (mfn1 < mfn2) 1901 return (-1); 1902 return (0); 1903 } 1904 1905 /* 1906 * Compact the contig_pfn_list by tossing all the non-contiguous 1907 * elements from the list. 1908 */ 1909 static void 1910 compact_contig_pfn_list(void) 1911 { 1912 pfn_t pfn, lapfn, prev_lapfn; 1913 mfn_t mfn; 1914 int i, newcnt = 0; 1915 1916 prev_lapfn = 0; 1917 for (i = 0; i < contig_pfn_cnt - 1; i++) { 1918 pfn = contig_pfn_list[i]; 1919 lapfn = contig_pfn_list[i + 1]; 1920 mfn = mfn_list[pfn]; 1921 /* 1922 * See if next pfn is for a contig mfn 1923 */ 1924 if (mfn_list[lapfn] != mfn + 1) 1925 continue; 1926 /* 1927 * pfn and lookahead are both put in list 1928 * unless pfn is the previous lookahead. 1929 */ 1930 if (pfn != prev_lapfn) 1931 contig_pfn_list[newcnt++] = pfn; 1932 contig_pfn_list[newcnt++] = lapfn; 1933 prev_lapfn = lapfn; 1934 } 1935 for (i = newcnt; i < contig_pfn_cnt; i++) 1936 contig_pfn_list[i] = 0; 1937 contig_pfn_cnt = newcnt; 1938 } 1939 1940 /*ARGSUSED*/ 1941 static void 1942 call_create_contiglist(void *arg) 1943 { 1944 (void) create_contig_pfnlist(PG_WAIT); 1945 } 1946 1947 /* 1948 * Create list of freelist pfns that have underlying 1949 * contiguous mfns. The list is kept in ascending mfn order. 1950 * returns 1 if list created else 0. 1951 */ 1952 static int 1953 create_contig_pfnlist(uint_t flags) 1954 { 1955 pfn_t pfn; 1956 page_t *pp; 1957 int ret = 1; 1958 1959 mutex_enter(&contig_list_lock); 1960 if (contig_pfn_list != NULL) 1961 goto out; 1962 contig_pfn_max = freemem + (freemem / 10); 1963 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 1964 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 1965 if (contig_pfn_list == NULL) { 1966 /* 1967 * If we could not create the contig list (because 1968 * we could not sleep for memory). 
Dispatch a taskq that can 1969 * sleep to get the memory. 1970 */ 1971 if (!create_contig_pending) { 1972 if (taskq_dispatch(system_taskq, call_create_contiglist, 1973 NULL, TQ_NOSLEEP) != NULL) 1974 create_contig_pending = 1; 1975 } 1976 contig_pfnlist_buildfailed++; /* count list build failures */ 1977 ret = 0; 1978 goto out; 1979 } 1980 create_contig_pending = 0; 1981 ASSERT(contig_pfn_cnt == 0); 1982 for (pfn = 0; pfn < mfn_count; pfn++) { 1983 pp = page_numtopp_nolock(pfn); 1984 if (pp == NULL || !PP_ISFREE(pp)) 1985 continue; 1986 contig_pfn_list[contig_pfn_cnt] = pfn; 1987 if (++contig_pfn_cnt == contig_pfn_max) 1988 break; 1989 } 1990 /* 1991 * Sanity check the new list. 1992 */ 1993 if (contig_pfn_cnt < 2) { /* no contig pfns */ 1994 contig_pfn_cnt = 0; 1995 contig_pfnlist_buildfailed++; 1996 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t)); 1997 contig_pfn_list = NULL; 1998 contig_pfn_max = 0; 1999 ret = 0; 2000 goto out; 2001 } 2002 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 2003 compact_contig_pfn_list(); 2004 /* 2005 * Make sure next search of the newly created contiguous pfn 2006 * list starts at the beginning of the list. 2007 */ 2008 next_alloc_pfn = 0; 2009 contig_pfnlist_builds++; /* count list builds */ 2010 out: 2011 mutex_exit(&contig_list_lock); 2012 return (ret); 2013 } 2014 2015 2016 /* 2017 * Toss the current contig pfnlist. Someone is about to do a massive 2018 * update to pfn<->mfn mappings. So we have them destroy the list and lock 2019 * it till they are done with their update. 2020 */ 2021 void 2022 clear_and_lock_contig_pfnlist() 2023 { 2024 pfn_t *listp = NULL; 2025 size_t listsize; 2026 2027 mutex_enter(&contig_list_lock); 2028 if (contig_pfn_list != NULL) { 2029 listp = contig_pfn_list; 2030 listsize = contig_pfn_max * sizeof (pfn_t); 2031 contig_pfn_list = NULL; 2032 contig_pfn_max = contig_pfn_cnt = 0; 2033 } 2034 if (listp != NULL) 2035 kmem_free(listp, listsize); 2036 } 2037 2038 /* 2039 * Unlock the contig_pfn_list. The next attempted use of it will cause 2040 * it to be re-created. 2041 */ 2042 void 2043 unlock_contig_pfnlist() 2044 { 2045 mutex_exit(&contig_list_lock); 2046 } 2047 2048 /* 2049 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 2050 */ 2051 void 2052 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 2053 { 2054 int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 2055 pfn_t probe_pfn; 2056 mfn_t probe_mfn; 2057 int drop_lock = 0; 2058 2059 if (mutex_owner(&contig_list_lock) != curthread) { 2060 drop_lock = 1; 2061 mutex_enter(&contig_list_lock); 2062 } 2063 if (contig_pfn_list == NULL) 2064 goto done; 2065 contig_pfnlist_updates++; 2066 /* 2067 * Find the pfn in the current list. Use a binary chop to locate it. 2068 */ 2069 probe_hi = contig_pfn_cnt - 1; 2070 probe_lo = 0; 2071 probe_pos = (probe_hi + probe_lo) / 2; 2072 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 2073 if (probe_pos == probe_lo) { /* pfn not in list */ 2074 probe_pos = -1; 2075 break; 2076 } 2077 if (pfn_to_mfn(probe_pfn) <= oldmfn) 2078 probe_lo = probe_pos; 2079 else 2080 probe_hi = probe_pos; 2081 probe_pos = (probe_hi + probe_lo) / 2; 2082 } 2083 if (probe_pos >= 0) { 2084 /* 2085 * Remove pfn from list and ensure next alloc 2086 * position stays in bounds. 
2087 */ 2088 if (--contig_pfn_cnt <= next_alloc_pfn) 2089 next_alloc_pfn = 0; 2090 if (contig_pfn_cnt < 2) { /* no contig pfns */ 2091 contig_pfn_cnt = 0; 2092 kmem_free(contig_pfn_list, 2093 contig_pfn_max * sizeof (pfn_t)); 2094 contig_pfn_list = NULL; 2095 contig_pfn_max = 0; 2096 goto done; 2097 } 2098 ovbcopy(&contig_pfn_list[probe_pos + 1], 2099 &contig_pfn_list[probe_pos], 2100 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 2101 } 2102 if (newmfn == MFN_INVALID) 2103 goto done; 2104 /* 2105 * Check if new mfn has adjacent mfns in the list 2106 */ 2107 probe_hi = contig_pfn_cnt - 1; 2108 probe_lo = 0; 2109 insert_after = -2; 2110 do { 2111 probe_pos = (probe_hi + probe_lo) / 2; 2112 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 2113 if (newmfn == probe_mfn + 1) 2114 insert_after = probe_pos; 2115 else if (newmfn == probe_mfn - 1) 2116 insert_after = probe_pos - 1; 2117 if (probe_pos == probe_lo) 2118 break; 2119 if (probe_mfn <= newmfn) 2120 probe_lo = probe_pos; 2121 else 2122 probe_hi = probe_pos; 2123 } while (insert_after == -2); 2124 /* 2125 * If there is space in the list and there are adjacent mfns 2126 * insert the pfn in to its proper place in the list. 2127 */ 2128 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 2129 insert_point = insert_after + 1; 2130 ovbcopy(&contig_pfn_list[insert_point], 2131 &contig_pfn_list[insert_point + 1], 2132 (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 2133 contig_pfn_list[insert_point] = pfn; 2134 contig_pfn_cnt++; 2135 } 2136 done: 2137 if (drop_lock) 2138 mutex_exit(&contig_list_lock); 2139 } 2140 2141 /* 2142 * Called to (re-)populate the io_pool from the free page lists. 2143 */ 2144 long 2145 populate_io_pool(void) 2146 { 2147 pfn_t pfn; 2148 mfn_t mfn, max_mfn; 2149 page_t *pp; 2150 2151 /* 2152 * Figure out the bounds of the pool on first invocation. 2153 * We use a percentage of memory for the io pool size. 2154 * we allow that to shrink, but not to less than a fixed minimum 2155 */ 2156 if (io_pool_cnt_max == 0) { 2157 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 2158 io_pool_cnt_lowater = io_pool_cnt_max; 2159 /* 2160 * This is the first time in populate_io_pool, grab a va to use 2161 * when we need to allocate pages. 2162 */ 2163 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 2164 } 2165 /* 2166 * If we are out of pages in the pool, then grow the size of the pool 2167 */ 2168 if (io_pool_cnt == 0) { 2169 /* 2170 * Grow the max size of the io pool by 5%, but never more than 2171 * 25% of physical memory. 2172 */ 2173 if (io_pool_cnt_max < physmem / 4) 2174 io_pool_cnt_max += io_pool_cnt_max / 20; 2175 } 2176 io_pool_grows++; /* should be a kstat? */ 2177 2178 /* 2179 * Get highest mfn on this platform, but limit to the 32 bit DMA max. 2180 */ 2181 (void) mfn_to_pfn(start_mfn); 2182 max_mfn = MIN(cached_max_mfn, PFN_4GIG); 2183 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 2184 pfn = mfn_to_pfn(mfn); 2185 if (pfn & PFN_IS_FOREIGN_MFN) 2186 continue; 2187 /* 2188 * try to allocate it from free pages 2189 */ 2190 pp = page_numtopp_alloc(pfn); 2191 if (pp == NULL) 2192 continue; 2193 PP_CLRFREE(pp); 2194 add_page_to_pool(pp, 1); 2195 if (io_pool_cnt >= io_pool_cnt_max) 2196 break; 2197 } 2198 2199 return (io_pool_cnt); 2200 } 2201 2202 /* 2203 * Destroy a page that was being used for DMA I/O. It may or 2204 * may not actually go back to the io_pool. 
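 * Roughly: dom0 pages whose machine frames sit below 4G are scarce DMA
 * resources and go back into the io_pool; everything else (all domU
 * pages, and dom0 pages at or above 4G) simply returns to the normal
 * free lists.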
2205 */ 2206 void 2207 page_destroy_io(page_t *pp) 2208 { 2209 mfn_t mfn = mfn_list[pp->p_pagenum]; 2210 2211 /* 2212 * When the page was alloc'd a reservation was made, release it now 2213 */ 2214 page_unresv(1); 2215 /* 2216 * Unload translations, if any, then hash out the 2217 * page to erase its identity. 2218 */ 2219 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2220 page_hashout(pp, NULL); 2221 2222 /* 2223 * If the page came from the free lists, just put it back to them. 2224 * DomU pages always go on the free lists as well. 2225 */ 2226 if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 2227 page_free(pp, 1); 2228 return; 2229 } 2230 2231 add_page_to_pool(pp, 0); 2232 } 2233 2234 2235 long contig_searches; /* count of times contig pages requested */ 2236 long contig_search_restarts; /* count of contig ranges tried */ 2237 long contig_search_failed; /* count of contig alloc failures */ 2238 2239 /* 2240 * Free partial page list 2241 */ 2242 static void 2243 free_partial_list(page_t **pplist) 2244 { 2245 page_t *pp; 2246 2247 while (*pplist != NULL) { 2248 pp = *pplist; 2249 page_io_pool_sub(pplist, pp, pp); 2250 page_free(pp, 1); 2251 } 2252 } 2253 2254 /* 2255 * Look thru the contiguous pfns that are not part of the io_pool for 2256 * contiguous free pages. Return a list of the found pages or NULL. 2257 */ 2258 page_t * 2259 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg, 2260 pgcnt_t pfnalign) 2261 { 2262 page_t *pp, *plist = NULL; 2263 mfn_t mfn, prev_mfn, start_mfn; 2264 pfn_t pfn; 2265 int pages_needed, pages_requested; 2266 int search_start; 2267 2268 /* 2269 * create the contig pfn list if not already done 2270 */ 2271 retry: 2272 mutex_enter(&contig_list_lock); 2273 if (contig_pfn_list == NULL) { 2274 mutex_exit(&contig_list_lock); 2275 if (!create_contig_pfnlist(flags)) { 2276 return (NULL); 2277 } 2278 goto retry; 2279 } 2280 contig_searches++; 2281 /* 2282 * Search contiguous pfn list for physically contiguous pages not in 2283 * the io_pool. Start the search where the last search left off. 2284 */ 2285 pages_requested = pages_needed = npages; 2286 search_start = next_alloc_pfn; 2287 start_mfn = prev_mfn = 0; 2288 while (pages_needed) { 2289 pfn = contig_pfn_list[next_alloc_pfn]; 2290 mfn = pfn_to_mfn(pfn); 2291 /* 2292 * Check if mfn is first one or contig to previous one and 2293 * if page corresponding to mfn is free and that mfn 2294 * range is not crossing a segment boundary. 2295 */ 2296 if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 2297 (pp = page_numtopp_alloc(pfn)) != NULL && 2298 !((mfn & pfnseg) < (start_mfn & pfnseg))) { 2299 PP_CLRFREE(pp); 2300 page_io_pool_add(&plist, pp); 2301 pages_needed--; 2302 if (prev_mfn == 0) { 2303 if (pfnalign && 2304 mfn != P2ROUNDUP(mfn, pfnalign)) { 2305 /* 2306 * not properly aligned 2307 */ 2308 contig_search_restarts++; 2309 free_partial_list(&plist); 2310 pages_needed = pages_requested; 2311 start_mfn = prev_mfn = 0; 2312 goto skip; 2313 } 2314 start_mfn = mfn; 2315 } 2316 prev_mfn = mfn; 2317 } else { 2318 contig_search_restarts++; 2319 free_partial_list(&plist); 2320 pages_needed = pages_requested; 2321 start_mfn = prev_mfn = 0; 2322 } 2323 skip: 2324 if (++next_alloc_pfn == contig_pfn_cnt) 2325 next_alloc_pfn = 0; 2326 if (next_alloc_pfn == search_start) 2327 break; /* all pfns searched */ 2328 } 2329 mutex_exit(&contig_list_lock); 2330 if (pages_needed) { 2331 contig_search_failed++; 2332 /* 2333 * Failed to find enough contig pages. 
2334 * free partial page list 2335 */ 2336 free_partial_list(&plist); 2337 } 2338 return (plist); 2339 } 2340 2341 /* 2342 * Search the reserved io pool pages for a page range with the 2343 * desired characteristics. 2344 */ 2345 page_t * 2346 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg) 2347 { 2348 page_t *pp_first, *pp_last; 2349 page_t *pp, **poolp; 2350 pgcnt_t nwanted, pfnalign; 2351 uint64_t pfnseg; 2352 mfn_t mfn, tmfn, hi_mfn, lo_mfn; 2353 int align, attempt = 0; 2354 2355 if (minctg == 1) 2356 contig = 0; 2357 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2358 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2359 pfnseg = mmu_btop(mattr->dma_attr_seg); 2360 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2361 if (align > MMU_PAGESIZE) 2362 pfnalign = mmu_btop(align); 2363 else 2364 pfnalign = 0; 2365 2366 try_again: 2367 /* 2368 * See if we want pages for a legacy device 2369 */ 2370 if (hi_mfn < PFN_16MEG) 2371 poolp = &io_pool_16m; 2372 else 2373 poolp = &io_pool_4g; 2374 try_smaller: 2375 /* 2376 * Take pages from I/O pool. We'll use pages from the highest 2377 * MFN range possible. 2378 */ 2379 pp_first = pp_last = NULL; 2380 mutex_enter(&io_pool_lock); 2381 nwanted = minctg; 2382 for (pp = *poolp; pp && nwanted > 0; ) { 2383 pp = pp->p_prev; 2384 2385 /* 2386 * skip pages above allowable range 2387 */ 2388 mfn = mfn_list[pp->p_pagenum]; 2389 if (hi_mfn < mfn) 2390 goto skip; 2391 2392 /* 2393 * stop at pages below allowable range 2394 */ 2395 if (lo_mfn > mfn) 2396 break; 2397 restart: 2398 if (pp_last == NULL) { 2399 /* 2400 * Check alignment 2401 */ 2402 tmfn = mfn - (minctg - 1); 2403 if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign)) 2404 goto skip; /* not properly aligned */ 2405 /* 2406 * Check segment 2407 */ 2408 if ((mfn & pfnseg) < (tmfn & pfnseg)) 2409 goto skip; /* crosses seg boundary */ 2410 /* 2411 * Start building page list 2412 */ 2413 pp_first = pp_last = pp; 2414 nwanted--; 2415 } else { 2416 /* 2417 * check physical contiguity if required 2418 */ 2419 if (contig && 2420 mfn_list[pp_first->p_pagenum] != mfn + 1) { 2421 /* 2422 * not a contiguous page, restart list. 2423 */ 2424 pp_last = NULL; 2425 nwanted = minctg; 2426 goto restart; 2427 } else { /* add page to list */ 2428 pp_first = pp; 2429 nwanted--; 2430 } 2431 } 2432 skip: 2433 if (pp == *poolp) 2434 break; 2435 } 2436 2437 /* 2438 * If we didn't find memory. Try the more constrained pool, then 2439 * sweep free pages into the DMA pool and try again. 2440 */ 2441 if (nwanted != 0) { 2442 mutex_exit(&io_pool_lock); 2443 /* 2444 * If we were looking in the less constrained pool and 2445 * didn't find pages, try the more constrained pool. 
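 * The overall escalation is: the less constrained 4G pool, then the
 * 16M pool, then kmem_reap() plus a populate_io_pool() sweep and a
 * full retry, giving up after a few attempts.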
2446 */ 2447 if (poolp == &io_pool_4g) { 2448 poolp = &io_pool_16m; 2449 goto try_smaller; 2450 } 2451 kmem_reap(); 2452 if (++attempt < 4) { 2453 /* 2454 * Grab some more io_pool pages 2455 */ 2456 (void) populate_io_pool(); 2457 goto try_again; /* go around and retry */ 2458 } 2459 return (NULL); 2460 } 2461 /* 2462 * Found the pages, now snip them from the list 2463 */ 2464 page_io_pool_sub(poolp, pp_first, pp_last); 2465 io_pool_cnt -= minctg; 2466 /* 2467 * reset low water mark 2468 */ 2469 if (io_pool_cnt < io_pool_cnt_lowater) 2470 io_pool_cnt_lowater = io_pool_cnt; 2471 mutex_exit(&io_pool_lock); 2472 return (pp_first); 2473 } 2474 2475 page_t * 2476 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr, 2477 ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg) 2478 { 2479 uint_t kflags; 2480 int order, extra, extpages, i, contig, nbits, extents; 2481 page_t *pp, *expp, *pp_first, **pplist = NULL; 2482 mfn_t *mfnlist = NULL; 2483 2484 contig = flags & PG_PHYSCONTIG; 2485 if (minctg == 1) 2486 contig = 0; 2487 flags &= ~PG_PHYSCONTIG; 2488 kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP; 2489 /* 2490 * Hypervisor will allocate extents, if we want contig 2491 * pages extent must be >= minctg 2492 */ 2493 if (contig) { 2494 order = highbit(minctg) - 1; 2495 if (minctg & ((1 << order) - 1)) 2496 order++; 2497 extpages = 1 << order; 2498 } else { 2499 order = 0; 2500 extpages = minctg; 2501 } 2502 if (extpages > minctg) { 2503 extra = extpages - minctg; 2504 if (!page_resv(extra, kflags)) 2505 return (NULL); 2506 } 2507 pp_first = NULL; 2508 pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 2509 if (pplist == NULL) 2510 goto balloon_fail; 2511 mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 2512 if (mfnlist == NULL) 2513 goto balloon_fail; 2514 pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr); 2515 if (pp == NULL) 2516 goto balloon_fail; 2517 pp_first = pp; 2518 if (extpages > minctg) { 2519 /* 2520 * fill out the rest of extent pages to swap 2521 * with the hypervisor 2522 */ 2523 for (i = 0; i < extra; i++) { 2524 expp = page_create_va(vp, 2525 (u_offset_t)(uintptr_t)io_pool_kva, 2526 PAGESIZE, flags, &kvseg, io_pool_kva); 2527 if (expp == NULL) 2528 goto balloon_fail; 2529 (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 2530 page_io_unlock(expp); 2531 page_hashout(expp, NULL); 2532 page_io_lock(expp); 2533 /* 2534 * add page to end of list 2535 */ 2536 expp->p_prev = pp_first->p_prev; 2537 expp->p_next = pp_first; 2538 expp->p_prev->p_next = expp; 2539 pp_first->p_prev = expp; 2540 } 2541 2542 } 2543 for (i = 0; i < extpages; i++) { 2544 pplist[i] = pp; 2545 pp = pp->p_next; 2546 } 2547 nbits = highbit(mattr->dma_attr_addr_hi); 2548 extents = contig ? 
1 : minctg; 2549 if (balloon_replace_pages(extents, pplist, nbits, order, 2550 mfnlist) != extents) { 2551 if (ioalloc_dbg) 2552 cmn_err(CE_NOTE, "request to hypervisor" 2553 " for %d pages, maxaddr %" PRIx64 " failed", 2554 extpages, mattr->dma_attr_addr_hi); 2555 goto balloon_fail; 2556 } 2557 2558 kmem_free(pplist, extpages * sizeof (page_t *)); 2559 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2560 /* 2561 * Return any excess pages to free list 2562 */ 2563 if (extpages > minctg) { 2564 for (i = 0; i < extra; i++) { 2565 pp = pp_first->p_prev; 2566 page_sub(&pp_first, pp); 2567 page_io_unlock(pp); 2568 page_unresv(1); 2569 page_free(pp, 1); 2570 } 2571 } 2572 return (pp_first); 2573 balloon_fail: 2574 /* 2575 * Return pages to free list and return failure 2576 */ 2577 while (pp_first != NULL) { 2578 pp = pp_first; 2579 page_sub(&pp_first, pp); 2580 page_io_unlock(pp); 2581 if (pp->p_vnode != NULL) 2582 page_hashout(pp, NULL); 2583 page_free(pp, 1); 2584 } 2585 if (pplist) 2586 kmem_free(pplist, extpages * sizeof (page_t *)); 2587 if (mfnlist) 2588 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2589 page_unresv(extpages - minctg); 2590 return (NULL); 2591 } 2592 2593 static void 2594 return_partial_alloc(page_t *plist) 2595 { 2596 page_t *pp; 2597 2598 while (plist != NULL) { 2599 pp = plist; 2600 page_sub(&plist, pp); 2601 page_io_unlock(pp); 2602 page_destroy_io(pp); 2603 } 2604 } 2605 2606 static page_t * 2607 page_get_contigpages( 2608 struct vnode *vp, 2609 u_offset_t off, 2610 int *npagesp, 2611 uint_t flags, 2612 caddr_t vaddr, 2613 ddi_dma_attr_t *mattr) 2614 { 2615 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2616 page_t *plist; /* list to return */ 2617 page_t *pp, *mcpl; 2618 int contig, anyaddr, npages, getone = 0; 2619 mfn_t lo_mfn; 2620 mfn_t hi_mfn; 2621 pgcnt_t pfnalign = 0; 2622 int align, sgllen; 2623 uint64_t pfnseg; 2624 pgcnt_t minctg; 2625 2626 npages = *npagesp; 2627 ASSERT(mattr != NULL); 2628 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2629 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2630 sgllen = mattr->dma_attr_sgllen; 2631 pfnseg = mmu_btop(mattr->dma_attr_seg); 2632 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2633 if (align > MMU_PAGESIZE) 2634 pfnalign = mmu_btop(align); 2635 2636 contig = flags & PG_PHYSCONTIG; 2637 if (npages == -1) { 2638 npages = 1; 2639 pfnalign = 0; 2640 } 2641 /* 2642 * Clear the contig flag if only one page is needed. 2643 */ 2644 if (npages == 1) { 2645 getone = 1; 2646 contig = 0; 2647 } 2648 2649 /* 2650 * Check if any page in the system is fine. 2651 */ 2652 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn; 2653 if (!contig && anyaddr && !pfnalign) { 2654 flags &= ~PG_PHYSCONTIG; 2655 plist = page_create_va(vp, off, npages * MMU_PAGESIZE, 2656 flags, &kvseg, vaddr); 2657 if (plist != NULL) { 2658 *npagesp = 0; 2659 return (plist); 2660 } 2661 } 2662 plist = NULL; 2663 minctg = howmany(npages, sgllen); 2664 while (npages > sgllen || getone) { 2665 if (minctg > npages) 2666 minctg = npages; 2667 mcpl = NULL; 2668 /* 2669 * We could want contig pages with no address range limits. 2670 */ 2671 if (anyaddr && contig) { 2672 /* 2673 * Look for free contig pages to satisfy the request. 
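 * This is the first of three attempts for each minctg chunk: free
 * pages that happen to have contiguous mfns, then the reserved io
 * pools, and finally (when no segment or alignment constraint is in
 * the way) an exchange of pages with the hypervisor.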
2674 */ 2675 mcpl = find_contig_free(minctg, flags, pfnseg, 2676 pfnalign); 2677 } 2678 /* 2679 * Try the reserved io pools next 2680 */ 2681 if (mcpl == NULL) 2682 mcpl = page_io_pool_alloc(mattr, contig, minctg); 2683 if (mcpl != NULL) { 2684 pp = mcpl; 2685 do { 2686 if (!page_hashin(pp, vp, off, NULL)) { 2687 panic("page_get_contigpages:" 2688 " hashin failed" 2689 " pp %p, vp %p, off %llx", 2690 (void *)pp, (void *)vp, off); 2691 } 2692 off += MMU_PAGESIZE; 2693 PP_CLRFREE(pp); 2694 PP_CLRAGED(pp); 2695 page_set_props(pp, P_REF); 2696 page_io_lock(pp); 2697 pp = pp->p_next; 2698 } while (pp != mcpl); 2699 } else { 2700 /* 2701 * Hypervisor exchange doesn't handle segment or 2702 * alignment constraints 2703 */ 2704 if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || 2705 pfnalign) 2706 goto fail; 2707 /* 2708 * Try exchanging pages with the hypervisor 2709 */ 2710 mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr, 2711 flags, minctg); 2712 if (mcpl == NULL) 2713 goto fail; 2714 off += minctg * MMU_PAGESIZE; 2715 } 2716 check_dma(mattr, mcpl, minctg); 2717 /* 2718 * Here with a minctg run of contiguous pages, add them to the 2719 * list we will return for this request. 2720 */ 2721 page_list_concat(&plist, &mcpl); 2722 npages -= minctg; 2723 *npagesp = npages; 2724 sgllen--; 2725 if (getone) 2726 break; 2727 } 2728 return (plist); 2729 fail: 2730 return_partial_alloc(plist); 2731 return (NULL); 2732 } 2733 2734 /* 2735 * Allocator for domain 0 I/O pages. We match the required 2736 * DMA attributes and contiguity constraints. 2737 */ 2738 /*ARGSUSED*/ 2739 page_t * 2740 page_create_io( 2741 struct vnode *vp, 2742 u_offset_t off, 2743 uint_t bytes, 2744 uint_t flags, 2745 struct as *as, 2746 caddr_t vaddr, 2747 ddi_dma_attr_t *mattr) 2748 { 2749 page_t *plist = NULL, *pp; 2750 int npages = 0, contig, anyaddr, pages_req; 2751 mfn_t lo_mfn; 2752 mfn_t hi_mfn; 2753 pgcnt_t pfnalign = 0; 2754 int align; 2755 int is_domu = 0; 2756 int dummy, bytes_got; 2757 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2758 2759 ASSERT(mattr != NULL); 2760 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2761 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2762 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2763 if (align > MMU_PAGESIZE) 2764 pfnalign = mmu_btop(align); 2765 2766 /* 2767 * Clear the contig flag if only one page is needed or the scatter 2768 * gather list length is >= npages. 2769 */ 2770 pages_req = npages = mmu_btopr(bytes); 2771 contig = (flags & PG_PHYSCONTIG); 2772 bytes = P2ROUNDUP(bytes, MMU_PAGESIZE); 2773 if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages) 2774 contig = 0; 2775 2776 /* 2777 * Check if any old page in the system is fine. 2778 * DomU should always go down this path. 
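 * A domU request is always satisfied straight from page_create_va()
 * or not at all; as noted further down, domU should never reach the
 * contiguous allocation code in this function.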
2779 */ 2780 is_domu = !DOMAIN_IS_INITDOMAIN(xen_info); 2781 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 2782 if ((!contig && anyaddr) || is_domu) { 2783 flags &= ~PG_PHYSCONTIG; 2784 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr); 2785 if (plist != NULL) 2786 return (plist); 2787 else if (is_domu) 2788 return (NULL); /* no memory available */ 2789 } 2790 /* 2791 * DomU should never reach here 2792 */ 2793 if (contig) { 2794 plist = page_get_contigpages(vp, off, &npages, flags, vaddr, 2795 mattr); 2796 if (plist == NULL) 2797 goto fail; 2798 bytes_got = (pages_req - npages) << MMU_PAGESHIFT; 2799 vaddr += bytes_got; 2800 off += bytes_got; 2801 /* 2802 * We now have all the contiguous pages we need, but 2803 * we may still need additional non-contiguous pages. 2804 */ 2805 } 2806 /* 2807 * now loop collecting the requested number of pages, these do 2808 * not have to be contiguous pages but we will use the contig 2809 * page alloc code to get the pages since it will honor any 2810 * other constraints the pages may have. 2811 */ 2812 while (npages--) { 2813 dummy = -1; 2814 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr); 2815 if (pp == NULL) 2816 goto fail; 2817 page_add(&plist, pp); 2818 vaddr += MMU_PAGESIZE; 2819 off += MMU_PAGESIZE; 2820 } 2821 return (plist); 2822 fail: 2823 /* 2824 * Failed to get enough pages, return ones we did get 2825 */ 2826 return_partial_alloc(plist); 2827 return (NULL); 2828 } 2829 2830 /* 2831 * Lock and return the page with the highest mfn that we can find. last_mfn 2832 * holds the last one found, so the next search can start from there. We 2833 * also keep a counter so that we don't loop forever if the machine has no 2834 * free pages. 2835 * 2836 * This is called from the balloon thread to find pages to give away. new_high 2837 * is used when new mfn's have been added to the system - we will reset our 2838 * search if the new mfn's are higher than our current search position. 2839 */ 2840 page_t * 2841 page_get_high_mfn(mfn_t new_high) 2842 { 2843 static mfn_t last_mfn = 0; 2844 pfn_t pfn; 2845 page_t *pp; 2846 ulong_t loop_count = 0; 2847 2848 if (new_high > last_mfn) 2849 last_mfn = new_high; 2850 2851 for (; loop_count < mfn_count; loop_count++, last_mfn--) { 2852 if (last_mfn == 0) { 2853 last_mfn = cached_max_mfn; 2854 } 2855 2856 pfn = mfn_to_pfn(last_mfn); 2857 if (pfn & PFN_IS_FOREIGN_MFN) 2858 continue; 2859 2860 /* See if the page is free. If so, lock it. 
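 * page_numtopp_alloc() is expected to hand the page back exclusively
 * locked, or NULL if it cannot be claimed; the caller clears P_FREE
 * itself, which is what the ASSERTs below rely on.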
*/ 2861 pp = page_numtopp_alloc(pfn); 2862 if (pp == NULL) 2863 continue; 2864 PP_CLRFREE(pp); 2865 2866 ASSERT(PAGE_EXCL(pp)); 2867 ASSERT(pp->p_vnode == NULL); 2868 ASSERT(!hat_page_is_mapped(pp)); 2869 last_mfn--; 2870 return (pp); 2871 } 2872 return (NULL); 2873 } 2874 2875 #else /* !__xpv */ 2876 2877 /* 2878 * get a page from any list with the given mnode 2879 */ 2880 static page_t * 2881 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 2882 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 2883 { 2884 kmutex_t *pcm; 2885 int i; 2886 page_t *pp; 2887 page_t *first_pp; 2888 uint64_t pgaddr; 2889 ulong_t bin; 2890 int mtypestart; 2891 int plw_initialized; 2892 page_list_walker_t plw; 2893 2894 VM_STAT_ADD(pga_vmstats.pgma_alloc); 2895 2896 ASSERT((flags & PG_MATCH_COLOR) == 0); 2897 ASSERT(szc == 0); 2898 ASSERT(dma_attr != NULL); 2899 2900 MTYPE_START(mnode, mtype, flags); 2901 if (mtype < 0) { 2902 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 2903 return (NULL); 2904 } 2905 2906 mtypestart = mtype; 2907 2908 bin = origbin; 2909 2910 /* 2911 * check up to page_colors + 1 bins - origbin may be checked twice 2912 * because of BIN_STEP skip 2913 */ 2914 do { 2915 plw_initialized = 0; 2916 2917 for (plw.plw_count = 0; 2918 plw.plw_count < page_colors; plw.plw_count++) { 2919 2920 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 2921 goto nextfreebin; 2922 2923 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2924 mutex_enter(pcm); 2925 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2926 first_pp = pp; 2927 while (pp != NULL) { 2928 if (page_trylock(pp, SE_EXCL) == 0) { 2929 pp = pp->p_next; 2930 if (pp == first_pp) { 2931 pp = NULL; 2932 } 2933 continue; 2934 } 2935 2936 ASSERT(PP_ISFREE(pp)); 2937 ASSERT(PP_ISAGED(pp)); 2938 ASSERT(pp->p_vnode == NULL); 2939 ASSERT(pp->p_hash == NULL); 2940 ASSERT(pp->p_offset == (u_offset_t)-1); 2941 ASSERT(pp->p_szc == szc); 2942 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2943 /* check if page within DMA attributes */ 2944 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 2945 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 2946 (pgaddr + MMU_PAGESIZE - 1 <= 2947 dma_attr->dma_attr_addr_hi)) { 2948 break; 2949 } 2950 2951 /* continue looking */ 2952 page_unlock(pp); 2953 pp = pp->p_next; 2954 if (pp == first_pp) 2955 pp = NULL; 2956 2957 } 2958 if (pp != NULL) { 2959 ASSERT(mtype == PP_2_MTYPE(pp)); 2960 ASSERT(pp->p_szc == 0); 2961 2962 /* found a page with specified DMA attributes */ 2963 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 2964 mtype), pp); 2965 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2966 2967 if ((PP_ISFREE(pp) == 0) || 2968 (PP_ISAGED(pp) == 0)) { 2969 cmn_err(CE_PANIC, "page %p is not free", 2970 (void *)pp); 2971 } 2972 2973 mutex_exit(pcm); 2974 check_dma(dma_attr, pp, 1); 2975 VM_STAT_ADD(pga_vmstats.pgma_allocok); 2976 return (pp); 2977 } 2978 mutex_exit(pcm); 2979 nextfreebin: 2980 if (plw_initialized == 0) { 2981 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 2982 ASSERT(plw.plw_ceq_dif == page_colors); 2983 plw_initialized = 1; 2984 } 2985 2986 if (plw.plw_do_split) { 2987 pp = page_freelist_split(szc, bin, mnode, 2988 mtype, 2989 mmu_btop(dma_attr->dma_attr_addr_lo), 2990 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 2991 &plw); 2992 if (pp != NULL) { 2993 check_dma(dma_attr, pp, 1); 2994 return (pp); 2995 } 2996 } 2997 2998 bin = page_list_walk_next_bin(szc, bin, &plw); 2999 } 3000 3001 MTYPE_NEXT(mnode, mtype, flags); 3002 } while (mtype >= 0); 3003 3004 /* failed to find a page in the freelist; try it in the cachelist 
*/ 3005 3006 /* reset mtype start for cachelist search */ 3007 mtype = mtypestart; 3008 ASSERT(mtype >= 0); 3009 3010 /* start with the bin of matching color */ 3011 bin = origbin; 3012 3013 do { 3014 for (i = 0; i <= page_colors; i++) { 3015 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 3016 goto nextcachebin; 3017 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3018 mutex_enter(pcm); 3019 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3020 first_pp = pp; 3021 while (pp != NULL) { 3022 if (page_trylock(pp, SE_EXCL) == 0) { 3023 pp = pp->p_next; 3024 if (pp == first_pp) 3025 pp = NULL; 3026 continue; 3027 } 3028 ASSERT(pp->p_vnode); 3029 ASSERT(PP_ISAGED(pp) == 0); 3030 ASSERT(pp->p_szc == 0); 3031 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3032 3033 /* check if page within DMA attributes */ 3034 3035 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 3036 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 3037 (pgaddr + MMU_PAGESIZE - 1 <= 3038 dma_attr->dma_attr_addr_hi)) { 3039 break; 3040 } 3041 3042 /* continue looking */ 3043 page_unlock(pp); 3044 pp = pp->p_next; 3045 if (pp == first_pp) 3046 pp = NULL; 3047 } 3048 3049 if (pp != NULL) { 3050 ASSERT(mtype == PP_2_MTYPE(pp)); 3051 ASSERT(pp->p_szc == 0); 3052 3053 /* found a page with specified DMA attributes */ 3054 page_sub(&PAGE_CACHELISTS(mnode, bin, 3055 mtype), pp); 3056 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3057 3058 mutex_exit(pcm); 3059 ASSERT(pp->p_vnode); 3060 ASSERT(PP_ISAGED(pp) == 0); 3061 check_dma(dma_attr, pp, 1); 3062 VM_STAT_ADD(pga_vmstats.pgma_allocok); 3063 return (pp); 3064 } 3065 mutex_exit(pcm); 3066 nextcachebin: 3067 bin += (i == 0) ? BIN_STEP : 1; 3068 bin &= page_colors_mask; 3069 } 3070 MTYPE_NEXT(mnode, mtype, flags); 3071 } while (mtype >= 0); 3072 3073 VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 3074 return (NULL); 3075 } 3076 3077 /* 3078 * This function is similar to page_get_freelist()/page_get_cachelist() 3079 * but it searches both the lists to find a page with the specified 3080 * color (or no color) and DMA attributes. The search is done in the 3081 * freelist first and then in the cache list within the highest memory 3082 * range (based on DMA attributes) before searching in the lower 3083 * memory ranges. 3084 * 3085 * Note: This function is called only by page_create_io(). 3086 */ 3087 /*ARGSUSED*/ 3088 static page_t * 3089 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 3090 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 3091 { 3092 uint_t bin; 3093 int mtype; 3094 page_t *pp; 3095 int n; 3096 int m; 3097 int szc; 3098 int fullrange; 3099 int mnode; 3100 int local_failed_stat = 0; 3101 lgrp_mnode_cookie_t lgrp_cookie; 3102 3103 VM_STAT_ADD(pga_vmstats.pga_alloc); 3104 3105 /* only base pagesize currently supported */ 3106 if (size != MMU_PAGESIZE) 3107 return (NULL); 3108 3109 /* 3110 * If we're passed a specific lgroup, we use it. Otherwise, 3111 * assume first-touch placement is desired. 3112 */ 3113 if (!LGRP_EXISTS(lgrp)) 3114 lgrp = lgrp_home_lgrp(); 3115 3116 /* LINTED */ 3117 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3118 3119 /* 3120 * Only hold one freelist or cachelist lock at a time, that way we 3121 * can start anywhere and not have to worry about lock 3122 * ordering. 
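 * The DMA limits are first reduced to a span of mtypes (n..m below)
 * via pfn_2_mtype(), and the search walks from the highest usable
 * mtype down, which tends to preserve low memory for more tightly
 * constrained devices.  A device limited to 32-bit addresses, for
 * example, is only ever offered memory below 4G, highest range first.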
3123 */ 3124 if (dma_attr == NULL) { 3125 n = 0; 3126 m = mnoderangecnt - 1; 3127 fullrange = 1; 3128 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 3129 } else { 3130 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 3131 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 3132 3133 /* 3134 * We can guarantee alignment only for page boundary. 3135 */ 3136 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 3137 return (NULL); 3138 3139 n = pfn_2_mtype(pfnlo); 3140 m = pfn_2_mtype(pfnhi); 3141 3142 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 3143 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 3144 } 3145 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 3146 3147 if (n > m) 3148 return (NULL); 3149 3150 szc = 0; 3151 3152 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 3153 if (n == 0) { 3154 flags |= PGI_MT_RANGE0; 3155 n = m; 3156 } 3157 3158 /* 3159 * Try local memory node first, but try remote if we can't 3160 * get a page of the right color. 3161 */ 3162 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 3163 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3164 /* 3165 * allocate pages from high pfn to low. 3166 */ 3167 for (mtype = m; mtype >= n; mtype--) { 3168 if (fullrange != 0) { 3169 pp = page_get_mnode_freelist(mnode, 3170 bin, mtype, szc, flags); 3171 if (pp == NULL) { 3172 pp = page_get_mnode_cachelist( 3173 bin, flags, mnode, mtype); 3174 } 3175 } else { 3176 pp = page_get_mnode_anylist(bin, szc, 3177 flags, mnode, mtype, dma_attr); 3178 } 3179 if (pp != NULL) { 3180 VM_STAT_ADD(pga_vmstats.pga_allocok); 3181 check_dma(dma_attr, pp, 1); 3182 return (pp); 3183 } 3184 } 3185 if (!local_failed_stat) { 3186 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3187 local_failed_stat = 1; 3188 } 3189 } 3190 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 3191 3192 return (NULL); 3193 } 3194 3195 /* 3196 * page_create_io() 3197 * 3198 * This function is a copy of page_create_va() with an additional 3199 * argument 'mattr' that specifies DMA memory requirements to 3200 * the page list functions. This function is used by the segkmem 3201 * allocator so it is only to create new pages (i.e PG_EXCL is 3202 * set). 3203 * 3204 * Note: This interface is currently used by x86 PSM only and is 3205 * not fully specified so the commitment level is only for 3206 * private interface specific to x86. This interface uses PSM 3207 * specific page_get_anylist() interface. 3208 */ 3209 3210 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 3211 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 3212 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 3213 break; \ 3214 } \ 3215 } 3216 3217 3218 page_t * 3219 page_create_io( 3220 struct vnode *vp, 3221 u_offset_t off, 3222 uint_t bytes, 3223 uint_t flags, 3224 struct as *as, 3225 caddr_t vaddr, 3226 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 3227 { 3228 page_t *plist = NULL; 3229 uint_t plist_len = 0; 3230 pgcnt_t npages; 3231 page_t *npp = NULL; 3232 uint_t pages_req; 3233 page_t *pp; 3234 kmutex_t *phm = NULL; 3235 uint_t index; 3236 3237 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 3238 "page_create_start:vp %p off %llx bytes %u flags %x", 3239 vp, off, bytes, flags); 3240 3241 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 3242 3243 pages_req = npages = mmu_btopr(bytes); 3244 3245 /* 3246 * Do the freemem and pcf accounting. 
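 * page_create_wait() charges npages against freemem and the pcf
 * counters up front; every failure path below gives that reservation
 * back, either via page_create_putback() or implicitly when
 * VN_DISPOSE() frees pages that were already counted.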
3247 */ 3248 if (!page_create_wait(npages, flags)) { 3249 return (NULL); 3250 } 3251 3252 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 3253 "page_create_success:vp %p off %llx", vp, off); 3254 3255 /* 3256 * If satisfying this request has left us with too little 3257 * memory, start the wheels turning to get some back. The 3258 * first clause of the test prevents waking up the pageout 3259 * daemon in situations where it would decide that there's 3260 * nothing to do. 3261 */ 3262 if (nscan < desscan && freemem < minfree) { 3263 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 3264 "pageout_cv_signal:freemem %ld", freemem); 3265 cv_signal(&proc_pageout->p_cv); 3266 } 3267 3268 if (flags & PG_PHYSCONTIG) { 3269 3270 plist = page_get_contigpage(&npages, mattr, 1); 3271 if (plist == NULL) { 3272 page_create_putback(npages); 3273 return (NULL); 3274 } 3275 3276 pp = plist; 3277 3278 do { 3279 if (!page_hashin(pp, vp, off, NULL)) { 3280 panic("pg_creat_io: hashin failed %p %p %llx", 3281 (void *)pp, (void *)vp, off); 3282 } 3283 VM_STAT_ADD(page_create_new); 3284 off += MMU_PAGESIZE; 3285 PP_CLRFREE(pp); 3286 PP_CLRAGED(pp); 3287 page_set_props(pp, P_REF); 3288 pp = pp->p_next; 3289 } while (pp != plist); 3290 3291 if (!npages) { 3292 check_dma(mattr, plist, pages_req); 3293 return (plist); 3294 } else { 3295 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 3296 } 3297 3298 /* 3299 * fall-thru: 3300 * 3301 * page_get_contigpage returns when npages <= sgllen. 3302 * Grab the rest of the non-contig pages below from anylist. 3303 */ 3304 } 3305 3306 /* 3307 * Loop around collecting the requested number of pages. 3308 * Most of the time, we have to `create' a new page. With 3309 * this in mind, pull the page off the free list before 3310 * getting the hash lock. This will minimize the hash 3311 * lock hold time, nesting, and the like. If it turns 3312 * out we don't need the page, we put it back at the end. 3313 */ 3314 while (npages--) { 3315 phm = NULL; 3316 3317 index = PAGE_HASH_FUNC(vp, off); 3318 top: 3319 ASSERT(phm == NULL); 3320 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 3321 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3322 3323 if (npp == NULL) { 3324 /* 3325 * Try to get the page of any color either from 3326 * the freelist or from the cache list. 3327 */ 3328 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 3329 flags & ~PG_MATCH_COLOR, mattr, NULL); 3330 if (npp == NULL) { 3331 if (mattr == NULL) { 3332 /* 3333 * Not looking for a special page; 3334 * panic! 3335 */ 3336 panic("no page found %d", (int)npages); 3337 } 3338 /* 3339 * No page found! This can happen 3340 * if we are looking for a page 3341 * within a specific memory range 3342 * for DMA purposes. If PG_WAIT is 3343 * specified then we wait for a 3344 * while and then try again. The 3345 * wait could be forever if we 3346 * don't get the page(s) we need. 3347 * 3348 * Note: XXX We really need a mechanism 3349 * to wait for pages in the desired 3350 * range. For now, we wait for any 3351 * pages and see if we can use it. 3352 */ 3353 3354 if ((mattr != NULL) && (flags & PG_WAIT)) { 3355 delay(10); 3356 goto top; 3357 } 3358 goto fail; /* undo accounting stuff */ 3359 } 3360 3361 if (PP_ISAGED(npp) == 0) { 3362 /* 3363 * Since this page came from the 3364 * cachelist, we must destroy the 3365 * old vnode association. 3366 */ 3367 page_hashout(npp, (kmutex_t *)NULL); 3368 } 3369 } 3370 3371 /* 3372 * We own this page! 
3373 */ 3374 ASSERT(PAGE_EXCL(npp)); 3375 ASSERT(npp->p_vnode == NULL); 3376 ASSERT(!hat_page_is_mapped(npp)); 3377 PP_CLRFREE(npp); 3378 PP_CLRAGED(npp); 3379 3380 /* 3381 * Here we have a page in our hot little mits and are 3382 * just waiting to stuff it on the appropriate lists. 3383 * Get the mutex and check to see if it really does 3384 * not exist. 3385 */ 3386 phm = PAGE_HASH_MUTEX(index); 3387 mutex_enter(phm); 3388 PAGE_HASH_SEARCH(index, pp, vp, off); 3389 if (pp == NULL) { 3390 VM_STAT_ADD(page_create_new); 3391 pp = npp; 3392 npp = NULL; 3393 if (!page_hashin(pp, vp, off, phm)) { 3394 /* 3395 * Since we hold the page hash mutex and 3396 * just searched for this page, page_hashin 3397 * had better not fail. If it does, that 3398 * means somethread did not follow the 3399 * page hash mutex rules. Panic now and 3400 * get it over with. As usual, go down 3401 * holding all the locks. 3402 */ 3403 ASSERT(MUTEX_HELD(phm)); 3404 panic("page_create: hashin fail %p %p %llx %p", 3405 (void *)pp, (void *)vp, off, (void *)phm); 3406 3407 } 3408 ASSERT(MUTEX_HELD(phm)); 3409 mutex_exit(phm); 3410 phm = NULL; 3411 3412 /* 3413 * Hat layer locking need not be done to set 3414 * the following bits since the page is not hashed 3415 * and was on the free list (i.e., had no mappings). 3416 * 3417 * Set the reference bit to protect 3418 * against immediate pageout 3419 * 3420 * XXXmh modify freelist code to set reference 3421 * bit so we don't have to do it here. 3422 */ 3423 page_set_props(pp, P_REF); 3424 } else { 3425 ASSERT(MUTEX_HELD(phm)); 3426 mutex_exit(phm); 3427 phm = NULL; 3428 /* 3429 * NOTE: This should not happen for pages associated 3430 * with kernel vnode 'kvp'. 3431 */ 3432 /* XX64 - to debug why this happens! */ 3433 ASSERT(!VN_ISKAS(vp)); 3434 if (VN_ISKAS(vp)) 3435 cmn_err(CE_NOTE, 3436 "page_create: page not expected " 3437 "in hash list for kernel vnode - pp 0x%p", 3438 (void *)pp); 3439 VM_STAT_ADD(page_create_exists); 3440 goto fail; 3441 } 3442 3443 /* 3444 * Got a page! It is locked. Acquire the i/o 3445 * lock since we are going to use the p_next and 3446 * p_prev fields to link the requested pages together. 3447 */ 3448 page_io_lock(pp); 3449 page_add(&plist, pp); 3450 plist = plist->p_next; 3451 off += MMU_PAGESIZE; 3452 vaddr += MMU_PAGESIZE; 3453 } 3454 3455 check_dma(mattr, plist, pages_req); 3456 return (plist); 3457 3458 fail: 3459 if (npp != NULL) { 3460 /* 3461 * Did not need this page after all. 3462 * Put it back on the free list. 3463 */ 3464 VM_STAT_ADD(page_create_putbacks); 3465 PP_SETFREE(npp); 3466 PP_SETAGED(npp); 3467 npp->p_offset = (u_offset_t)-1; 3468 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 3469 page_unlock(npp); 3470 } 3471 3472 /* 3473 * Give up the pages we already got. 3474 */ 3475 while (plist != NULL) { 3476 pp = plist; 3477 page_sub(&plist, pp); 3478 page_io_unlock(pp); 3479 plist_len++; 3480 /*LINTED: constant in conditional ctx*/ 3481 VN_DISPOSE(pp, B_INVAL, 0, kcred); 3482 } 3483 3484 /* 3485 * VN_DISPOSE does freemem accounting for the pages in plist 3486 * by calling page_free. So, we need to undo the pcf accounting 3487 * for only the remaining pages. 3488 */ 3489 VM_STAT_ADD(page_create_putbacks); 3490 page_create_putback(pages_req - plist_len); 3491 3492 return (NULL); 3493 } 3494 #endif /* !__xpv */ 3495 3496 3497 /* 3498 * Copy the data from the physical page represented by "frompp" to 3499 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 3500 * CPU->cpu_caddr2. 
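 * A typical call is simply ppcopy(oldpp, newpp) with both pages held
 * locked; a return of 0 means the copy faulted partway through (for
 * example on a bad source page) and the destination contents must be
 * treated as garbage.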
It assumes that no one uses either map at interrupt 3501 * level and no one sleeps with an active mapping there. 3502 * 3503 * Note that the ref/mod bits in the page_t's are not affected by 3504 * this operation, hence it is up to the caller to update them appropriately. 3505 */ 3506 int 3507 ppcopy(page_t *frompp, page_t *topp) 3508 { 3509 caddr_t pp_addr1; 3510 caddr_t pp_addr2; 3511 hat_mempte_t pte1; 3512 hat_mempte_t pte2; 3513 kmutex_t *ppaddr_mutex; 3514 label_t ljb; 3515 int ret = 1; 3516 3517 ASSERT_STACK_ALIGNED(); 3518 ASSERT(PAGE_LOCKED(frompp)); 3519 ASSERT(PAGE_LOCKED(topp)); 3520 3521 if (kpm_enable) { 3522 pp_addr1 = hat_kpm_page2va(frompp, 0); 3523 pp_addr2 = hat_kpm_page2va(topp, 0); 3524 kpreempt_disable(); 3525 } else { 3526 /* 3527 * disable pre-emption so that CPU can't change 3528 */ 3529 kpreempt_disable(); 3530 3531 pp_addr1 = CPU->cpu_caddr1; 3532 pp_addr2 = CPU->cpu_caddr2; 3533 pte1 = CPU->cpu_caddr1pte; 3534 pte2 = CPU->cpu_caddr2pte; 3535 3536 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3537 mutex_enter(ppaddr_mutex); 3538 3539 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 3540 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 3541 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 3542 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3543 HAT_LOAD_NOCONSIST); 3544 } 3545 3546 if (on_fault(&ljb)) { 3547 ret = 0; 3548 goto faulted; 3549 } 3550 if (use_sse_pagecopy) 3551 #ifdef __xpv 3552 page_copy_no_xmm(pp_addr2, pp_addr1); 3553 #else 3554 hwblkpagecopy(pp_addr1, pp_addr2); 3555 #endif 3556 else 3557 bcopy(pp_addr1, pp_addr2, PAGESIZE); 3558 3559 no_fault(); 3560 faulted: 3561 if (!kpm_enable) { 3562 #ifdef __xpv 3563 /* 3564 * We can't leave unused mappings laying about under the 3565 * hypervisor, so blow them away. 3566 */ 3567 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0, 3568 UVMF_INVLPG | UVMF_LOCAL) < 0) 3569 panic("HYPERVISOR_update_va_mapping() failed"); 3570 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3571 UVMF_INVLPG | UVMF_LOCAL) < 0) 3572 panic("HYPERVISOR_update_va_mapping() failed"); 3573 #endif 3574 mutex_exit(ppaddr_mutex); 3575 } 3576 kpreempt_enable(); 3577 return (ret); 3578 } 3579 3580 void 3581 pagezero(page_t *pp, uint_t off, uint_t len) 3582 { 3583 ASSERT(PAGE_LOCKED(pp)); 3584 pfnzero(page_pptonum(pp), off, len); 3585 } 3586 3587 /* 3588 * Zero the physical page from off to off + len given by pfn 3589 * without changing the reference and modified bits of page. 3590 * 3591 * We use this using CPU private page address #2, see ppcopy() for more info. 3592 * pfnzero() must not be called at interrupt level. 3593 */ 3594 void 3595 pfnzero(pfn_t pfn, uint_t off, uint_t len) 3596 { 3597 caddr_t pp_addr2; 3598 hat_mempte_t pte2; 3599 kmutex_t *ppaddr_mutex = NULL; 3600 3601 ASSERT_STACK_ALIGNED(); 3602 ASSERT(len <= MMU_PAGESIZE); 3603 ASSERT(off <= MMU_PAGESIZE); 3604 ASSERT(off + len <= MMU_PAGESIZE); 3605 3606 if (kpm_enable && !pfn_is_foreign(pfn)) { 3607 pp_addr2 = hat_kpm_pfn2va(pfn); 3608 kpreempt_disable(); 3609 } else { 3610 kpreempt_disable(); 3611 3612 pp_addr2 = CPU->cpu_caddr2; 3613 pte2 = CPU->cpu_caddr2pte; 3614 3615 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3616 mutex_enter(ppaddr_mutex); 3617 3618 hat_mempte_remap(pfn, pp_addr2, pte2, 3619 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3620 HAT_LOAD_NOCONSIST); 3621 } 3622 3623 if (use_sse_pagezero) { 3624 #ifdef __xpv 3625 uint_t rem; 3626 3627 /* 3628 * zero a byte at a time until properly aligned for 3629 * block_zero_no_xmm(). 
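 * (The rest of the range is then handled below: the largest properly
 * aligned and sized middle chunk goes through block_zero_no_xmm() and
 * any remaining tail bytes are zeroed with byte stores.)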
3630 */ 3631 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0) 3632 pp_addr2[off++] = 0; 3633 3634 /* 3635 * Now use faster block_zero_no_xmm() for any range 3636 * that is properly aligned and sized. 3637 */ 3638 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN)); 3639 len -= rem; 3640 if (len != 0) { 3641 block_zero_no_xmm(pp_addr2 + off, len); 3642 off += len; 3643 } 3644 3645 /* 3646 * zero remainder with byte stores. 3647 */ 3648 while (rem-- > 0) 3649 pp_addr2[off++] = 0; 3650 #else 3651 hwblkclr(pp_addr2 + off, len); 3652 #endif 3653 } else { 3654 bzero(pp_addr2 + off, len); 3655 } 3656 3657 if (!kpm_enable || pfn_is_foreign(pfn)) { 3658 #ifdef __xpv 3659 /* 3660 * On the hypervisor this page might get used for a page 3661 * table before any intervening change to this mapping, 3662 * so blow it away. 3663 */ 3664 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3665 UVMF_INVLPG) < 0) 3666 panic("HYPERVISOR_update_va_mapping() failed"); 3667 #endif 3668 mutex_exit(ppaddr_mutex); 3669 } 3670 3671 kpreempt_enable(); 3672 } 3673 3674 /* 3675 * Platform-dependent page scrub call. 3676 */ 3677 void 3678 pagescrub(page_t *pp, uint_t off, uint_t len) 3679 { 3680 /* 3681 * For now, we rely on the fact that pagezero() will 3682 * always clear UEs. 3683 */ 3684 pagezero(pp, off, len); 3685 } 3686 3687 /* 3688 * set up two private addresses for use on a given CPU for use in ppcopy() 3689 */ 3690 void 3691 setup_vaddr_for_ppcopy(struct cpu *cpup) 3692 { 3693 void *addr; 3694 hat_mempte_t pte_pa; 3695 3696 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3697 pte_pa = hat_mempte_setup(addr); 3698 cpup->cpu_caddr1 = addr; 3699 cpup->cpu_caddr1pte = pte_pa; 3700 3701 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3702 pte_pa = hat_mempte_setup(addr); 3703 cpup->cpu_caddr2 = addr; 3704 cpup->cpu_caddr2pte = pte_pa; 3705 3706 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 3707 } 3708 3709 /* 3710 * Undo setup_vaddr_for_ppcopy 3711 */ 3712 void 3713 teardown_vaddr_for_ppcopy(struct cpu *cpup) 3714 { 3715 mutex_destroy(&cpup->cpu_ppaddr_mutex); 3716 3717 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 3718 cpup->cpu_caddr2pte = 0; 3719 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 3720 cpup->cpu_caddr2 = 0; 3721 3722 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 3723 cpup->cpu_caddr1pte = 0; 3724 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 3725 cpup->cpu_caddr1 = 0; 3726 } 3727 3728 /* 3729 * Function for flushing D-cache when performing module relocations 3730 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 3731 */ 3732 void 3733 dcache_flushall() 3734 {} 3735 3736 size_t 3737 exec_get_spslew(void) 3738 { 3739 return (0); 3740 } 3741 3742 /* 3743 * Allocate a memory page. The argument 'seed' can be any pseudo-random 3744 * number to vary where the pages come from. This is quite a hacked up 3745 * method -- it works for now, but really needs to be fixed up a bit. 3746 * 3747 * We currently use page_create_va() on the kvp with fake offsets, 3748 * segments and virt address. This is pretty bogus, but was copied from the 3749 * old hat_i86.c code. A better approach would be to specify either mnode 3750 * random or mnode local and takes a page from whatever color has the MOST 3751 * available - this would have a minimal impact on page coloring. 
3752 */ 3753 page_t * 3754 page_get_physical(uintptr_t seed) 3755 { 3756 page_t *pp; 3757 u_offset_t offset; 3758 static struct seg tmpseg; 3759 static uintptr_t ctr = 0; 3760 3761 /* 3762 * This code is gross, we really need a simpler page allocator. 3763 * 3764 * We need to assign an offset for the page to call page_create_va() 3765 * To avoid conflicts with other pages, we get creative with the offset. 3766 * For 32 bits, we need an offset > 4Gig 3767 * For 64 bits, need an offset somewhere in the VA hole. 3768 */ 3769 offset = seed; 3770 if (offset > kernelbase) 3771 offset -= kernelbase; 3772 offset <<= MMU_PAGESHIFT; 3773 #if defined(__amd64) 3774 offset += mmu.hole_start; /* something in VA hole */ 3775 #else 3776 offset += 1ULL << 40; /* something > 4 Gig */ 3777 #endif 3778 3779 if (page_resv(1, KM_NOSLEEP) == 0) 3780 return (NULL); 3781 3782 #ifdef DEBUG 3783 pp = page_exists(&kvp, offset); 3784 if (pp != NULL) 3785 panic("page already exists %p", (void *)pp); 3786 #endif 3787 3788 pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL, 3789 &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */ 3790 if (pp != NULL) { 3791 page_io_unlock(pp); 3792 page_hashout(pp, NULL); 3793 page_downgrade(pp); 3794 } 3795 return (pp); 3796 } 3797
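
/*
 * Illustrative sketch only, not compiled and not part of the original
 * interface: it gathers in one place how the allocators above reduce a
 * ddi_dma_attr_t to the pfn-level constraints they actually test.  The
 * structure and helper names are this sketch's own.
 */
#if 0
typedef struct dma_pfn_limits {
	pfn_t		dpl_lo;		/* lowest usable frame */
	pfn_t		dpl_hi;		/* highest usable frame */
	uint64_t	dpl_seg;	/* frames must not cross this boundary */
	pgcnt_t		dpl_align;	/* required frame alignment, 0 if none */
} dma_pfn_limits_t;

static void
dma_attr_to_pfn_limits(ddi_dma_attr_t *mattr, dma_pfn_limits_t *dpl)
{
	int align;

	dpl->dpl_lo = mmu_btop(mattr->dma_attr_addr_lo);
	dpl->dpl_hi = mmu_btop(mattr->dma_attr_addr_hi);
	dpl->dpl_seg = mmu_btop(mattr->dma_attr_seg);

	/*
	 * Same derivation page_io_pool_alloc() and page_get_contigpages()
	 * use: only alignments larger than a page matter at this level.
	 */
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	dpl->dpl_align = (align > MMU_PAGESIZE) ? mmu_btop(align) : 0;
}
#endif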