1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 /* 35 * UNIX machine dependent virtual memory support. 
36 */ 37 38 #include <sys/types.h> 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/user.h> 42 #include <sys/proc.h> 43 #include <sys/kmem.h> 44 #include <sys/vmem.h> 45 #include <sys/buf.h> 46 #include <sys/cpuvar.h> 47 #include <sys/lgrp.h> 48 #include <sys/disp.h> 49 #include <sys/vm.h> 50 #include <sys/mman.h> 51 #include <sys/vnode.h> 52 #include <sys/cred.h> 53 #include <sys/exec.h> 54 #include <sys/exechdr.h> 55 #include <sys/debug.h> 56 #include <sys/vmsystm.h> 57 58 #include <vm/hat.h> 59 #include <vm/as.h> 60 #include <vm/seg.h> 61 #include <vm/seg_kp.h> 62 #include <vm/seg_vn.h> 63 #include <vm/page.h> 64 #include <vm/seg_kmem.h> 65 #include <vm/seg_kpm.h> 66 #include <vm/vm_dep.h> 67 68 #include <sys/cpu.h> 69 #include <sys/vm_machparam.h> 70 #include <sys/memlist.h> 71 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */ 72 #include <vm/hat_i86.h> 73 #include <sys/x86_archext.h> 74 #include <sys/elf_386.h> 75 #include <sys/cmn_err.h> 76 #include <sys/archsystm.h> 77 #include <sys/machsystm.h> 78 79 #include <sys/vtrace.h> 80 #include <sys/ddidmareq.h> 81 #include <sys/promif.h> 82 #include <sys/memnode.h> 83 #include <sys/stack.h> 84 #include <util/qsort.h> 85 #include <sys/taskq.h> 86 87 #ifdef __xpv 88 89 #include <sys/hypervisor.h> 90 #include <sys/xen_mmu.h> 91 #include <sys/balloon_impl.h> 92 93 /* 94 * domain 0 pages usable for DMA are kept pre-allocated and kept in 95 * distinct lists, ordered by increasing mfn. 
96 */ 97 static kmutex_t io_pool_lock; 98 static kmutex_t contig_list_lock; 99 static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */ 100 static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */ 101 static long io_pool_cnt; 102 static long io_pool_cnt_max = 0; 103 #define DEFAULT_IO_POOL_MIN 128 104 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN; 105 static long io_pool_cnt_lowater = 0; 106 static long io_pool_shrink_attempts; /* how many times did we try to shrink */ 107 static long io_pool_shrinks; /* how many times did we really shrink */ 108 static long io_pool_grows; /* how many times did we grow */ 109 static mfn_t start_mfn = 1; 110 static caddr_t io_pool_kva; /* use to alloc pages when needed */ 111 112 static int create_contig_pfnlist(uint_t); 113 114 /* 115 * percentage of phys mem to hold in the i/o pool 116 */ 117 #define DEFAULT_IO_POOL_PCT 2 118 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT; 119 static void page_io_pool_sub(page_t **, page_t *, page_t *); 120 int ioalloc_dbg = 0; 121 122 #endif /* __xpv */ 123 124 uint_t vac_colors = 1; 125 126 int largepagesupport = 0; 127 extern uint_t page_create_new; 128 extern uint_t page_create_exists; 129 extern uint_t page_create_putbacks; 130 extern uint_t page_create_putbacks; 131 /* 132 * Allow users to disable the kernel's use of SSE. 133 */ 134 extern int use_sse_pagecopy, use_sse_pagezero; 135 136 /* 137 * combined memory ranges from mnode and memranges[] to manage single 138 * mnode/mtype dimension in the page lists. 
139 */ 140 typedef struct { 141 pfn_t mnr_pfnlo; 142 pfn_t mnr_pfnhi; 143 int mnr_mnode; 144 int mnr_memrange; /* index into memranges[] */ 145 /* maintain page list stats */ 146 pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */ 147 pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */ 148 pgcnt_t mnr_mt_totcnt; /* sum of cache and free lists */ 149 #ifdef DEBUG 150 struct mnr_mts { /* mnode/mtype szc stats */ 151 pgcnt_t mnr_mts_pgcnt; 152 int mnr_mts_colors; 153 pgcnt_t *mnr_mtsc_pgcnt; 154 } *mnr_mts; 155 #endif 156 } mnoderange_t; 157 158 #define MEMRANGEHI(mtype) \ 159 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax) 160 #define MEMRANGELO(mtype) (memranges[mtype]) 161 162 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt) 163 164 /* 165 * As the PC architecture evolved memory up was clumped into several 166 * ranges for various historical I/O devices to do DMA. 167 * < 16Meg - ISA bus 168 * < 2Gig - ??? 169 * < 4Gig - PCI bus or drivers that don't understand PAE mode 170 * 171 * These are listed in reverse order, so that we can skip over unused 172 * ranges on machines with small memories. 173 * 174 * For now under the Hypervisor, we'll only ever have one memrange. 175 */ 176 #define PFN_4GIG 0x100000 177 #define PFN_16MEG 0x1000 178 static pfn_t arch_memranges[NUM_MEM_RANGES] = { 179 PFN_4GIG, /* pfn range for 4G and above */ 180 0x80000, /* pfn range for 2G-4G */ 181 PFN_16MEG, /* pfn range for 16M-2G */ 182 0x00000, /* pfn range for 0-16M */ 183 }; 184 pfn_t *memranges = &arch_memranges[0]; 185 int nranges = NUM_MEM_RANGES; 186 187 /* 188 * This combines mem_node_config and memranges into one data 189 * structure to be used for page list management. 190 */ 191 mnoderange_t *mnoderanges; 192 int mnoderangecnt; 193 int mtype4g; 194 195 /* 196 * 4g memory management variables for systems with more than 4g of memory: 197 * 198 * physical memory below 4g is required for 32bit dma devices and, currently, 199 * for kmem memory. 
 * On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 1/16th (about 6%, desfree4gshift = 4) of maxmem4g.
 *
 * LOTSFREE4G is derived the same way using lotsfree4gshift; it defaults to
 * 1/8th (lotsfree4gshift = 3) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g
 * (tested as freemem4g being less than half of freemem).
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
 */

#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
static int	lotsfree4gshift = 3;	/* maxmem4g shift to derive LOTSFREE4G */

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool.
Allocations 245 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations 246 * are not restricted. 247 */ 248 249 #define FREEMEM16M MTYPE_FREEMEM(0) 250 #define DESFREE16M desfree16m 251 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \ 252 ((freemem != 0) && ((flags & PG_PANIC) == 0) && \ 253 ((freemem >= (FREEMEM16M)) || \ 254 (FREEMEM16M < (DESFREE16M + pgcnt)))) 255 256 static pgcnt_t desfree16m = 0x380; 257 258 /* 259 * This can be patched via /etc/system to allow old non-PAE aware device 260 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM. 261 */ 262 int restricted_kmemalloc = 0; 263 264 #ifdef VM_STATS 265 struct { 266 ulong_t pga_alloc; 267 ulong_t pga_notfullrange; 268 ulong_t pga_nulldmaattr; 269 ulong_t pga_allocok; 270 ulong_t pga_allocfailed; 271 ulong_t pgma_alloc; 272 ulong_t pgma_allocok; 273 ulong_t pgma_allocfailed; 274 ulong_t pgma_allocempty; 275 } pga_vmstats; 276 #endif 277 278 uint_t mmu_page_sizes; 279 280 /* How many page sizes the users can see */ 281 uint_t mmu_exported_page_sizes; 282 283 /* page sizes that legacy applications can see */ 284 uint_t mmu_legacy_page_sizes; 285 286 /* 287 * Number of pages in 1 GB. Don't enable automatic large pages if we have 288 * fewer than this many pages. 289 */ 290 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 291 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 292 293 /* 294 * Maximum and default segment size tunables for user private 295 * and shared anon memory, and user text and initialized data. 296 * These can be patched via /etc/system to allow large pages 297 * to be used for mapping application private and shared anon memory. 
 */
size_t	mcntl0_lpsize = MMU_PAGESIZE;	/* cap used when memcntl is set */
size_t	max_uheap_lpsize = MMU_PAGESIZE;	/* heap cap */
size_t	default_uheap_lpsize = MMU_PAGESIZE;	/* min heap len assumed */
size_t	max_ustack_lpsize = MMU_PAGESIZE;	/* stack cap */
size_t	default_ustack_lpsize = MMU_PAGESIZE;	/* min stack len assumed */
size_t	max_privmap_lpsize = MMU_PAGESIZE;	/* other private mappings */
size_t	max_uidata_lpsize = MMU_PAGESIZE;	/* MAP_INITDATA mappings */
size_t	max_utext_lpsize = MMU_PAGESIZE;	/* MAP_TEXT mappings */
size_t	max_shm_lpsize = MMU_PAGESIZE;		/* MAPPGSZC_SHM mappings */


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size.  Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;


/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];

/*
 * Only let one thread at a time try to coalesce large pages, to
 * prevent them from working against each other.
343 */ 344 static kmutex_t contig_lock; 345 #define CONTIG_LOCK() mutex_enter(&contig_lock); 346 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 347 348 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 349 350 /* 351 * Return the optimum page size for a given mapping 352 */ 353 /*ARGSUSED*/ 354 size_t 355 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 356 { 357 level_t l = 0; 358 size_t pgsz = MMU_PAGESIZE; 359 size_t max_lpsize; 360 uint_t mszc; 361 362 ASSERT(maptype != MAPPGSZ_VA); 363 364 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 365 return (MMU_PAGESIZE); 366 } 367 368 switch (maptype) { 369 case MAPPGSZ_HEAP: 370 case MAPPGSZ_STK: 371 max_lpsize = memcntl ? mcntl0_lpsize : (maptype == 372 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize); 373 if (max_lpsize == MMU_PAGESIZE) { 374 return (MMU_PAGESIZE); 375 } 376 if (len == 0) { 377 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase + 378 p->p_brksize - p->p_bssbase : p->p_stksize; 379 } 380 len = (maptype == MAPPGSZ_HEAP) ? MAX(len, 381 default_uheap_lpsize) : MAX(len, default_ustack_lpsize); 382 383 /* 384 * use the pages size that best fits len 385 */ 386 for (l = mmu.umax_page_level; l > 0; --l) { 387 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) { 388 continue; 389 } else { 390 pgsz = LEVEL_SIZE(l); 391 } 392 break; 393 } 394 395 mszc = (maptype == MAPPGSZ_HEAP ? 
p->p_brkpageszc : 396 p->p_stkpageszc); 397 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) { 398 pgsz = hw_page_array[mszc].hp_size; 399 } 400 return (pgsz); 401 402 case MAPPGSZ_ISM: 403 for (l = mmu.umax_page_level; l > 0; --l) { 404 if (len >= LEVEL_SIZE(l)) 405 return (LEVEL_SIZE(l)); 406 } 407 return (LEVEL_SIZE(0)); 408 } 409 return (pgsz); 410 } 411 412 static uint_t 413 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize, 414 size_t min_physmem) 415 { 416 caddr_t eaddr = addr + size; 417 uint_t szcvec = 0; 418 caddr_t raddr; 419 caddr_t readdr; 420 size_t pgsz; 421 int i; 422 423 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 424 return (0); 425 } 426 427 for (i = mmu_exported_page_sizes - 1; i > 0; i--) { 428 pgsz = page_get_pagesize(i); 429 if (pgsz > max_lpsize) { 430 continue; 431 } 432 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 433 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 434 if (raddr < addr || raddr >= readdr) { 435 continue; 436 } 437 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 438 continue; 439 } 440 /* 441 * Set szcvec to the remaining page sizes. 442 */ 443 szcvec = ((1 << (i + 1)) - 1) & ~1; 444 break; 445 } 446 return (szcvec); 447 } 448 449 /* 450 * Return a bit vector of large page size codes that 451 * can be used to map [addr, addr + len) region. 
 *
 * The size cap is chosen from the mapping kind (MAP_TEXT, MAP_INITDATA,
 * MAPPGSZC_SHM, MAPPGSZC_HEAP, MAPPGSZC_STACK, otherwise private mapping);
 * when memcntl is set the cap is mcntl0_lpsize for every kind.
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t max_lpsize = mcntl0_lpsize;

	/* no large pages at all on this MMU */
	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		if (!memcntl)
			max_lpsize = max_utext_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		if (!memcntl)
			max_lpsize = max_uidata_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		if (!memcntl)
			max_lpsize = max_shm_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		if (!memcntl)
			max_lpsize = max_uheap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		if (!memcntl)
			max_lpsize = max_ustack_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else {
		if (!memcntl)
			max_lpsize = max_privmap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));
	}
}

/*
 * Handle a pagefault.
 *
 * addr	    - faulting virtual address
 * type, rw - passed through to as_fault()
 * iskernel - non-zero: resolve against kas; zero: against the address space
 *	      of the current thread's process
 *
 * Returns FC_NOMAP for an invalid address, otherwise the as_fault() result
 * (or FC_MAKE_ERR(err) if creating the compatibility mapping fails).
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	/* remembered so the matching segkp_unmap_red() is done at "out" */
	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 * (p is only set for user faults, so nothing below may run
	 * when iskernel is set.)
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {		/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * the rest of this function implements a 3.X 4.X 5.X compatibility
	 * This code is probably not needed anymore
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

/*
 * Choose an address for a new mapping in the current process.  The user
 * limit is _userlimit32 when _MAP_LOW32 is set in flags, otherwise the
 * limit of the current process' address space; the work is done by
 * map_addr_proc().
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*
 * Always reports 0 here; both arguments are unused.
 */
/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * Every mapping will have a redzone of a single page on either side of
 * the request. This is done to leave one page unmapped between segments.
 * This is not required, but it's useful for the user because if their
 * program strays across a segment boundary, it will catch a fault
 * immediately making debugging a little easier.  Currently the redzone
 * is mandatory.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *	If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 *	must be some "power of two" multiple of pagesize.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	vacalign is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	/* the search window starts at the base of the heap */
	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	/* Make len be a multiple of PAGESIZE */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		int l = mmu.umax_page_level;

		/* largest page level whose size does not exceed len */
		while (l && len < LEVEL_SIZE(l))
			--l;

		align_amount = LEVEL_SIZE(l);
	}

	/* a MAP_ALIGN request may only raise the alignment */
	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	ASSERT(ISP2(align_amount));
	ASSERT(align_amount == 0 || align_amount >= PAGESIZE);

	/* only the phase of off within the alignment matters from here on */
	off = off & (align_amount - 1);
	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.
	 */
	if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
	    PAGESIZE, off) == 0) {
		caddr_t as_addr;

		/*
		 * addr is the highest possible address to use since we have
		 * a PAGESIZE redzone at the beginning and end.
		 */
		addr = base + slen - (PAGESIZE + len);
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount and
		 * add the offset in.
		 * If addr is greater than as_addr, len would not be large
		 * enough to include the redzone, so we must adjust down
		 * by the alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)off;
		if (addr > as_addr) {
			addr -= align_amount;
		}

		ASSERT(addr > base);
		ASSERT(addr + len < base + slen);
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off)));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

/* counts how often valid_va_range_aligned() saw lo + len wrap around */
int valid_va_range_aligned_wraparound;

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long, where the base of the range is at "off"
 * phase from an "align" boundary and there is space for a "redzone"-sized
 * redzone on either side of the range.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range (including
 * the redzone).  On failure, 0 is returned.
 *
 * "dir" (AH_LO or otherwise) is only consulted on amd64, to pick which side
 * of the VA hole to prefer when the candidate range spans the whole hole.
 */
/*ARGSUSED3*/
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    size_t align, size_t redzone, size_t off)
{
	uintptr_t hi, lo;
	size_t tot_len;

	ASSERT(align == 0 ? off == 0 : off < align);
	ASSERT(ISP2(align));
	ASSERT(align == 0 || align >= PAGESIZE);

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;
	tot_len = minlen + 2 * redzone; /* need at least this much space */

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		*lenp = 0UL - lo - 1UL;
		/* See if this really happens. If so, then we figure out why */
		valid_va_range_aligned_wraparound++;
		hi = lo + *lenp;
	}
	if (*lenp < tot_len) {
		return (0);
	}

#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				/* range ends inside the hole: clip it */
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= tot_len)
						hi = hole_start;
					else if (hi - hole_end >= tot_len)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= tot_len)
						lo = hole_end;
					else if (hole_start - lo >= tot_len)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}
#endif

	if (hi - lo < tot_len)
		return (0);

	if (align > 1) {
		/* usable area once both redzones are set aside */
		uintptr_t tlo = lo + redzone;
		uintptr_t thi = hi - redzone;
		tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
		/* rounding up wrapped around */
		if (tlo < lo + redzone) {
			return (0);
		}
		if (thi < tlo || thi - tlo < minlen) {
			return (0);
		}
	}

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
	return (1);
}

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
879 */ 880 int 881 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 882 { 883 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0)); 884 } 885 886 /* 887 * Determine whether [addr, addr+len] are valid user addresses. 888 */ 889 /*ARGSUSED*/ 890 int 891 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 892 caddr_t userlimit) 893 { 894 caddr_t eaddr = addr + len; 895 896 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 897 return (RANGE_BADADDR); 898 899 #if defined(__amd64) 900 /* 901 * Check for the VA hole 902 */ 903 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 904 return (RANGE_BADADDR); 905 #endif 906 907 return (RANGE_OKAY); 908 } 909 910 /* 911 * Return 1 if the page frame is onboard memory, else 0. 912 */ 913 int 914 pf_is_memory(pfn_t pf) 915 { 916 if (pfn_is_foreign(pf)) 917 return (0); 918 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1)); 919 } 920 921 /* 922 * return the memrange containing pfn 923 */ 924 int 925 memrange_num(pfn_t pfn) 926 { 927 int n; 928 929 for (n = 0; n < nranges - 1; ++n) { 930 if (pfn >= memranges[n]) 931 break; 932 } 933 return (n); 934 } 935 936 /* 937 * return the mnoderange containing pfn 938 */ 939 /*ARGSUSED*/ 940 int 941 pfn_2_mtype(pfn_t pfn) 942 { 943 #if defined(__xpv) 944 return (0); 945 #else 946 int n; 947 948 for (n = mnoderangecnt - 1; n >= 0; n--) { 949 if (pfn >= mnoderanges[n].mnr_pfnlo) { 950 break; 951 } 952 } 953 return (n); 954 #endif 955 } 956 957 #if !defined(__xpv) 958 /* 959 * is_contigpage_free: 960 * returns a page list of contiguous pages. It minimally has to return 961 * minctg pages. Caller determines minctg based on the scatter-gather 962 * list length. 963 * 964 * pfnp is set to the next page frame to search on return. 
 *
 *	pgcnt is decremented by the number of pages handed back on success.
 *	pfnseg is used as a mask: a run never continues across a pfn whose
 *	masked value is 0.  When iolock is set each page is also io-locked.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			/* no page, or busy: resume one past it next time */
			(*pfnp)++;
			break;
		}
		/* the page changed identity before we got the lock */
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		/* take the page off whichever list it is on */
		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
#endif	/* !__xpv */

/*
 * verify that pages being returned from allocator have correct DMA attribute
 *
 * Compiled out (expands to 0) unless DEBUG is defined.  The DEBUG version
 * walks cnt pages along p_next and panics on the first page whose machine
 * address is outside [dma_attr_addr_lo, dma_attr_addr_hi).
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
		pp = pp->p_next;
	}
}
#endif

#if !defined(__xpv)
/*
 * Gather *pgcnt free pages as at most dma_attr_sgllen physically contiguous
 * runs that satisfy mattr (a single run anywhere below physmax when mattr is
 * NULL).  Returns the page list, or NULL on failure; *pgcnt is reduced by
 * is_contigpage_free() as pages are collected.
 *
 * startpfn and lastctgcnt are static: they carry the search position and the
 * last run length from one call to the next, and are only touched while
 * CONTIG_LOCK is held.
 */
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where last searched if the minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* first pass: from startpfn up to hi */
	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				/*
				 * NOTE(review): *pgcnt is the count still
				 * outstanding (already reduced by
				 * is_contigpage_free()), not the number of
				 * pages on pplist, so this verifies at most
				 * that many pages -- confirm intended.
				 */
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}

	/* cannot find contig pages in specified range */
	if (startpfn == lo) {
		/*
		 * NOTE(review): any runs already collected on pplist are
		 * not put back on this path (nor on the final failure
		 * return below) -- confirm whether that can happen.
		 */
		CONTIG_UNLOCK();
		return (NULL);
	}

	/* did not start with lo previously */
	pfn = lo;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	/* allow search to go above startpfn */
	while (pfn < startpfn) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist != NULL) {

			page_list_concat(&pplist, &plist);
			sgllen--;

			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
		if (pfnalign)
			pfn = P2ROUNDUP(pfn, pfnalign);
	}
	CONTIG_UNLOCK();
	return (NULL);
}
#endif	/* !__xpv */

/*
 * mnode_range_cnt() calculates the number of memory ranges for mnode and
 * memranges[].  Used to determine the size of page lists and mnoderanges.
 *
 * Returns 0 for a memory node that does not exist; always 1 under __xpv.
 */
int
mnode_range_cnt(int mnode)
{
#if defined(__xpv)
	ASSERT(mnode == 0);
	return (1);
#else	/* __xpv */
	int	mri;
	int	mnrcnt = 0;

	if (mem_node_config[mnode].exists != 0) {
		mri = nranges - 1;

		/* find the memranges index below contained in mnode range */

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		/*
		 * increment mnode range counter when memranges or mnode
		 * boundary is reached.
		 */
		while (mri >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
			mnrcnt++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
	ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
	return (mnrcnt);
#endif	/* __xpv */
}

/*
 * mnode_range_setup() initializes mnoderanges.
1246 */ 1247 void 1248 mnode_range_setup(mnoderange_t *mnoderanges) 1249 { 1250 int mnode, mri; 1251 1252 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 1253 if (mem_node_config[mnode].exists == 0) 1254 continue; 1255 1256 mri = nranges - 1; 1257 1258 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1259 mri--; 1260 1261 while (mri >= 0 && mem_node_config[mnode].physmax >= 1262 MEMRANGELO(mri)) { 1263 mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri), 1264 mem_node_config[mnode].physbase); 1265 mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri), 1266 mem_node_config[mnode].physmax); 1267 mnoderanges->mnr_mnode = mnode; 1268 mnoderanges->mnr_memrange = mri; 1269 mnoderanges++; 1270 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1271 mri--; 1272 else 1273 break; 1274 } 1275 } 1276 } 1277 1278 /*ARGSUSED*/ 1279 int 1280 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) 1281 { 1282 int mtype = mnoderangecnt - 1; 1283 1284 #if !defined(__xpv) 1285 #if defined(__i386) 1286 /* 1287 * set the mtype range 1288 * - kmem requests needs to be below 4g if restricted_kmemalloc is set. 1289 * - for non kmem requests, set range to above 4g if memory below 4g 1290 * runs low. 
1291 */ 1292 if (restricted_kmemalloc && VN_ISKAS(vp) && 1293 (caddr_t)(vaddr) >= kernelheap && 1294 (caddr_t)(vaddr) < ekernelheap) { 1295 ASSERT(physmax4g); 1296 mtype = mtype4g; 1297 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz), 1298 btop(pgsz), *flags)) { 1299 *flags |= PGI_MT_RANGE16M; 1300 } else { 1301 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1302 VM_STAT_COND_ADD((*flags & PG_PANIC), 1303 vmm_vmstats.pgpanicalloc); 1304 *flags |= PGI_MT_RANGE0; 1305 } 1306 return (mtype); 1307 } 1308 #endif /* __i386 */ 1309 1310 if (RESTRICT4G_ALLOC) { 1311 VM_STAT_ADD(vmm_vmstats.restrict4gcnt); 1312 /* here only for > 4g systems */ 1313 *flags |= PGI_MT_RANGE4G; 1314 } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) { 1315 *flags |= PGI_MT_RANGE16M; 1316 } else { 1317 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1318 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc); 1319 *flags |= PGI_MT_RANGE0; 1320 } 1321 #endif /* !__xpv */ 1322 return (mtype); 1323 } 1324 1325 1326 /* mtype init for page_get_replacement_page */ 1327 /*ARGSUSED*/ 1328 int 1329 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt) 1330 { 1331 int mtype = mnoderangecnt - 1; 1332 #if !defined(__ixpv) 1333 if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) { 1334 *flags |= PGI_MT_RANGE16M; 1335 } else { 1336 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1337 *flags |= PGI_MT_RANGE0; 1338 } 1339 #endif 1340 return (mtype); 1341 } 1342 1343 /* 1344 * Determine if the mnode range specified in mtype contains memory belonging 1345 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1346 * the range of indices from high pfn to 0, 16m or 4g. 1347 * 1348 * Return first mnode range type index found otherwise return -1 if none found. 
1349 */ 1350 int 1351 mtype_func(int mnode, int mtype, uint_t flags) 1352 { 1353 if (flags & PGI_MT_RANGE) { 1354 int mtlim = 0; 1355 1356 if (flags & PGI_MT_NEXT) 1357 mtype--; 1358 if (flags & PGI_MT_RANGE4G) 1359 mtlim = mtype4g + 1; /* exclude 0-4g range */ 1360 else if (flags & PGI_MT_RANGE16M) 1361 mtlim = 1; /* exclude 0-16m range */ 1362 while (mtype >= mtlim) { 1363 if (mnoderanges[mtype].mnr_mnode == mnode) 1364 return (mtype); 1365 mtype--; 1366 } 1367 } else if (mnoderanges[mtype].mnr_mnode == mnode) { 1368 return (mtype); 1369 } 1370 return (-1); 1371 } 1372 1373 /* 1374 * Update the page list max counts with the pfn range specified by the 1375 * input parameters. Called from add_physmem() when physical memory with 1376 * page_t's are initially added to the page lists. 1377 */ 1378 void 1379 mtype_modify_max(pfn_t startpfn, long cnt) 1380 { 1381 int mtype = 0; 1382 pfn_t endpfn = startpfn + cnt, pfn; 1383 pgcnt_t inc; 1384 1385 ASSERT(cnt > 0); 1386 1387 if (!physmax4g) 1388 return; 1389 1390 for (pfn = startpfn; pfn < endpfn; ) { 1391 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1392 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1393 inc = endpfn - pfn; 1394 } else { 1395 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1396 } 1397 if (mtype <= mtype4g) 1398 maxmem4g += inc; 1399 pfn += inc; 1400 } 1401 mtype++; 1402 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1403 } 1404 } 1405 1406 int 1407 mtype_2_mrange(int mtype) 1408 { 1409 return (mnoderanges[mtype].mnr_memrange); 1410 } 1411 1412 void 1413 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 1414 { 1415 ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 1416 *pfnlo = mnoderanges[mtype].mnr_pfnlo; 1417 *pfnhi = mnoderanges[mtype].mnr_pfnhi; 1418 } 1419 1420 size_t 1421 plcnt_sz(size_t ctrs_sz) 1422 { 1423 #ifdef DEBUG 1424 int szc, colors; 1425 1426 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 1427 for (szc = 0; szc < mmu_page_sizes; szc++) { 1428 colors = 
page_get_pagecolors(szc); 1429 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 1430 } 1431 #endif 1432 return (ctrs_sz); 1433 } 1434 1435 caddr_t 1436 plcnt_init(caddr_t addr) 1437 { 1438 #ifdef DEBUG 1439 int mt, szc, colors; 1440 1441 for (mt = 0; mt < mnoderangecnt; mt++) { 1442 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 1443 addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 1444 for (szc = 0; szc < mmu_page_sizes; szc++) { 1445 colors = page_get_pagecolors(szc); 1446 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 1447 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 1448 (pgcnt_t *)addr; 1449 addr += (sizeof (pgcnt_t) * colors); 1450 } 1451 } 1452 #endif 1453 return (addr); 1454 } 1455 1456 void 1457 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 1458 { 1459 #ifdef DEBUG 1460 int bin = PP_2_BIN(pp); 1461 1462 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 1463 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 1464 cnt); 1465 #endif 1466 ASSERT(mtype == PP_2_MTYPE(pp)); 1467 if (physmax4g && mtype <= mtype4g) 1468 atomic_add_long(&freemem4g, cnt); 1469 if (flags & PG_CACHE_LIST) 1470 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 1471 else 1472 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt); 1473 atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt); 1474 } 1475 1476 /* 1477 * Returns the free page count for mnode 1478 */ 1479 int 1480 mnode_pgcnt(int mnode) 1481 { 1482 int mtype = mnoderangecnt - 1; 1483 int flags = PGI_MT_RANGE0; 1484 pgcnt_t pgcnt = 0; 1485 1486 mtype = mtype_func(mnode, mtype, flags); 1487 1488 while (mtype != -1) { 1489 pgcnt += MTYPE_FREEMEM(mtype); 1490 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1491 } 1492 return (pgcnt); 1493 } 1494 1495 /* 1496 * Initialize page coloring variables based on the l2 cache parameters. 1497 * Calculate and return memory needed for page coloring data structures. 
 */
/*
 * l2_sz is compared against MMU_PAGESIZE and divided by
 * l2_assoc * MMU_PAGESIZE, l2_linesz must be a power of two, and l2_assoc
 * is 0 for a fully associative cache.
 *
 * Besides computing the returned size this trims memranges/nranges, may set
 * physmax4g (and clear restricted_kmemalloc on __i386), and fills in
 * l2_colors, page_colors, page_colors_mask, cpu_page_colors,
 * page_coloring_shift, hw_page_array[], colorequivszc[] and mnoderangecnt.
 * The size terms below are the ones page_coloring_setup() later carves out
 * of the allocated memory.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
	size_t	colorsz = 0;
	int	i;
	int	colors;

#if defined(__xpv)
	/*
	 * Hypervisor domains currently don't have any concept of NUMA.
	 * Hence we'll act like there is only 1 memrange.
	 */
	i = memrange_num(1);
#else /* !__xpv */
	/*
	 * Reduce the memory ranges lists if we don't have large amounts
	 * of memory.  This avoids searching known empty free lists.
	 */
	i = memrange_num(physmax);
#if defined(__i386)
	if (i > 0)
		restricted_kmemalloc = 0;
#endif
	/* physmax greater than 4g */
	if (i == 0)
		physmax4g = 1;
#endif /* !__xpv */
	/* drop the first i memranges[] entries */
	memranges += i;
	nranges -= i;

	ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);

	ASSERT(ISP2(l2_linesz));
	ASSERT(l2_sz > MMU_PAGESIZE);

	/* l2_assoc is 0 for fully associative l2 cache */
	if (l2_assoc)
		l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
	else
		l2_colors = 1;

	ASSERT(ISP2(l2_colors));

	/* for scalability, configure at least PAGE_COLORS_MIN color bins */
	page_colors = MAX(l2_colors, PAGE_COLORS_MIN);

	/*
	 * cpu_page_colors is non-zero when a page color may be spread across
	 * multiple bins.
	 */
	if (l2_colors < page_colors)
		cpu_page_colors = l2_colors;

	ASSERT(ISP2(page_colors));

	page_colors_mask = page_colors - 1;

	ASSERT(ISP2(CPUSETSIZE()));
	page_coloring_shift = lowbit(CPUSETSIZE());

	/* initialize number of colors per page size */
	for (i = 0; i <= mmu.max_page_level; i++) {
		hw_page_array[i].hp_size = LEVEL_SIZE(i);
		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
		/*
		 * colors for this size: the color mask shifted down by the
		 * difference between this level's shift and level 0's
		 */
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
		colorequivszc[i] = 0;
	}

	/*
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if (cpu_page_colors != 0) {

		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		/*
		 * 'a' is not reset between levels: once lowered to fit a
		 * level's color count it stays lowered for later levels.
		 */
		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				colorequivszc[i] = 0;
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encodes color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* factor in colorequiv to check additional 'equivalent' bins. */
	if (colorequiv > 1) {

		int a = lowbit(colorequiv) - 1;
		/* the shift must fit in the 4 bits used above */
		if (a > 15)
			a = 15;

		for (i = 0; i <= mmu.max_page_level; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			/* keep whichever of the two encodings is larger */
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}

	/* size for mnoderanges */
	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
		mnoderangecnt += mnode_range_cnt(i);
	colorsz = mnoderangecnt * sizeof (mnoderange_t);

	/* size for fpc_mutex and cpc_mutex */
	colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);

	/* size of page_freelists */
	colorsz += mnoderangecnt * sizeof (page_t ***);
	colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);

	for (i = 0; i < mmu_page_sizes; i++) {
		colors = page_get_pagecolors(i);
		colorsz += mnoderangecnt * colors * sizeof (page_t *);
	}

	/* size of page_cachelists */
	colorsz += mnoderangecnt * sizeof (page_t **);
	colorsz += mnoderangecnt * page_colors * sizeof (page_t *);

	return (colorsz);
}

/*
 * Called once at startup to configure page_coloring data structures and
 * does the 1st page_free()/page_freelist_add().
 */
/*
 * pcmemaddr is the start of the memory to carve up.  The carve order is:
 * mnoderanges[], the fpc_mutex[] arrays, the cpc_mutex[] arrays, the
 * page_freelists top-level array, the page_cachelists top-level array, and
 * then for each mnode range its per-size free list arrays followed by its
 * cache list array.
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
	int	i;
	int	j;
	int	k;
	caddr_t	addr;
	int	colors;

	/*
	 * do page coloring setup
	 */
	addr = pcmemaddr;

	mnoderanges = (mnoderange_t *)addr;
	addr += (mnoderangecnt * sizeof (mnoderange_t));

	mnode_range_setup(mnoderanges);

	/*
	 * presumably 0xfffff is the last pfn below 4g with 4k base pages --
	 * TODO confirm
	 */
	if (physmax4g)
		mtype4g = pfn_2_mtype(0xfffff);

	for (k = 0; k < NPC_MUTEX; k++) {
		fpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	for (k = 0; k < NPC_MUTEX; k++) {
		cpc_mutex[k] = (kmutex_t *)addr;
		addr += (max_mem_nodes * sizeof (kmutex_t));
	}
	page_freelists = (page_t ****)addr;
	addr += (mnoderangecnt * sizeof (page_t ***));

	page_cachelists = (page_t ***)addr;
	addr += (mnoderangecnt * sizeof (page_t **));

	for (i = 0; i < mnoderangecnt; i++) {
		/* per page size array of color bins for this mnode range */
		page_freelists[i] = (page_t ***)addr;
		addr += (mmu_page_sizes * sizeof (page_t **));

		for (j = 0; j < mmu_page_sizes; j++) {
			colors = page_get_pagecolors(j);
			page_freelists[i][j] = (page_t **)addr;
			addr += (colors * sizeof (page_t *));
		}
		/* the cache list has page_colors bins */
		page_cachelists[i] = (page_t **)addr;
		addr += (page_colors * sizeof (page_t *));
	}
}

#if defined(__xpv)
/*
 * Give back 10% of the io_pool pages to the free list.
 * Don't shrink the pool below some absolute minimum.
 *
 * Pages are taken from the tail of io_pool_4g first and, if that does not
 * yield enough, from the tail of io_pool_16m.  The head page of a pool is
 * never released by this routine.  Runs under io_pool_lock.
 */
static void
page_io_pool_shrink()
{
	int	retcnt;
	page_t	*pp, *pp_first, *pp_last, **curpool;
	mfn_t	mfn;
	int	bothpools = 0;

	mutex_enter(&io_pool_lock);
	io_pool_shrink_attempts++;	/* should be a kstat? */
	retcnt = io_pool_cnt / 10;
	/* clamp so the pool does not drop below io_pool_cnt_min */
	if (io_pool_cnt - retcnt < io_pool_cnt_min)
		retcnt = io_pool_cnt - io_pool_cnt_min;
	if (retcnt <= 0)
		goto done;
	io_pool_shrinks++;	/* should be a kstat? */
	curpool = &io_pool_4g;
domore:
	/*
	 * Loop through taking pages from the end of the list
	 * (highest mfns) till amount to return reached.
	 */
	for (pp = *curpool; pp && retcnt > 0; ) {
		pp_first = pp_last = pp->p_prev;
		/* only the head is left; leave it in the pool */
		if (pp_first == *curpool)
			break;
		retcnt--;
		io_pool_cnt--;
		page_io_pool_sub(curpool, pp_first, pp_last);
		/*
		 * NOTE(review): this lowers start_mfn using the mfn of the
		 * pool head (pp), not of the page being released (pp_first)
		 * -- confirm that is intended.
		 */
		if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
			start_mfn = mfn;
		page_free(pp_first, 1);
		pp = *curpool;
	}
	if (retcnt != 0 && !bothpools) {
		/*
		 * If not enough found in less constrained pool try the
		 * more constrained one.
		 */
		curpool = &io_pool_16m;
		bothpools = 1;
		goto domore;
	}
done:
	mutex_exit(&io_pool_lock);
}

#endif	/* __xpv */

/*
 * Adjust page_create flags for x86 and return the result.  On __xpv the
 * flags are returned unchanged, but the io_pool is shrunk first when the
 * request cannot wait and freemem is below desfree.  Otherwise
 * PGI_PGCPSZC0 and PGI_PGCPHIPRI are added when physmax4g is set.
 */
uint_t
page_create_update_flags_x86(uint_t flags)
{
#if defined(__xpv)
	/*
	 * Check this is an urgent allocation and free pages are depleted.
	 */
	if (!(flags & PG_WAIT) && freemem < desfree)
		page_io_pool_shrink();
#else /* !__xpv */
	/*
	 * page_create_get_something may call this because 4g memory may be
	 * depleted. Set flags to allow for relocation of base page below
	 * 4g if necessary.
	 */
	if (physmax4g)
		flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
#endif /* __xpv */
	return (flags);
}

/*
 * Always returns 0; bp is not examined.
 */
/*ARGSUSED*/
int
bp_color(struct buf *bp)
{
	return (0);
}

#if defined(__xpv)

/*
 * Take pages out of an io_pool
 *
 * Unlinks the run pp_first..pp_last from the circular list headed by *poolp
 * and closes the run into a circular list of its own.  If the run started
 * at the head, the head moves to the page after pp_last, or becomes NULL
 * when the run was the whole list.
 */
static void
page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
{
	if (*poolp == pp_first) {
		*poolp = pp_last->p_next;
		/* wrapped back to the run itself: the list is now empty */
		if (*poolp == pp_first)
			*poolp = NULL;
	}
	/* splice the neighbours of the run together */
	pp_first->p_prev->p_next = pp_last->p_next;
	pp_last->p_next->p_prev = pp_first->p_prev;
	/* make the removed run self-contained */
	pp_first->p_prev = pp_last;
	pp_last->p_next = pp_first;
}

/*
 * Put a page on the io_pool list. The list is ordered by increasing MFN.
 *
 * The list is circular and doubly linked, so (*poolp)->p_prev is the page
 * with the highest MFN.
 */
static void
page_io_pool_add(page_t **poolp, page_t *pp)
{
	page_t	*look;
	mfn_t	mfn = mfn_list[pp->p_pagenum];

	/* empty pool: pp becomes a one element circular list */
	if (*poolp == NULL) {
		*poolp = pp;
		pp->p_next = pp;
		pp->p_prev = pp;
		return;
	}

	/*
	 * Since we try to take pages from the high end of the pool
	 * chances are good that the pages to be put on the list will
	 * go at or near the end of the list. so start at the end and
	 * work backwards.
	 */
	look = (*poolp)->p_prev;
	while (mfn < mfn_list[look->p_pagenum]) {
		look = look->p_prev;
		if (look == (*poolp)->p_prev)
			break; /* backed all the way to front of list */
	}

	/* insert after look */
	pp->p_prev = look;
	pp->p_next = look->p_next;
	pp->p_next->p_prev = pp;
	look->p_next = pp;
	if (mfn < mfn_list[(*poolp)->p_pagenum]) {
		/*
		 * we inserted a new first list element
		 * adjust pool pointer to newly inserted element
		 */
		*poolp = pp;
	}
}

/*
 * Add a page to the io_pool.  Setting the force flag will force the page
 * into the io_pool no matter what.
1838 */ 1839 static void 1840 add_page_to_pool(page_t *pp, int force) 1841 { 1842 page_t *highest; 1843 page_t *freep = NULL; 1844 1845 mutex_enter(&io_pool_lock); 1846 /* 1847 * Always keep the scarce low memory pages 1848 */ 1849 if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 1850 ++io_pool_cnt; 1851 page_io_pool_add(&io_pool_16m, pp); 1852 goto done; 1853 } 1854 if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) { 1855 ++io_pool_cnt; 1856 page_io_pool_add(&io_pool_4g, pp); 1857 } else { 1858 highest = io_pool_4g->p_prev; 1859 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 1860 page_io_pool_sub(&io_pool_4g, highest, highest); 1861 page_io_pool_add(&io_pool_4g, pp); 1862 freep = highest; 1863 } else { 1864 freep = pp; 1865 } 1866 } 1867 done: 1868 mutex_exit(&io_pool_lock); 1869 if (freep) 1870 page_free(freep, 1); 1871 } 1872 1873 1874 int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 1875 int contig_pfn_max; /* capacity of the contig pfn list */ 1876 int next_alloc_pfn; /* next position in list to start a contig search */ 1877 int contig_pfnlist_updates; /* pfn list update count */ 1878 int contig_pfnlist_builds; /* how many times have we (re)built list */ 1879 int contig_pfnlist_buildfailed; /* how many times has list build failed */ 1880 int create_contig_pending; /* nonzero means taskq creating contig list */ 1881 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 1882 1883 /* 1884 * Function to use in sorting a list of pfns by their underlying mfns. 1885 */ 1886 static int 1887 mfn_compare(const void *pfnp1, const void *pfnp2) 1888 { 1889 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 1890 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 1891 1892 if (mfn1 > mfn2) 1893 return (1); 1894 if (mfn1 < mfn2) 1895 return (-1); 1896 return (0); 1897 } 1898 1899 /* 1900 * Compact the contig_pfn_list by tossing all the non-contiguous 1901 * elements from the list. 
1902 */ 1903 static void 1904 compact_contig_pfn_list(void) 1905 { 1906 pfn_t pfn, lapfn, prev_lapfn; 1907 mfn_t mfn; 1908 int i, newcnt = 0; 1909 1910 prev_lapfn = 0; 1911 for (i = 0; i < contig_pfn_cnt - 1; i++) { 1912 pfn = contig_pfn_list[i]; 1913 lapfn = contig_pfn_list[i + 1]; 1914 mfn = mfn_list[pfn]; 1915 /* 1916 * See if next pfn is for a contig mfn 1917 */ 1918 if (mfn_list[lapfn] != mfn + 1) 1919 continue; 1920 /* 1921 * pfn and lookahead are both put in list 1922 * unless pfn is the previous lookahead. 1923 */ 1924 if (pfn != prev_lapfn) 1925 contig_pfn_list[newcnt++] = pfn; 1926 contig_pfn_list[newcnt++] = lapfn; 1927 prev_lapfn = lapfn; 1928 } 1929 for (i = newcnt; i < contig_pfn_cnt; i++) 1930 contig_pfn_list[i] = 0; 1931 contig_pfn_cnt = newcnt; 1932 } 1933 1934 /*ARGSUSED*/ 1935 static void 1936 call_create_contiglist(void *arg) 1937 { 1938 (void) create_contig_pfnlist(PG_WAIT); 1939 } 1940 1941 /* 1942 * Create list of freelist pfns that have underlying 1943 * contiguous mfns. The list is kept in ascending mfn order. 1944 * returns 1 if list created else 0. 1945 */ 1946 static int 1947 create_contig_pfnlist(uint_t flags) 1948 { 1949 pfn_t pfn; 1950 page_t *pp; 1951 int ret = 1; 1952 1953 mutex_enter(&contig_list_lock); 1954 if (contig_pfn_list != NULL) 1955 goto out; 1956 contig_pfn_max = freemem + (freemem / 10); 1957 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 1958 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 1959 if (contig_pfn_list == NULL) { 1960 /* 1961 * If we could not create the contig list (because 1962 * we could not sleep for memory). Dispatch a taskq that can 1963 * sleep to get the memory. 
1964 */ 1965 if (!create_contig_pending) { 1966 if (taskq_dispatch(system_taskq, call_create_contiglist, 1967 NULL, TQ_NOSLEEP) != NULL) 1968 create_contig_pending = 1; 1969 } 1970 contig_pfnlist_buildfailed++; /* count list build failures */ 1971 ret = 0; 1972 goto out; 1973 } 1974 create_contig_pending = 0; 1975 ASSERT(contig_pfn_cnt == 0); 1976 for (pfn = 0; pfn < mfn_count; pfn++) { 1977 pp = page_numtopp_nolock(pfn); 1978 if (pp == NULL || !PP_ISFREE(pp)) 1979 continue; 1980 contig_pfn_list[contig_pfn_cnt] = pfn; 1981 if (++contig_pfn_cnt == contig_pfn_max) 1982 break; 1983 } 1984 /* 1985 * Sanity check the new list. 1986 */ 1987 if (contig_pfn_cnt < 2) { /* no contig pfns */ 1988 contig_pfn_cnt = 0; 1989 contig_pfnlist_buildfailed++; 1990 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t)); 1991 contig_pfn_list = NULL; 1992 contig_pfn_max = 0; 1993 ret = 0; 1994 goto out; 1995 } 1996 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 1997 compact_contig_pfn_list(); 1998 /* 1999 * Make sure next search of the newly created contiguous pfn 2000 * list starts at the beginning of the list. 2001 */ 2002 next_alloc_pfn = 0; 2003 contig_pfnlist_builds++; /* count list builds */ 2004 out: 2005 mutex_exit(&contig_list_lock); 2006 return (ret); 2007 } 2008 2009 2010 /* 2011 * Toss the current contig pfnlist. Someone is about to do a massive 2012 * update to pfn<->mfn mappings. So we have them destroy the list and lock 2013 * it till they are done with their update. 2014 */ 2015 void 2016 clear_and_lock_contig_pfnlist() 2017 { 2018 pfn_t *listp = NULL; 2019 size_t listsize; 2020 2021 mutex_enter(&contig_list_lock); 2022 if (contig_pfn_list != NULL) { 2023 listp = contig_pfn_list; 2024 listsize = contig_pfn_max * sizeof (pfn_t); 2025 contig_pfn_list = NULL; 2026 contig_pfn_max = contig_pfn_cnt = 0; 2027 } 2028 if (listp != NULL) 2029 kmem_free(listp, listsize); 2030 } 2031 2032 /* 2033 * Unlock the contig_pfn_list. 
The next attempted use of it will cause 2034 * it to be re-created. 2035 */ 2036 void 2037 unlock_contig_pfnlist() 2038 { 2039 mutex_exit(&contig_list_lock); 2040 } 2041 2042 /* 2043 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 2044 */ 2045 void 2046 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 2047 { 2048 int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 2049 pfn_t probe_pfn; 2050 mfn_t probe_mfn; 2051 int drop_lock = 0; 2052 2053 if (mutex_owner(&contig_list_lock) != curthread) { 2054 drop_lock = 1; 2055 mutex_enter(&contig_list_lock); 2056 } 2057 if (contig_pfn_list == NULL) 2058 goto done; 2059 contig_pfnlist_updates++; 2060 /* 2061 * Find the pfn in the current list. Use a binary chop to locate it. 2062 */ 2063 probe_hi = contig_pfn_cnt - 1; 2064 probe_lo = 0; 2065 probe_pos = (probe_hi + probe_lo) / 2; 2066 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 2067 if (probe_pos == probe_lo) { /* pfn not in list */ 2068 probe_pos = -1; 2069 break; 2070 } 2071 if (pfn_to_mfn(probe_pfn) <= oldmfn) 2072 probe_lo = probe_pos; 2073 else 2074 probe_hi = probe_pos; 2075 probe_pos = (probe_hi + probe_lo) / 2; 2076 } 2077 if (probe_pos >= 0) { 2078 /* 2079 * Remove pfn from list and ensure next alloc 2080 * position stays in bounds. 
2081 */ 2082 if (--contig_pfn_cnt <= next_alloc_pfn) 2083 next_alloc_pfn = 0; 2084 if (contig_pfn_cnt < 2) { /* no contig pfns */ 2085 contig_pfn_cnt = 0; 2086 kmem_free(contig_pfn_list, 2087 contig_pfn_max * sizeof (pfn_t)); 2088 contig_pfn_list = NULL; 2089 contig_pfn_max = 0; 2090 goto done; 2091 } 2092 ovbcopy(&contig_pfn_list[probe_pos + 1], 2093 &contig_pfn_list[probe_pos], 2094 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 2095 } 2096 if (newmfn == MFN_INVALID) 2097 goto done; 2098 /* 2099 * Check if new mfn has adjacent mfns in the list 2100 */ 2101 probe_hi = contig_pfn_cnt - 1; 2102 probe_lo = 0; 2103 insert_after = -2; 2104 do { 2105 probe_pos = (probe_hi + probe_lo) / 2; 2106 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 2107 if (newmfn == probe_mfn + 1) 2108 insert_after = probe_pos; 2109 else if (newmfn == probe_mfn - 1) 2110 insert_after = probe_pos - 1; 2111 if (probe_pos == probe_lo) 2112 break; 2113 if (probe_mfn <= newmfn) 2114 probe_lo = probe_pos; 2115 else 2116 probe_hi = probe_pos; 2117 } while (insert_after == -2); 2118 /* 2119 * If there is space in the list and there are adjacent mfns 2120 * insert the pfn in to its proper place in the list. 2121 */ 2122 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 2123 insert_point = insert_after + 1; 2124 ovbcopy(&contig_pfn_list[insert_point], 2125 &contig_pfn_list[insert_point + 1], 2126 (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 2127 contig_pfn_list[insert_point] = pfn; 2128 contig_pfn_cnt++; 2129 } 2130 done: 2131 if (drop_lock) 2132 mutex_exit(&contig_list_lock); 2133 } 2134 2135 /* 2136 * Called to (re-)populate the io_pool from the free page lists. 2137 */ 2138 long 2139 populate_io_pool(void) 2140 { 2141 pfn_t pfn; 2142 mfn_t mfn, max_mfn; 2143 page_t *pp; 2144 2145 /* 2146 * Figure out the bounds of the pool on first invocation. 2147 * We use a percentage of memory for the io pool size. 
2148 * we allow that to shrink, but not to less than a fixed minimum 2149 */ 2150 if (io_pool_cnt_max == 0) { 2151 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 2152 io_pool_cnt_lowater = io_pool_cnt_max; 2153 /* 2154 * This is the first time in populate_io_pool, grab a va to use 2155 * when we need to allocate pages. 2156 */ 2157 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 2158 } 2159 /* 2160 * If we are out of pages in the pool, then grow the size of the pool 2161 */ 2162 if (io_pool_cnt == 0) { 2163 /* 2164 * Grow the max size of the io pool by 5%, but never more than 2165 * 25% of physical memory. 2166 */ 2167 if (io_pool_cnt_max < physmem / 4) 2168 io_pool_cnt_max += io_pool_cnt_max / 20; 2169 } 2170 io_pool_grows++; /* should be a kstat? */ 2171 2172 /* 2173 * Get highest mfn on this platform, but limit to the 32 bit DMA max. 2174 */ 2175 (void) mfn_to_pfn(start_mfn); 2176 max_mfn = MIN(cached_max_mfn, PFN_4GIG); 2177 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 2178 pfn = mfn_to_pfn(mfn); 2179 if (pfn & PFN_IS_FOREIGN_MFN) 2180 continue; 2181 /* 2182 * try to allocate it from free pages 2183 */ 2184 pp = page_numtopp_alloc(pfn); 2185 if (pp == NULL) 2186 continue; 2187 PP_CLRFREE(pp); 2188 add_page_to_pool(pp, 1); 2189 if (io_pool_cnt >= io_pool_cnt_max) 2190 break; 2191 } 2192 2193 return (io_pool_cnt); 2194 } 2195 2196 /* 2197 * Destroy a page that was being used for DMA I/O. It may or 2198 * may not actually go back to the io_pool. 2199 */ 2200 void 2201 page_destroy_io(page_t *pp) 2202 { 2203 mfn_t mfn = mfn_list[pp->p_pagenum]; 2204 2205 /* 2206 * When the page was alloc'd a reservation was made, release it now 2207 */ 2208 page_unresv(1); 2209 /* 2210 * Unload translations, if any, then hash out the 2211 * page to erase its identity. 2212 */ 2213 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2214 page_hashout(pp, NULL); 2215 2216 /* 2217 * If the page came from the free lists, just put it back to them. 
2218 * DomU pages always go on the free lists as well. 2219 */ 2220 if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 2221 page_free(pp, 1); 2222 return; 2223 } 2224 2225 add_page_to_pool(pp, 0); 2226 } 2227 2228 2229 long contig_searches; /* count of times contig pages requested */ 2230 long contig_search_restarts; /* count of contig ranges tried */ 2231 long contig_search_failed; /* count of contig alloc failures */ 2232 2233 /* 2234 * Free partial page list 2235 */ 2236 static void 2237 free_partial_list(page_t **pplist) 2238 { 2239 page_t *pp; 2240 2241 while (*pplist != NULL) { 2242 pp = *pplist; 2243 page_io_pool_sub(pplist, pp, pp); 2244 page_free(pp, 1); 2245 } 2246 } 2247 2248 /* 2249 * Look thru the contiguous pfns that are not part of the io_pool for 2250 * contiguous free pages. Return a list of the found pages or NULL. 2251 */ 2252 page_t * 2253 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg, 2254 pgcnt_t pfnalign) 2255 { 2256 page_t *pp, *plist = NULL; 2257 mfn_t mfn, prev_mfn, start_mfn; 2258 pfn_t pfn; 2259 int pages_needed, pages_requested; 2260 int search_start; 2261 2262 /* 2263 * create the contig pfn list if not already done 2264 */ 2265 retry: 2266 mutex_enter(&contig_list_lock); 2267 if (contig_pfn_list == NULL) { 2268 mutex_exit(&contig_list_lock); 2269 if (!create_contig_pfnlist(flags)) { 2270 return (NULL); 2271 } 2272 goto retry; 2273 } 2274 contig_searches++; 2275 /* 2276 * Search contiguous pfn list for physically contiguous pages not in 2277 * the io_pool. Start the search where the last search left off. 2278 */ 2279 pages_requested = pages_needed = npages; 2280 search_start = next_alloc_pfn; 2281 start_mfn = prev_mfn = 0; 2282 while (pages_needed) { 2283 pfn = contig_pfn_list[next_alloc_pfn]; 2284 mfn = pfn_to_mfn(pfn); 2285 /* 2286 * Check if mfn is first one or contig to previous one and 2287 * if page corresponding to mfn is free and that mfn 2288 * range is not crossing a segment boundary. 
2289 */ 2290 if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 2291 (pp = page_numtopp_alloc(pfn)) != NULL && 2292 !((mfn & pfnseg) < (start_mfn & pfnseg))) { 2293 PP_CLRFREE(pp); 2294 page_io_pool_add(&plist, pp); 2295 pages_needed--; 2296 if (prev_mfn == 0) { 2297 if (pfnalign && 2298 mfn != P2ROUNDUP(mfn, pfnalign)) { 2299 /* 2300 * not properly aligned 2301 */ 2302 contig_search_restarts++; 2303 free_partial_list(&plist); 2304 pages_needed = pages_requested; 2305 start_mfn = prev_mfn = 0; 2306 goto skip; 2307 } 2308 start_mfn = mfn; 2309 } 2310 prev_mfn = mfn; 2311 } else { 2312 contig_search_restarts++; 2313 free_partial_list(&plist); 2314 pages_needed = pages_requested; 2315 start_mfn = prev_mfn = 0; 2316 } 2317 skip: 2318 if (++next_alloc_pfn == contig_pfn_cnt) 2319 next_alloc_pfn = 0; 2320 if (next_alloc_pfn == search_start) 2321 break; /* all pfns searched */ 2322 } 2323 mutex_exit(&contig_list_lock); 2324 if (pages_needed) { 2325 contig_search_failed++; 2326 /* 2327 * Failed to find enough contig pages. 2328 * free partial page list 2329 */ 2330 free_partial_list(&plist); 2331 } 2332 return (plist); 2333 } 2334 2335 /* 2336 * Search the reserved io pool pages for a page range with the 2337 * desired characteristics. 
2338 */ 2339 page_t * 2340 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg) 2341 { 2342 page_t *pp_first, *pp_last; 2343 page_t *pp, **poolp; 2344 pgcnt_t nwanted, pfnalign; 2345 uint64_t pfnseg; 2346 mfn_t mfn, tmfn, hi_mfn, lo_mfn; 2347 int align, attempt = 0; 2348 2349 if (minctg == 1) 2350 contig = 0; 2351 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2352 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2353 pfnseg = mmu_btop(mattr->dma_attr_seg); 2354 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2355 if (align > MMU_PAGESIZE) 2356 pfnalign = mmu_btop(align); 2357 else 2358 pfnalign = 0; 2359 2360 try_again: 2361 /* 2362 * See if we want pages for a legacy device 2363 */ 2364 if (hi_mfn < PFN_16MEG) 2365 poolp = &io_pool_16m; 2366 else 2367 poolp = &io_pool_4g; 2368 try_smaller: 2369 /* 2370 * Take pages from I/O pool. We'll use pages from the highest 2371 * MFN range possible. 2372 */ 2373 pp_first = pp_last = NULL; 2374 mutex_enter(&io_pool_lock); 2375 nwanted = minctg; 2376 for (pp = *poolp; pp && nwanted > 0; ) { 2377 pp = pp->p_prev; 2378 2379 /* 2380 * skip pages above allowable range 2381 */ 2382 mfn = mfn_list[pp->p_pagenum]; 2383 if (hi_mfn < mfn) 2384 goto skip; 2385 2386 /* 2387 * stop at pages below allowable range 2388 */ 2389 if (lo_mfn > mfn) 2390 break; 2391 restart: 2392 if (pp_last == NULL) { 2393 /* 2394 * Check alignment 2395 */ 2396 tmfn = mfn - (minctg - 1); 2397 if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign)) 2398 goto skip; /* not properly aligned */ 2399 /* 2400 * Check segment 2401 */ 2402 if ((mfn & pfnseg) < (tmfn & pfnseg)) 2403 goto skip; /* crosses seg boundary */ 2404 /* 2405 * Start building page list 2406 */ 2407 pp_first = pp_last = pp; 2408 nwanted--; 2409 } else { 2410 /* 2411 * check physical contiguity if required 2412 */ 2413 if (contig && 2414 mfn_list[pp_first->p_pagenum] != mfn + 1) { 2415 /* 2416 * not a contiguous page, restart list. 
2417 */ 2418 pp_last = NULL; 2419 nwanted = minctg; 2420 goto restart; 2421 } else { /* add page to list */ 2422 pp_first = pp; 2423 nwanted--; 2424 } 2425 } 2426 skip: 2427 if (pp == *poolp) 2428 break; 2429 } 2430 2431 /* 2432 * If we didn't find memory. Try the more constrained pool, then 2433 * sweep free pages into the DMA pool and try again. 2434 */ 2435 if (nwanted != 0) { 2436 mutex_exit(&io_pool_lock); 2437 /* 2438 * If we were looking in the less constrained pool and 2439 * didn't find pages, try the more constrained pool. 2440 */ 2441 if (poolp == &io_pool_4g) { 2442 poolp = &io_pool_16m; 2443 goto try_smaller; 2444 } 2445 kmem_reap(); 2446 if (++attempt < 4) { 2447 /* 2448 * Grab some more io_pool pages 2449 */ 2450 (void) populate_io_pool(); 2451 goto try_again; /* go around and retry */ 2452 } 2453 return (NULL); 2454 } 2455 /* 2456 * Found the pages, now snip them from the list 2457 */ 2458 page_io_pool_sub(poolp, pp_first, pp_last); 2459 io_pool_cnt -= minctg; 2460 /* 2461 * reset low water mark 2462 */ 2463 if (io_pool_cnt < io_pool_cnt_lowater) 2464 io_pool_cnt_lowater = io_pool_cnt; 2465 mutex_exit(&io_pool_lock); 2466 return (pp_first); 2467 } 2468 2469 page_t * 2470 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr, 2471 ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg) 2472 { 2473 uint_t kflags; 2474 int order, extra, extpages, i, contig, nbits, extents; 2475 page_t *pp, *expp, *pp_first, **pplist = NULL; 2476 mfn_t *mfnlist = NULL; 2477 2478 contig = flags & PG_PHYSCONTIG; 2479 if (minctg == 1) 2480 contig = 0; 2481 flags &= ~PG_PHYSCONTIG; 2482 kflags = flags & PG_WAIT ? 
KM_SLEEP : KM_NOSLEEP; 2483 /* 2484 * Hypervisor will allocate extents, if we want contig 2485 * pages extent must be >= minctg 2486 */ 2487 if (contig) { 2488 order = highbit(minctg) - 1; 2489 if (minctg & ((1 << order) - 1)) 2490 order++; 2491 extpages = 1 << order; 2492 } else { 2493 order = 0; 2494 extpages = minctg; 2495 } 2496 if (extpages > minctg) { 2497 extra = extpages - minctg; 2498 if (!page_resv(extra, kflags)) 2499 return (NULL); 2500 } 2501 pp_first = NULL; 2502 pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 2503 if (pplist == NULL) 2504 goto balloon_fail; 2505 mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 2506 if (mfnlist == NULL) 2507 goto balloon_fail; 2508 pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr); 2509 if (pp == NULL) 2510 goto balloon_fail; 2511 pp_first = pp; 2512 if (extpages > minctg) { 2513 /* 2514 * fill out the rest of extent pages to swap 2515 * with the hypervisor 2516 */ 2517 for (i = 0; i < extra; i++) { 2518 expp = page_create_va(vp, 2519 (u_offset_t)(uintptr_t)io_pool_kva, 2520 PAGESIZE, flags, &kvseg, io_pool_kva); 2521 if (expp == NULL) 2522 goto balloon_fail; 2523 (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 2524 page_io_unlock(expp); 2525 page_hashout(expp, NULL); 2526 page_io_lock(expp); 2527 /* 2528 * add page to end of list 2529 */ 2530 expp->p_prev = pp_first->p_prev; 2531 expp->p_next = pp_first; 2532 expp->p_prev->p_next = expp; 2533 pp_first->p_prev = expp; 2534 } 2535 2536 } 2537 for (i = 0; i < extpages; i++) { 2538 pplist[i] = pp; 2539 pp = pp->p_next; 2540 } 2541 nbits = highbit(mattr->dma_attr_addr_hi); 2542 extents = contig ? 
1 : minctg; 2543 if (balloon_replace_pages(extents, pplist, nbits, order, 2544 mfnlist) != extents) { 2545 if (ioalloc_dbg) 2546 cmn_err(CE_NOTE, "request to hypervisor" 2547 " for %d pages, maxaddr %" PRIx64 " failed", 2548 extpages, mattr->dma_attr_addr_hi); 2549 goto balloon_fail; 2550 } 2551 2552 kmem_free(pplist, extpages * sizeof (page_t *)); 2553 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2554 /* 2555 * Return any excess pages to free list 2556 */ 2557 if (extpages > minctg) { 2558 for (i = 0; i < extra; i++) { 2559 pp = pp_first->p_prev; 2560 page_sub(&pp_first, pp); 2561 page_io_unlock(pp); 2562 page_unresv(1); 2563 page_free(pp, 1); 2564 } 2565 } 2566 return (pp_first); 2567 balloon_fail: 2568 /* 2569 * Return pages to free list and return failure 2570 */ 2571 while (pp_first != NULL) { 2572 pp = pp_first; 2573 page_sub(&pp_first, pp); 2574 page_io_unlock(pp); 2575 if (pp->p_vnode != NULL) 2576 page_hashout(pp, NULL); 2577 page_free(pp, 1); 2578 } 2579 if (pplist) 2580 kmem_free(pplist, extpages * sizeof (page_t *)); 2581 if (mfnlist) 2582 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2583 page_unresv(extpages - minctg); 2584 return (NULL); 2585 } 2586 2587 static void 2588 return_partial_alloc(page_t *plist) 2589 { 2590 page_t *pp; 2591 2592 while (plist != NULL) { 2593 pp = plist; 2594 page_sub(&plist, pp); 2595 page_io_unlock(pp); 2596 page_destroy_io(pp); 2597 } 2598 } 2599 2600 static page_t * 2601 page_get_contigpages( 2602 struct vnode *vp, 2603 u_offset_t off, 2604 int *npagesp, 2605 uint_t flags, 2606 caddr_t vaddr, 2607 ddi_dma_attr_t *mattr) 2608 { 2609 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2610 page_t *plist; /* list to return */ 2611 page_t *pp, *mcpl; 2612 int contig, anyaddr, npages, getone = 0; 2613 mfn_t lo_mfn; 2614 mfn_t hi_mfn; 2615 pgcnt_t pfnalign = 0; 2616 int align, sgllen; 2617 uint64_t pfnseg; 2618 pgcnt_t minctg; 2619 2620 npages = *npagesp; 2621 ASSERT(mattr != NULL); 2622 lo_mfn = 
mmu_btop(mattr->dma_attr_addr_lo); 2623 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2624 sgllen = mattr->dma_attr_sgllen; 2625 pfnseg = mmu_btop(mattr->dma_attr_seg); 2626 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2627 if (align > MMU_PAGESIZE) 2628 pfnalign = mmu_btop(align); 2629 2630 contig = flags & PG_PHYSCONTIG; 2631 if (npages == -1) { 2632 npages = 1; 2633 pfnalign = 0; 2634 } 2635 /* 2636 * Clear the contig flag if only one page is needed. 2637 */ 2638 if (npages == 1) { 2639 getone = 1; 2640 contig = 0; 2641 } 2642 2643 /* 2644 * Check if any page in the system is fine. 2645 */ 2646 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn; 2647 if (!contig && anyaddr && !pfnalign) { 2648 flags &= ~PG_PHYSCONTIG; 2649 plist = page_create_va(vp, off, npages * MMU_PAGESIZE, 2650 flags, &kvseg, vaddr); 2651 if (plist != NULL) { 2652 *npagesp = 0; 2653 return (plist); 2654 } 2655 } 2656 plist = NULL; 2657 minctg = howmany(npages, sgllen); 2658 while (npages > sgllen || getone) { 2659 if (minctg > npages) 2660 minctg = npages; 2661 mcpl = NULL; 2662 /* 2663 * We could want contig pages with no address range limits. 2664 */ 2665 if (anyaddr && contig) { 2666 /* 2667 * Look for free contig pages to satisfy the request. 
2668 */ 2669 mcpl = find_contig_free(minctg, flags, pfnseg, 2670 pfnalign); 2671 } 2672 /* 2673 * Try the reserved io pools next 2674 */ 2675 if (mcpl == NULL) 2676 mcpl = page_io_pool_alloc(mattr, contig, minctg); 2677 if (mcpl != NULL) { 2678 pp = mcpl; 2679 do { 2680 if (!page_hashin(pp, vp, off, NULL)) { 2681 panic("page_get_contigpages:" 2682 " hashin failed" 2683 " pp %p, vp %p, off %llx", 2684 (void *)pp, (void *)vp, off); 2685 } 2686 off += MMU_PAGESIZE; 2687 PP_CLRFREE(pp); 2688 PP_CLRAGED(pp); 2689 page_set_props(pp, P_REF); 2690 page_io_lock(pp); 2691 pp = pp->p_next; 2692 } while (pp != mcpl); 2693 } else { 2694 /* 2695 * Hypervisor exchange doesn't handle segment or 2696 * alignment constraints 2697 */ 2698 if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || 2699 pfnalign) 2700 goto fail; 2701 /* 2702 * Try exchanging pages with the hypervisor 2703 */ 2704 mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr, 2705 flags, minctg); 2706 if (mcpl == NULL) 2707 goto fail; 2708 off += minctg * MMU_PAGESIZE; 2709 } 2710 check_dma(mattr, mcpl, minctg); 2711 /* 2712 * Here with a minctg run of contiguous pages, add them to the 2713 * list we will return for this request. 2714 */ 2715 page_list_concat(&plist, &mcpl); 2716 npages -= minctg; 2717 *npagesp = npages; 2718 sgllen--; 2719 if (getone) 2720 break; 2721 } 2722 return (plist); 2723 fail: 2724 return_partial_alloc(plist); 2725 return (NULL); 2726 } 2727 2728 /* 2729 * Allocator for domain 0 I/O pages. We match the required 2730 * DMA attributes and contiguity constraints. 
2731 */ 2732 /*ARGSUSED*/ 2733 page_t * 2734 page_create_io( 2735 struct vnode *vp, 2736 u_offset_t off, 2737 uint_t bytes, 2738 uint_t flags, 2739 struct as *as, 2740 caddr_t vaddr, 2741 ddi_dma_attr_t *mattr) 2742 { 2743 page_t *plist = NULL, *pp; 2744 int npages = 0, contig, anyaddr, pages_req; 2745 mfn_t lo_mfn; 2746 mfn_t hi_mfn; 2747 pgcnt_t pfnalign = 0; 2748 int align; 2749 int is_domu = 0; 2750 int dummy, bytes_got; 2751 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2752 2753 ASSERT(mattr != NULL); 2754 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2755 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2756 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2757 if (align > MMU_PAGESIZE) 2758 pfnalign = mmu_btop(align); 2759 2760 /* 2761 * Clear the contig flag if only one page is needed or the scatter 2762 * gather list length is >= npages. 2763 */ 2764 pages_req = npages = mmu_btopr(bytes); 2765 contig = (flags & PG_PHYSCONTIG); 2766 bytes = P2ROUNDUP(bytes, MMU_PAGESIZE); 2767 if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages) 2768 contig = 0; 2769 2770 /* 2771 * Check if any old page in the system is fine. 2772 * DomU should always go down this path. 
2773 */ 2774 is_domu = !DOMAIN_IS_INITDOMAIN(xen_info); 2775 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 2776 if ((!contig && anyaddr) || is_domu) { 2777 flags &= ~PG_PHYSCONTIG; 2778 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr); 2779 if (plist != NULL) 2780 return (plist); 2781 else if (is_domu) 2782 return (NULL); /* no memory available */ 2783 } 2784 /* 2785 * DomU should never reach here 2786 */ 2787 if (contig) { 2788 plist = page_get_contigpages(vp, off, &npages, flags, vaddr, 2789 mattr); 2790 if (plist == NULL) 2791 goto fail; 2792 bytes_got = (pages_req - npages) << MMU_PAGESHIFT; 2793 vaddr += bytes_got; 2794 off += bytes_got; 2795 /* 2796 * We now have all the contiguous pages we need, but 2797 * we may still need additional non-contiguous pages. 2798 */ 2799 } 2800 /* 2801 * now loop collecting the requested number of pages, these do 2802 * not have to be contiguous pages but we will use the contig 2803 * page alloc code to get the pages since it will honor any 2804 * other constraints the pages may have. 2805 */ 2806 while (npages--) { 2807 dummy = -1; 2808 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr); 2809 if (pp == NULL) 2810 goto fail; 2811 page_add(&plist, pp); 2812 vaddr += MMU_PAGESIZE; 2813 off += MMU_PAGESIZE; 2814 } 2815 return (plist); 2816 fail: 2817 /* 2818 * Failed to get enough pages, return ones we did get 2819 */ 2820 return_partial_alloc(plist); 2821 return (NULL); 2822 } 2823 2824 /* 2825 * Lock and return the page with the highest mfn that we can find. last_mfn 2826 * holds the last one found, so the next search can start from there. We 2827 * also keep a counter so that we don't loop forever if the machine has no 2828 * free pages. 2829 * 2830 * This is called from the balloon thread to find pages to give away. new_high 2831 * is used when new mfn's have been added to the system - we will reset our 2832 * search if the new mfn's are higher than our current search position. 
2833 */ 2834 page_t * 2835 page_get_high_mfn(mfn_t new_high) 2836 { 2837 static mfn_t last_mfn = 0; 2838 pfn_t pfn; 2839 page_t *pp; 2840 ulong_t loop_count = 0; 2841 2842 if (new_high > last_mfn) 2843 last_mfn = new_high; 2844 2845 for (; loop_count < mfn_count; loop_count++, last_mfn--) { 2846 if (last_mfn == 0) { 2847 last_mfn = cached_max_mfn; 2848 } 2849 2850 pfn = mfn_to_pfn(last_mfn); 2851 if (pfn & PFN_IS_FOREIGN_MFN) 2852 continue; 2853 2854 /* See if the page is free. If so, lock it. */ 2855 pp = page_numtopp_alloc(pfn); 2856 if (pp == NULL) 2857 continue; 2858 PP_CLRFREE(pp); 2859 2860 ASSERT(PAGE_EXCL(pp)); 2861 ASSERT(pp->p_vnode == NULL); 2862 ASSERT(!hat_page_is_mapped(pp)); 2863 last_mfn--; 2864 return (pp); 2865 } 2866 return (NULL); 2867 } 2868 2869 #else /* !__xpv */ 2870 2871 /* 2872 * get a page from any list with the given mnode 2873 */ 2874 static page_t * 2875 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 2876 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 2877 { 2878 kmutex_t *pcm; 2879 int i; 2880 page_t *pp; 2881 page_t *first_pp; 2882 uint64_t pgaddr; 2883 ulong_t bin; 2884 int mtypestart; 2885 int plw_initialized; 2886 page_list_walker_t plw; 2887 2888 VM_STAT_ADD(pga_vmstats.pgma_alloc); 2889 2890 ASSERT((flags & PG_MATCH_COLOR) == 0); 2891 ASSERT(szc == 0); 2892 ASSERT(dma_attr != NULL); 2893 2894 MTYPE_START(mnode, mtype, flags); 2895 if (mtype < 0) { 2896 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 2897 return (NULL); 2898 } 2899 2900 mtypestart = mtype; 2901 2902 bin = origbin; 2903 2904 /* 2905 * check up to page_colors + 1 bins - origbin may be checked twice 2906 * because of BIN_STEP skip 2907 */ 2908 do { 2909 plw_initialized = 0; 2910 2911 for (plw.plw_count = 0; 2912 plw.plw_count < page_colors; plw.plw_count++) { 2913 2914 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 2915 goto nextfreebin; 2916 2917 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2918 mutex_enter(pcm); 2919 pp = PAGE_FREELISTS(mnode, 
szc, bin, mtype); 2920 first_pp = pp; 2921 while (pp != NULL) { 2922 if (page_trylock(pp, SE_EXCL) == 0) { 2923 pp = pp->p_next; 2924 if (pp == first_pp) { 2925 pp = NULL; 2926 } 2927 continue; 2928 } 2929 2930 ASSERT(PP_ISFREE(pp)); 2931 ASSERT(PP_ISAGED(pp)); 2932 ASSERT(pp->p_vnode == NULL); 2933 ASSERT(pp->p_hash == NULL); 2934 ASSERT(pp->p_offset == (u_offset_t)-1); 2935 ASSERT(pp->p_szc == szc); 2936 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2937 /* check if page within DMA attributes */ 2938 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 2939 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 2940 (pgaddr + MMU_PAGESIZE - 1 <= 2941 dma_attr->dma_attr_addr_hi)) { 2942 break; 2943 } 2944 2945 /* continue looking */ 2946 page_unlock(pp); 2947 pp = pp->p_next; 2948 if (pp == first_pp) 2949 pp = NULL; 2950 2951 } 2952 if (pp != NULL) { 2953 ASSERT(mtype == PP_2_MTYPE(pp)); 2954 ASSERT(pp->p_szc == 0); 2955 2956 /* found a page with specified DMA attributes */ 2957 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 2958 mtype), pp); 2959 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2960 2961 if ((PP_ISFREE(pp) == 0) || 2962 (PP_ISAGED(pp) == 0)) { 2963 cmn_err(CE_PANIC, "page %p is not free", 2964 (void *)pp); 2965 } 2966 2967 mutex_exit(pcm); 2968 check_dma(dma_attr, pp, 1); 2969 VM_STAT_ADD(pga_vmstats.pgma_allocok); 2970 return (pp); 2971 } 2972 mutex_exit(pcm); 2973 nextfreebin: 2974 if (plw_initialized == 0) { 2975 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 2976 ASSERT(plw.plw_ceq_dif == page_colors); 2977 plw_initialized = 1; 2978 } 2979 2980 if (plw.plw_do_split) { 2981 pp = page_freelist_split(szc, bin, mnode, 2982 mtype, 2983 mmu_btop(dma_attr->dma_attr_addr_lo), 2984 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 2985 &plw); 2986 if (pp != NULL) { 2987 check_dma(dma_attr, pp, 1); 2988 return (pp); 2989 } 2990 } 2991 2992 bin = page_list_walk_next_bin(szc, bin, &plw); 2993 } 2994 2995 MTYPE_NEXT(mnode, mtype, flags); 2996 } while (mtype >= 0); 2997 2998 /* failed to 
find a page in the freelist; try it in the cachelist */ 2999 3000 /* reset mtype start for cachelist search */ 3001 mtype = mtypestart; 3002 ASSERT(mtype >= 0); 3003 3004 /* start with the bin of matching color */ 3005 bin = origbin; 3006 3007 do { 3008 for (i = 0; i <= page_colors; i++) { 3009 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 3010 goto nextcachebin; 3011 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3012 mutex_enter(pcm); 3013 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3014 first_pp = pp; 3015 while (pp != NULL) { 3016 if (page_trylock(pp, SE_EXCL) == 0) { 3017 pp = pp->p_next; 3018 if (pp == first_pp) 3019 pp = NULL; 3020 continue; 3021 } 3022 ASSERT(pp->p_vnode); 3023 ASSERT(PP_ISAGED(pp) == 0); 3024 ASSERT(pp->p_szc == 0); 3025 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3026 3027 /* check if page within DMA attributes */ 3028 3029 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 3030 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 3031 (pgaddr + MMU_PAGESIZE - 1 <= 3032 dma_attr->dma_attr_addr_hi)) { 3033 break; 3034 } 3035 3036 /* continue looking */ 3037 page_unlock(pp); 3038 pp = pp->p_next; 3039 if (pp == first_pp) 3040 pp = NULL; 3041 } 3042 3043 if (pp != NULL) { 3044 ASSERT(mtype == PP_2_MTYPE(pp)); 3045 ASSERT(pp->p_szc == 0); 3046 3047 /* found a page with specified DMA attributes */ 3048 page_sub(&PAGE_CACHELISTS(mnode, bin, 3049 mtype), pp); 3050 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3051 3052 mutex_exit(pcm); 3053 ASSERT(pp->p_vnode); 3054 ASSERT(PP_ISAGED(pp) == 0); 3055 check_dma(dma_attr, pp, 1); 3056 VM_STAT_ADD(pga_vmstats.pgma_allocok); 3057 return (pp); 3058 } 3059 mutex_exit(pcm); 3060 nextcachebin: 3061 bin += (i == 0) ? 
BIN_STEP : 1; 3062 bin &= page_colors_mask; 3063 } 3064 MTYPE_NEXT(mnode, mtype, flags); 3065 } while (mtype >= 0); 3066 3067 VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 3068 return (NULL); 3069 } 3070 3071 /* 3072 * This function is similar to page_get_freelist()/page_get_cachelist() 3073 * but it searches both the lists to find a page with the specified 3074 * color (or no color) and DMA attributes. The search is done in the 3075 * freelist first and then in the cache list within the highest memory 3076 * range (based on DMA attributes) before searching in the lower 3077 * memory ranges. 3078 * 3079 * Note: This function is called only by page_create_io(). 3080 */ 3081 /*ARGSUSED*/ 3082 static page_t * 3083 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 3084 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 3085 { 3086 uint_t bin; 3087 int mtype; 3088 page_t *pp; 3089 int n; 3090 int m; 3091 int szc; 3092 int fullrange; 3093 int mnode; 3094 int local_failed_stat = 0; 3095 lgrp_mnode_cookie_t lgrp_cookie; 3096 3097 VM_STAT_ADD(pga_vmstats.pga_alloc); 3098 3099 /* only base pagesize currently supported */ 3100 if (size != MMU_PAGESIZE) 3101 return (NULL); 3102 3103 /* 3104 * If we're passed a specific lgroup, we use it. Otherwise, 3105 * assume first-touch placement is desired. 3106 */ 3107 if (!LGRP_EXISTS(lgrp)) 3108 lgrp = lgrp_home_lgrp(); 3109 3110 /* LINTED */ 3111 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3112 3113 /* 3114 * Only hold one freelist or cachelist lock at a time, that way we 3115 * can start anywhere and not have to worry about lock 3116 * ordering. 3117 */ 3118 if (dma_attr == NULL) { 3119 n = 0; 3120 m = mnoderangecnt - 1; 3121 fullrange = 1; 3122 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 3123 } else { 3124 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 3125 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 3126 3127 /* 3128 * We can guarantee alignment only for page boundary. 
3129 */ 3130 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 3131 return (NULL); 3132 3133 n = pfn_2_mtype(pfnlo); 3134 m = pfn_2_mtype(pfnhi); 3135 3136 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 3137 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 3138 } 3139 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 3140 3141 if (n > m) 3142 return (NULL); 3143 3144 szc = 0; 3145 3146 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 3147 if (n == 0) { 3148 flags |= PGI_MT_RANGE0; 3149 n = m; 3150 } 3151 3152 /* 3153 * Try local memory node first, but try remote if we can't 3154 * get a page of the right color. 3155 */ 3156 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 3157 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3158 /* 3159 * allocate pages from high pfn to low. 3160 */ 3161 for (mtype = m; mtype >= n; mtype--) { 3162 if (fullrange != 0) { 3163 pp = page_get_mnode_freelist(mnode, 3164 bin, mtype, szc, flags); 3165 if (pp == NULL) { 3166 pp = page_get_mnode_cachelist( 3167 bin, flags, mnode, mtype); 3168 } 3169 } else { 3170 pp = page_get_mnode_anylist(bin, szc, 3171 flags, mnode, mtype, dma_attr); 3172 } 3173 if (pp != NULL) { 3174 VM_STAT_ADD(pga_vmstats.pga_allocok); 3175 check_dma(dma_attr, pp, 1); 3176 return (pp); 3177 } 3178 } 3179 if (!local_failed_stat) { 3180 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3181 local_failed_stat = 1; 3182 } 3183 } 3184 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 3185 3186 return (NULL); 3187 } 3188 3189 /* 3190 * page_create_io() 3191 * 3192 * This function is a copy of page_create_va() with an additional 3193 * argument 'mattr' that specifies DMA memory requirements to 3194 * the page list functions. This function is used by the segkmem 3195 * allocator so it is only to create new pages (i.e PG_EXCL is 3196 * set). 
3197 * 3198 * Note: This interface is currently used by x86 PSM only and is 3199 * not fully specified so the commitment level is only for 3200 * private interface specific to x86. This interface uses PSM 3201 * specific page_get_anylist() interface. 3202 */ 3203 3204 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 3205 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 3206 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 3207 break; \ 3208 } \ 3209 } 3210 3211 3212 page_t * 3213 page_create_io( 3214 struct vnode *vp, 3215 u_offset_t off, 3216 uint_t bytes, 3217 uint_t flags, 3218 struct as *as, 3219 caddr_t vaddr, 3220 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 3221 { 3222 page_t *plist = NULL; 3223 uint_t plist_len = 0; 3224 pgcnt_t npages; 3225 page_t *npp = NULL; 3226 uint_t pages_req; 3227 page_t *pp; 3228 kmutex_t *phm = NULL; 3229 uint_t index; 3230 3231 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 3232 "page_create_start:vp %p off %llx bytes %u flags %x", 3233 vp, off, bytes, flags); 3234 3235 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 3236 3237 pages_req = npages = mmu_btopr(bytes); 3238 3239 /* 3240 * Do the freemem and pcf accounting. 3241 */ 3242 if (!page_create_wait(npages, flags)) { 3243 return (NULL); 3244 } 3245 3246 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 3247 "page_create_success:vp %p off %llx", vp, off); 3248 3249 /* 3250 * If satisfying this request has left us with too little 3251 * memory, start the wheels turning to get some back. The 3252 * first clause of the test prevents waking up the pageout 3253 * daemon in situations where it would decide that there's 3254 * nothing to do. 
3255 */ 3256 if (nscan < desscan && freemem < minfree) { 3257 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 3258 "pageout_cv_signal:freemem %ld", freemem); 3259 cv_signal(&proc_pageout->p_cv); 3260 } 3261 3262 if (flags & PG_PHYSCONTIG) { 3263 3264 plist = page_get_contigpage(&npages, mattr, 1); 3265 if (plist == NULL) { 3266 page_create_putback(npages); 3267 return (NULL); 3268 } 3269 3270 pp = plist; 3271 3272 do { 3273 if (!page_hashin(pp, vp, off, NULL)) { 3274 panic("pg_creat_io: hashin failed %p %p %llx", 3275 (void *)pp, (void *)vp, off); 3276 } 3277 VM_STAT_ADD(page_create_new); 3278 off += MMU_PAGESIZE; 3279 PP_CLRFREE(pp); 3280 PP_CLRAGED(pp); 3281 page_set_props(pp, P_REF); 3282 pp = pp->p_next; 3283 } while (pp != plist); 3284 3285 if (!npages) { 3286 check_dma(mattr, plist, pages_req); 3287 return (plist); 3288 } else { 3289 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 3290 } 3291 3292 /* 3293 * fall-thru: 3294 * 3295 * page_get_contigpage returns when npages <= sgllen. 3296 * Grab the rest of the non-contig pages below from anylist. 3297 */ 3298 } 3299 3300 /* 3301 * Loop around collecting the requested number of pages. 3302 * Most of the time, we have to `create' a new page. With 3303 * this in mind, pull the page off the free list before 3304 * getting the hash lock. This will minimize the hash 3305 * lock hold time, nesting, and the like. If it turns 3306 * out we don't need the page, we put it back at the end. 3307 */ 3308 while (npages--) { 3309 phm = NULL; 3310 3311 index = PAGE_HASH_FUNC(vp, off); 3312 top: 3313 ASSERT(phm == NULL); 3314 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 3315 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3316 3317 if (npp == NULL) { 3318 /* 3319 * Try to get the page of any color either from 3320 * the freelist or from the cache list. 
3321 */ 3322 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 3323 flags & ~PG_MATCH_COLOR, mattr, NULL); 3324 if (npp == NULL) { 3325 if (mattr == NULL) { 3326 /* 3327 * Not looking for a special page; 3328 * panic! 3329 */ 3330 panic("no page found %d", (int)npages); 3331 } 3332 /* 3333 * No page found! This can happen 3334 * if we are looking for a page 3335 * within a specific memory range 3336 * for DMA purposes. If PG_WAIT is 3337 * specified then we wait for a 3338 * while and then try again. The 3339 * wait could be forever if we 3340 * don't get the page(s) we need. 3341 * 3342 * Note: XXX We really need a mechanism 3343 * to wait for pages in the desired 3344 * range. For now, we wait for any 3345 * pages and see if we can use it. 3346 */ 3347 3348 if ((mattr != NULL) && (flags & PG_WAIT)) { 3349 delay(10); 3350 goto top; 3351 } 3352 goto fail; /* undo accounting stuff */ 3353 } 3354 3355 if (PP_ISAGED(npp) == 0) { 3356 /* 3357 * Since this page came from the 3358 * cachelist, we must destroy the 3359 * old vnode association. 3360 */ 3361 page_hashout(npp, (kmutex_t *)NULL); 3362 } 3363 } 3364 3365 /* 3366 * We own this page! 3367 */ 3368 ASSERT(PAGE_EXCL(npp)); 3369 ASSERT(npp->p_vnode == NULL); 3370 ASSERT(!hat_page_is_mapped(npp)); 3371 PP_CLRFREE(npp); 3372 PP_CLRAGED(npp); 3373 3374 /* 3375 * Here we have a page in our hot little mits and are 3376 * just waiting to stuff it on the appropriate lists. 3377 * Get the mutex and check to see if it really does 3378 * not exist. 3379 */ 3380 phm = PAGE_HASH_MUTEX(index); 3381 mutex_enter(phm); 3382 PAGE_HASH_SEARCH(index, pp, vp, off); 3383 if (pp == NULL) { 3384 VM_STAT_ADD(page_create_new); 3385 pp = npp; 3386 npp = NULL; 3387 if (!page_hashin(pp, vp, off, phm)) { 3388 /* 3389 * Since we hold the page hash mutex and 3390 * just searched for this page, page_hashin 3391 * had better not fail. If it does, that 3392 * means somethread did not follow the 3393 * page hash mutex rules. 
Panic now and 3394 * get it over with. As usual, go down 3395 * holding all the locks. 3396 */ 3397 ASSERT(MUTEX_HELD(phm)); 3398 panic("page_create: hashin fail %p %p %llx %p", 3399 (void *)pp, (void *)vp, off, (void *)phm); 3400 3401 } 3402 ASSERT(MUTEX_HELD(phm)); 3403 mutex_exit(phm); 3404 phm = NULL; 3405 3406 /* 3407 * Hat layer locking need not be done to set 3408 * the following bits since the page is not hashed 3409 * and was on the free list (i.e., had no mappings). 3410 * 3411 * Set the reference bit to protect 3412 * against immediate pageout 3413 * 3414 * XXXmh modify freelist code to set reference 3415 * bit so we don't have to do it here. 3416 */ 3417 page_set_props(pp, P_REF); 3418 } else { 3419 ASSERT(MUTEX_HELD(phm)); 3420 mutex_exit(phm); 3421 phm = NULL; 3422 /* 3423 * NOTE: This should not happen for pages associated 3424 * with kernel vnode 'kvp'. 3425 */ 3426 /* XX64 - to debug why this happens! */ 3427 ASSERT(!VN_ISKAS(vp)); 3428 if (VN_ISKAS(vp)) 3429 cmn_err(CE_NOTE, 3430 "page_create: page not expected " 3431 "in hash list for kernel vnode - pp 0x%p", 3432 (void *)pp); 3433 VM_STAT_ADD(page_create_exists); 3434 goto fail; 3435 } 3436 3437 /* 3438 * Got a page! It is locked. Acquire the i/o 3439 * lock since we are going to use the p_next and 3440 * p_prev fields to link the requested pages together. 3441 */ 3442 page_io_lock(pp); 3443 page_add(&plist, pp); 3444 plist = plist->p_next; 3445 off += MMU_PAGESIZE; 3446 vaddr += MMU_PAGESIZE; 3447 } 3448 3449 check_dma(mattr, plist, pages_req); 3450 return (plist); 3451 3452 fail: 3453 if (npp != NULL) { 3454 /* 3455 * Did not need this page after all. 3456 * Put it back on the free list. 3457 */ 3458 VM_STAT_ADD(page_create_putbacks); 3459 PP_SETFREE(npp); 3460 PP_SETAGED(npp); 3461 npp->p_offset = (u_offset_t)-1; 3462 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 3463 page_unlock(npp); 3464 } 3465 3466 /* 3467 * Give up the pages we already got. 
3468 */ 3469 while (plist != NULL) { 3470 pp = plist; 3471 page_sub(&plist, pp); 3472 page_io_unlock(pp); 3473 plist_len++; 3474 /*LINTED: constant in conditional ctx*/ 3475 VN_DISPOSE(pp, B_INVAL, 0, kcred); 3476 } 3477 3478 /* 3479 * VN_DISPOSE does freemem accounting for the pages in plist 3480 * by calling page_free. So, we need to undo the pcf accounting 3481 * for only the remaining pages. 3482 */ 3483 VM_STAT_ADD(page_create_putbacks); 3484 page_create_putback(pages_req - plist_len); 3485 3486 return (NULL); 3487 } 3488 #endif /* !__xpv */ 3489 3490 3491 /* 3492 * Copy the data from the physical page represented by "frompp" to 3493 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 3494 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 3495 * level and no one sleeps with an active mapping there. 3496 * 3497 * Note that the ref/mod bits in the page_t's are not affected by 3498 * this operation, hence it is up to the caller to update them appropriately. 
3499 */ 3500 int 3501 ppcopy(page_t *frompp, page_t *topp) 3502 { 3503 caddr_t pp_addr1; 3504 caddr_t pp_addr2; 3505 hat_mempte_t pte1; 3506 hat_mempte_t pte2; 3507 kmutex_t *ppaddr_mutex; 3508 label_t ljb; 3509 int ret = 1; 3510 3511 ASSERT_STACK_ALIGNED(); 3512 ASSERT(PAGE_LOCKED(frompp)); 3513 ASSERT(PAGE_LOCKED(topp)); 3514 3515 if (kpm_enable) { 3516 pp_addr1 = hat_kpm_page2va(frompp, 0); 3517 pp_addr2 = hat_kpm_page2va(topp, 0); 3518 kpreempt_disable(); 3519 } else { 3520 /* 3521 * disable pre-emption so that CPU can't change 3522 */ 3523 kpreempt_disable(); 3524 3525 pp_addr1 = CPU->cpu_caddr1; 3526 pp_addr2 = CPU->cpu_caddr2; 3527 pte1 = CPU->cpu_caddr1pte; 3528 pte2 = CPU->cpu_caddr2pte; 3529 3530 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3531 mutex_enter(ppaddr_mutex); 3532 3533 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 3534 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 3535 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 3536 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3537 HAT_LOAD_NOCONSIST); 3538 } 3539 3540 if (on_fault(&ljb)) { 3541 ret = 0; 3542 goto faulted; 3543 } 3544 if (use_sse_pagecopy) 3545 #ifdef __xpv 3546 page_copy_no_xmm(pp_addr2, pp_addr1); 3547 #else 3548 hwblkpagecopy(pp_addr1, pp_addr2); 3549 #endif 3550 else 3551 bcopy(pp_addr1, pp_addr2, PAGESIZE); 3552 3553 no_fault(); 3554 faulted: 3555 if (!kpm_enable) { 3556 #ifdef __xpv 3557 /* 3558 * We can't leave unused mappings laying about under the 3559 * hypervisor, so blow them away. 
3560 */ 3561 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0, 3562 UVMF_INVLPG | UVMF_LOCAL) < 0) 3563 panic("HYPERVISOR_update_va_mapping() failed"); 3564 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3565 UVMF_INVLPG | UVMF_LOCAL) < 0) 3566 panic("HYPERVISOR_update_va_mapping() failed"); 3567 #endif 3568 mutex_exit(ppaddr_mutex); 3569 } 3570 kpreempt_enable(); 3571 return (ret); 3572 } 3573 3574 void 3575 pagezero(page_t *pp, uint_t off, uint_t len) 3576 { 3577 ASSERT(PAGE_LOCKED(pp)); 3578 pfnzero(page_pptonum(pp), off, len); 3579 } 3580 3581 /* 3582 * Zero the physical page from off to off + len given by pfn 3583 * without changing the reference and modified bits of page. 3584 * 3585 * We use this using CPU private page address #2, see ppcopy() for more info. 3586 * pfnzero() must not be called at interrupt level. 3587 */ 3588 void 3589 pfnzero(pfn_t pfn, uint_t off, uint_t len) 3590 { 3591 caddr_t pp_addr2; 3592 hat_mempte_t pte2; 3593 kmutex_t *ppaddr_mutex = NULL; 3594 3595 ASSERT_STACK_ALIGNED(); 3596 ASSERT(len <= MMU_PAGESIZE); 3597 ASSERT(off <= MMU_PAGESIZE); 3598 ASSERT(off + len <= MMU_PAGESIZE); 3599 3600 if (kpm_enable && !pfn_is_foreign(pfn)) { 3601 pp_addr2 = hat_kpm_pfn2va(pfn); 3602 kpreempt_disable(); 3603 } else { 3604 kpreempt_disable(); 3605 3606 pp_addr2 = CPU->cpu_caddr2; 3607 pte2 = CPU->cpu_caddr2pte; 3608 3609 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3610 mutex_enter(ppaddr_mutex); 3611 3612 hat_mempte_remap(pfn, pp_addr2, pte2, 3613 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3614 HAT_LOAD_NOCONSIST); 3615 } 3616 3617 if (use_sse_pagezero) { 3618 #ifdef __xpv 3619 uint_t rem; 3620 3621 /* 3622 * zero a byte at a time until properly aligned for 3623 * block_zero_no_xmm(). 3624 */ 3625 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0) 3626 pp_addr2[off++] = 0; 3627 3628 /* 3629 * Now use faster block_zero_no_xmm() for any range 3630 * that is properly aligned and sized. 
3631 */ 3632 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN)); 3633 len -= rem; 3634 if (len != 0) { 3635 block_zero_no_xmm(pp_addr2 + off, len); 3636 off += len; 3637 } 3638 3639 /* 3640 * zero remainder with byte stores. 3641 */ 3642 while (rem-- > 0) 3643 pp_addr2[off++] = 0; 3644 #else 3645 hwblkclr(pp_addr2 + off, len); 3646 #endif 3647 } else { 3648 bzero(pp_addr2 + off, len); 3649 } 3650 3651 if (!kpm_enable || pfn_is_foreign(pfn)) { 3652 #ifdef __xpv 3653 /* 3654 * On the hypervisor this page might get used for a page 3655 * table before any intervening change to this mapping, 3656 * so blow it away. 3657 */ 3658 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3659 UVMF_INVLPG) < 0) 3660 panic("HYPERVISOR_update_va_mapping() failed"); 3661 #endif 3662 mutex_exit(ppaddr_mutex); 3663 } 3664 3665 kpreempt_enable(); 3666 } 3667 3668 /* 3669 * Platform-dependent page scrub call. 3670 */ 3671 void 3672 pagescrub(page_t *pp, uint_t off, uint_t len) 3673 { 3674 /* 3675 * For now, we rely on the fact that pagezero() will 3676 * always clear UEs. 
3677 */ 3678 pagezero(pp, off, len); 3679 } 3680 3681 /* 3682 * set up two private addresses for use on a given CPU for use in ppcopy() 3683 */ 3684 void 3685 setup_vaddr_for_ppcopy(struct cpu *cpup) 3686 { 3687 void *addr; 3688 hat_mempte_t pte_pa; 3689 3690 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3691 pte_pa = hat_mempte_setup(addr); 3692 cpup->cpu_caddr1 = addr; 3693 cpup->cpu_caddr1pte = pte_pa; 3694 3695 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3696 pte_pa = hat_mempte_setup(addr); 3697 cpup->cpu_caddr2 = addr; 3698 cpup->cpu_caddr2pte = pte_pa; 3699 3700 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 3701 } 3702 3703 /* 3704 * Undo setup_vaddr_for_ppcopy 3705 */ 3706 void 3707 teardown_vaddr_for_ppcopy(struct cpu *cpup) 3708 { 3709 mutex_destroy(&cpup->cpu_ppaddr_mutex); 3710 3711 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 3712 cpup->cpu_caddr2pte = 0; 3713 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 3714 cpup->cpu_caddr2 = 0; 3715 3716 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 3717 cpup->cpu_caddr1pte = 0; 3718 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 3719 cpup->cpu_caddr1 = 0; 3720 } 3721 3722 /* 3723 * Create the pageout scanner thread. The thread has to 3724 * start at procedure with process pp and priority pri. 3725 */ 3726 void 3727 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) 3728 { 3729 (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); 3730 } 3731 3732 /* 3733 * Function for flushing D-cache when performing module relocations 3734 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 3735 */ 3736 void 3737 dcache_flushall() 3738 {} 3739 3740 size_t 3741 exec_get_spslew(void) 3742 { 3743 return (0); 3744 } 3745 3746 /* 3747 * Allocate a memory page. The argument 'seed' can be any pseudo-random 3748 * number to vary where the pages come from. 
This is quite a hacked up 3749 * method -- it works for now, but really needs to be fixed up a bit. 3750 * 3751 * We currently use page_create_va() on the kvp with fake offsets, 3752 * segments and virt address. This is pretty bogus, but was copied from the 3753 * old hat_i86.c code. A better approach would be to specify either mnode 3754 * random or mnode local and takes a page from whatever color has the MOST 3755 * available - this would have a minimal impact on page coloring. 3756 */ 3757 page_t * 3758 page_get_physical(uintptr_t seed) 3759 { 3760 page_t *pp; 3761 u_offset_t offset; 3762 static struct seg tmpseg; 3763 static uintptr_t ctr = 0; 3764 3765 /* 3766 * This code is gross, we really need a simpler page allocator. 3767 * 3768 * We need to assign an offset for the page to call page_create_va() 3769 * To avoid conflicts with other pages, we get creative with the offset. 3770 * For 32 bits, we need an offset > 4Gig 3771 * For 64 bits, need an offset somewhere in the VA hole. 3772 */ 3773 offset = seed; 3774 if (offset > kernelbase) 3775 offset -= kernelbase; 3776 offset <<= MMU_PAGESHIFT; 3777 #if defined(__amd64) 3778 offset += mmu.hole_start; /* something in VA hole */ 3779 #else 3780 offset += 1ULL << 40; /* something > 4 Gig */ 3781 #endif 3782 3783 if (page_resv(1, KM_NOSLEEP) == 0) 3784 return (NULL); 3785 3786 #ifdef DEBUG 3787 pp = page_exists(&kvp, offset); 3788 if (pp != NULL) 3789 panic("page already exists %p", (void *)pp); 3790 #endif 3791 3792 pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL, 3793 &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */ 3794 if (pp != NULL) { 3795 page_io_unlock(pp); 3796 page_hashout(pp, NULL); 3797 page_downgrade(pp); 3798 } 3799 return (pp); 3800 } 3801