1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 /* 37 * UNIX machine dependent virtual memory support. 
38 */ 39 40 #include <sys/types.h> 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/user.h> 44 #include <sys/proc.h> 45 #include <sys/kmem.h> 46 #include <sys/vmem.h> 47 #include <sys/buf.h> 48 #include <sys/cpuvar.h> 49 #include <sys/lgrp.h> 50 #include <sys/disp.h> 51 #include <sys/vm.h> 52 #include <sys/mman.h> 53 #include <sys/vnode.h> 54 #include <sys/cred.h> 55 #include <sys/exec.h> 56 #include <sys/exechdr.h> 57 #include <sys/debug.h> 58 #include <sys/vmsystm.h> 59 60 #include <vm/hat.h> 61 #include <vm/as.h> 62 #include <vm/seg.h> 63 #include <vm/seg_kp.h> 64 #include <vm/seg_vn.h> 65 #include <vm/page.h> 66 #include <vm/seg_kmem.h> 67 #include <vm/seg_kpm.h> 68 #include <vm/vm_dep.h> 69 70 #include <sys/cpu.h> 71 #include <sys/vm_machparam.h> 72 #include <sys/memlist.h> 73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */ 74 #include <vm/hat_i86.h> 75 #include <sys/x86_archext.h> 76 #include <sys/elf_386.h> 77 #include <sys/cmn_err.h> 78 #include <sys/archsystm.h> 79 #include <sys/machsystm.h> 80 81 #include <sys/vtrace.h> 82 #include <sys/ddidmareq.h> 83 #include <sys/promif.h> 84 #include <sys/memnode.h> 85 #include <sys/stack.h> 86 #include <util/qsort.h> 87 #include <sys/taskq.h> 88 89 #ifdef __xpv 90 91 #include <sys/hypervisor.h> 92 #include <sys/xen_mmu.h> 93 #include <sys/balloon_impl.h> 94 95 /* 96 * domain 0 pages usable for DMA are kept pre-allocated and kept in 97 * distinct lists, ordered by increasing mfn. 
98 */ 99 static kmutex_t io_pool_lock; 100 static kmutex_t contig_list_lock; 101 static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */ 102 static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */ 103 static long io_pool_cnt; 104 static long io_pool_cnt_max = 0; 105 #define DEFAULT_IO_POOL_MIN 128 106 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN; 107 static long io_pool_cnt_lowater = 0; 108 static long io_pool_shrink_attempts; /* how many times did we try to shrink */ 109 static long io_pool_shrinks; /* how many times did we really shrink */ 110 static long io_pool_grows; /* how many times did we grow */ 111 static mfn_t start_mfn = 1; 112 static caddr_t io_pool_kva; /* use to alloc pages when needed */ 113 114 static int create_contig_pfnlist(uint_t); 115 116 /* 117 * percentage of phys mem to hold in the i/o pool 118 */ 119 #define DEFAULT_IO_POOL_PCT 2 120 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT; 121 static void page_io_pool_sub(page_t **, page_t *, page_t *); 122 int ioalloc_dbg = 0; 123 124 #endif /* __xpv */ 125 126 uint_t vac_colors = 1; 127 128 int largepagesupport = 0; 129 extern uint_t page_create_new; 130 extern uint_t page_create_exists; 131 extern uint_t page_create_putbacks; 132 extern uint_t page_create_putbacks; 133 /* 134 * Allow users to disable the kernel's use of SSE. 135 */ 136 extern int use_sse_pagecopy, use_sse_pagezero; 137 138 /* 139 * combined memory ranges from mnode and memranges[] to manage single 140 * mnode/mtype dimension in the page lists. 
141 */ 142 typedef struct { 143 pfn_t mnr_pfnlo; 144 pfn_t mnr_pfnhi; 145 int mnr_mnode; 146 int mnr_memrange; /* index into memranges[] */ 147 /* maintain page list stats */ 148 pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */ 149 pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */ 150 pgcnt_t mnr_mt_totcnt; /* sum of cache and free lists */ 151 #ifdef DEBUG 152 struct mnr_mts { /* mnode/mtype szc stats */ 153 pgcnt_t mnr_mts_pgcnt; 154 int mnr_mts_colors; 155 pgcnt_t *mnr_mtsc_pgcnt; 156 } *mnr_mts; 157 #endif 158 } mnoderange_t; 159 160 #define MEMRANGEHI(mtype) \ 161 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax) 162 #define MEMRANGELO(mtype) (memranges[mtype]) 163 164 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt) 165 166 /* 167 * As the PC architecture evolved memory up was clumped into several 168 * ranges for various historical I/O devices to do DMA. 169 * < 16Meg - ISA bus 170 * < 2Gig - ??? 171 * < 4Gig - PCI bus or drivers that don't understand PAE mode 172 * 173 * These are listed in reverse order, so that we can skip over unused 174 * ranges on machines with small memories. 175 * 176 * For now under the Hypervisor, we'll only ever have one memrange. 177 */ 178 #define PFN_4GIG 0x100000 179 #define PFN_16MEG 0x1000 180 static pfn_t arch_memranges[NUM_MEM_RANGES] = { 181 PFN_4GIG, /* pfn range for 4G and above */ 182 0x80000, /* pfn range for 2G-4G */ 183 PFN_16MEG, /* pfn range for 16M-2G */ 184 0x00000, /* pfn range for 0-16M */ 185 }; 186 pfn_t *memranges = &arch_memranges[0]; 187 int nranges = NUM_MEM_RANGES; 188 189 /* 190 * This combines mem_node_config and memranges into one data 191 * structure to be used for page list management. 192 */ 193 mnoderange_t *mnoderanges; 194 int mnoderangecnt; 195 int mtype4g; 196 197 /* 198 * 4g memory management variables for systems with more than 4g of memory: 199 * 200 * physical memory below 4g is required for 32bit dma devices and, currently, 201 * for kmem memory. 
On systems with more than 4g of memory, the pool of memory 202 * below 4g can be depleted without any paging activity given that there is 203 * likely to be sufficient memory above 4g. 204 * 205 * physmax4g is set true if the largest pfn is over 4g. The rest of the 206 * 4g memory management code is enabled only when physmax4g is true. 207 * 208 * maxmem4g is the count of the maximum number of pages on the page lists 209 * with physical addresses below 4g. It can be a lot less then 4g given that 210 * BIOS may reserve large chunks of space below 4g for hot plug pci devices, 211 * agp aperture etc. 212 * 213 * freemem4g maintains the count of the number of available pages on the 214 * page lists with physical addresses below 4g. 215 * 216 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to 217 * 6% (desfree4gshift = 4) of maxmem4g. 218 * 219 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G 220 * and the amount of physical memory above 4g is greater than freemem4g. 221 * In this case, page_get_* routines will restrict below 4g allocations 222 * for requests that don't specifically require it. 223 */ 224 225 #define LOTSFREE4G (maxmem4g >> lotsfree4gshift) 226 #define DESFREE4G (maxmem4g >> desfree4gshift) 227 228 #define RESTRICT4G_ALLOC \ 229 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem)) 230 231 static pgcnt_t maxmem4g; 232 static pgcnt_t freemem4g; 233 static int physmax4g; 234 static int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */ 235 static int lotsfree4gshift = 3; 236 237 /* 238 * 16m memory management: 239 * 240 * reserve some amount of physical memory below 16m for legacy devices. 241 * 242 * RESTRICT16M_ALLOC returns true if an there are sufficient free pages above 243 * 16m or if the 16m pool drops below DESFREE16M. 244 * 245 * In this case, general page allocations via page_get_{free,cache}list 246 * routines will be restricted from allocating from the 16m pool. 
Allocations 247 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations 248 * are not restricted. 249 */ 250 251 #define FREEMEM16M MTYPE_FREEMEM(0) 252 #define DESFREE16M desfree16m 253 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \ 254 ((freemem != 0) && ((flags & PG_PANIC) == 0) && \ 255 ((freemem >= (FREEMEM16M)) || \ 256 (FREEMEM16M < (DESFREE16M + pgcnt)))) 257 258 static pgcnt_t desfree16m = 0x380; 259 260 /* 261 * This can be patched via /etc/system to allow old non-PAE aware device 262 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM. 263 */ 264 int restricted_kmemalloc = 0; 265 266 #ifdef VM_STATS 267 struct { 268 ulong_t pga_alloc; 269 ulong_t pga_notfullrange; 270 ulong_t pga_nulldmaattr; 271 ulong_t pga_allocok; 272 ulong_t pga_allocfailed; 273 ulong_t pgma_alloc; 274 ulong_t pgma_allocok; 275 ulong_t pgma_allocfailed; 276 ulong_t pgma_allocempty; 277 } pga_vmstats; 278 #endif 279 280 uint_t mmu_page_sizes; 281 282 /* How many page sizes the users can see */ 283 uint_t mmu_exported_page_sizes; 284 285 /* page sizes that legacy applications can see */ 286 uint_t mmu_legacy_page_sizes; 287 288 /* 289 * Number of pages in 1 GB. Don't enable automatic large pages if we have 290 * fewer than this many pages. 291 */ 292 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 293 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT); 294 295 /* 296 * Maximum and default segment size tunables for user private 297 * and shared anon memory, and user text and initialized data. 298 * These can be patched via /etc/system to allow large pages 299 * to be used for mapping application private and shared anon memory. 
300 */ 301 size_t mcntl0_lpsize = MMU_PAGESIZE; 302 size_t max_uheap_lpsize = MMU_PAGESIZE; 303 size_t default_uheap_lpsize = MMU_PAGESIZE; 304 size_t max_ustack_lpsize = MMU_PAGESIZE; 305 size_t default_ustack_lpsize = MMU_PAGESIZE; 306 size_t max_privmap_lpsize = MMU_PAGESIZE; 307 size_t max_uidata_lpsize = MMU_PAGESIZE; 308 size_t max_utext_lpsize = MMU_PAGESIZE; 309 size_t max_shm_lpsize = MMU_PAGESIZE; 310 311 312 /* 313 * initialized by page_coloring_init(). 314 */ 315 uint_t page_colors; 316 uint_t page_colors_mask; 317 uint_t page_coloring_shift; 318 int cpu_page_colors; 319 static uint_t l2_colors; 320 321 /* 322 * Page freelists and cachelists are dynamically allocated once mnoderangecnt 323 * and page_colors are calculated from the l2 cache n-way set size. Within a 324 * mnode range, the page freelist and cachelist are hashed into bins based on 325 * color. This makes it easier to search for a page within a specific memory 326 * range. 327 */ 328 #define PAGE_COLORS_MIN 16 329 330 page_t ****page_freelists; 331 page_t ***page_cachelists; 332 333 334 /* 335 * Used by page layer to know about page sizes 336 */ 337 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; 338 339 kmutex_t *fpc_mutex[NPC_MUTEX]; 340 kmutex_t *cpc_mutex[NPC_MUTEX]; 341 342 /* 343 * Only let one thread at a time try to coalesce large pages, to 344 * prevent them from working against each other. 
345 */ 346 static kmutex_t contig_lock; 347 #define CONTIG_LOCK() mutex_enter(&contig_lock); 348 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 349 350 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 351 352 /* 353 * Return the optimum page size for a given mapping 354 */ 355 /*ARGSUSED*/ 356 size_t 357 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 358 { 359 level_t l = 0; 360 size_t pgsz = MMU_PAGESIZE; 361 size_t max_lpsize; 362 uint_t mszc; 363 364 ASSERT(maptype != MAPPGSZ_VA); 365 366 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 367 return (MMU_PAGESIZE); 368 } 369 370 switch (maptype) { 371 case MAPPGSZ_HEAP: 372 case MAPPGSZ_STK: 373 max_lpsize = memcntl ? mcntl0_lpsize : (maptype == 374 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize); 375 if (max_lpsize == MMU_PAGESIZE) { 376 return (MMU_PAGESIZE); 377 } 378 if (len == 0) { 379 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase + 380 p->p_brksize - p->p_bssbase : p->p_stksize; 381 } 382 len = (maptype == MAPPGSZ_HEAP) ? MAX(len, 383 default_uheap_lpsize) : MAX(len, default_ustack_lpsize); 384 385 /* 386 * use the pages size that best fits len 387 */ 388 for (l = mmu.umax_page_level; l > 0; --l) { 389 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) { 390 continue; 391 } else { 392 pgsz = LEVEL_SIZE(l); 393 } 394 break; 395 } 396 397 mszc = (maptype == MAPPGSZ_HEAP ? 
p->p_brkpageszc : 398 p->p_stkpageszc); 399 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) { 400 pgsz = hw_page_array[mszc].hp_size; 401 } 402 return (pgsz); 403 404 case MAPPGSZ_ISM: 405 for (l = mmu.umax_page_level; l > 0; --l) { 406 if (len >= LEVEL_SIZE(l)) 407 return (LEVEL_SIZE(l)); 408 } 409 return (LEVEL_SIZE(0)); 410 } 411 return (pgsz); 412 } 413 414 static uint_t 415 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize, 416 size_t min_physmem) 417 { 418 caddr_t eaddr = addr + size; 419 uint_t szcvec = 0; 420 caddr_t raddr; 421 caddr_t readdr; 422 size_t pgsz; 423 int i; 424 425 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 426 return (0); 427 } 428 429 for (i = mmu_exported_page_sizes - 1; i > 0; i--) { 430 pgsz = page_get_pagesize(i); 431 if (pgsz > max_lpsize) { 432 continue; 433 } 434 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 435 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 436 if (raddr < addr || raddr >= readdr) { 437 continue; 438 } 439 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 440 continue; 441 } 442 /* 443 * Set szcvec to the remaining page sizes. 444 */ 445 szcvec = ((1 << (i + 1)) - 1) & ~1; 446 break; 447 } 448 return (szcvec); 449 } 450 451 /* 452 * Return a bit vector of large page size codes that 453 * can be used to map [addr, addr + len) region. 
454 */ 455 /*ARGSUSED*/ 456 uint_t 457 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 458 int memcntl) 459 { 460 size_t max_lpsize = mcntl0_lpsize; 461 462 if (mmu.max_page_level == 0) 463 return (0); 464 465 if (flags & MAP_TEXT) { 466 if (!memcntl) 467 max_lpsize = max_utext_lpsize; 468 return (map_szcvec(addr, size, off, max_lpsize, 469 shm_lpg_min_physmem)); 470 471 } else if (flags & MAP_INITDATA) { 472 if (!memcntl) 473 max_lpsize = max_uidata_lpsize; 474 return (map_szcvec(addr, size, off, max_lpsize, 475 privm_lpg_min_physmem)); 476 477 } else if (type == MAPPGSZC_SHM) { 478 if (!memcntl) 479 max_lpsize = max_shm_lpsize; 480 return (map_szcvec(addr, size, off, max_lpsize, 481 shm_lpg_min_physmem)); 482 483 } else if (type == MAPPGSZC_HEAP) { 484 if (!memcntl) 485 max_lpsize = max_uheap_lpsize; 486 return (map_szcvec(addr, size, off, max_lpsize, 487 privm_lpg_min_physmem)); 488 489 } else if (type == MAPPGSZC_STACK) { 490 if (!memcntl) 491 max_lpsize = max_ustack_lpsize; 492 return (map_szcvec(addr, size, off, max_lpsize, 493 privm_lpg_min_physmem)); 494 495 } else { 496 if (!memcntl) 497 max_lpsize = max_privmap_lpsize; 498 return (map_szcvec(addr, size, off, max_lpsize, 499 privm_lpg_min_physmem)); 500 } 501 } 502 503 /* 504 * Handle a pagefault. 505 */ 506 faultcode_t 507 pagefault( 508 caddr_t addr, 509 enum fault_type type, 510 enum seg_rw rw, 511 int iskernel) 512 { 513 struct as *as; 514 struct hat *hat; 515 struct proc *p; 516 kthread_t *t; 517 faultcode_t res; 518 caddr_t base; 519 size_t len; 520 int err; 521 int mapped_red; 522 uintptr_t ea; 523 524 ASSERT_STACK_ALIGNED(); 525 526 if (INVALID_VADDR(addr)) 527 return (FC_NOMAP); 528 529 mapped_red = segkp_map_red(); 530 531 if (iskernel) { 532 as = &kas; 533 hat = as->a_hat; 534 } else { 535 t = curthread; 536 p = ttoproc(t); 537 as = p->p_as; 538 hat = as->a_hat; 539 } 540 541 /* 542 * Dispatch pagefault. 
543 */ 544 res = as_fault(hat, as, addr, 1, type, rw); 545 546 /* 547 * If this isn't a potential unmapped hole in the user's 548 * UNIX data or stack segments, just return status info. 549 */ 550 if (res != FC_NOMAP || iskernel) 551 goto out; 552 553 /* 554 * Check to see if we happened to faulted on a currently unmapped 555 * part of the UNIX data or stack segments. If so, create a zfod 556 * mapping there and then try calling the fault routine again. 557 */ 558 base = p->p_brkbase; 559 len = p->p_brksize; 560 561 if (addr < base || addr >= base + len) { /* data seg? */ 562 base = (caddr_t)p->p_usrstack - p->p_stksize; 563 len = p->p_stksize; 564 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */ 565 /* not in either UNIX data or stack segments */ 566 res = FC_NOMAP; 567 goto out; 568 } 569 } 570 571 /* 572 * the rest of this function implements a 3.X 4.X 5.X compatibility 573 * This code is probably not needed anymore 574 */ 575 if (p->p_model == DATAMODEL_ILP32) { 576 577 /* expand the gap to the page boundaries on each side */ 578 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE); 579 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE); 580 len = ea - (uintptr_t)base; 581 582 as_rangelock(as); 583 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) == 584 0) { 585 err = as_map(as, base, len, segvn_create, zfod_argsp); 586 as_rangeunlock(as); 587 if (err) { 588 res = FC_MAKE_ERR(err); 589 goto out; 590 } 591 } else { 592 /* 593 * This page is already mapped by another thread after 594 * we returned from as_fault() above. We just fall 595 * through as_fault() below. 596 */ 597 as_rangeunlock(as); 598 } 599 600 res = as_fault(hat, as, addr, 1, F_INVAL, rw); 601 } 602 603 out: 604 if (mapped_red) 605 segkp_unmap_red(); 606 607 return (res); 608 } 609 610 void 611 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) 612 { 613 struct proc *p = curproc; 614 caddr_t userlimit = (flags & _MAP_LOW32) ? 
615 (caddr_t)_userlimit32 : p->p_as->a_userlimit; 616 617 map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); 618 } 619 620 /*ARGSUSED*/ 621 int 622 map_addr_vacalign_check(caddr_t addr, u_offset_t off) 623 { 624 return (0); 625 } 626 627 /* 628 * map_addr_proc() is the routine called when the system is to 629 * choose an address for the user. We will pick an address 630 * range which is the highest available below userlimit. 631 * 632 * Every mapping will have a redzone of a single page on either side of 633 * the request. This is done to leave one page unmapped between segments. 634 * This is not required, but it's useful for the user because if their 635 * program strays across a segment boundary, it will catch a fault 636 * immediately making debugging a little easier. Currently the redzone 637 * is mandatory. 638 * 639 * addrp is a value/result parameter. 640 * On input it is a hint from the user to be used in a completely 641 * machine dependent fashion. We decide to completely ignore this hint. 642 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which 643 * must be some "power of two" multiple of pagesize. 644 * 645 * On output it is NULL if no address can be found in the current 646 * processes address space or else an address that is currently 647 * not mapped for len bytes with a page of red zone on either side. 648 * 649 * vacalign is not needed on x86 (it's for viturally addressed caches) 650 */ 651 /*ARGSUSED*/ 652 void 653 map_addr_proc( 654 caddr_t *addrp, 655 size_t len, 656 offset_t off, 657 int vacalign, 658 caddr_t userlimit, 659 struct proc *p, 660 uint_t flags) 661 { 662 struct as *as = p->p_as; 663 caddr_t addr; 664 caddr_t base; 665 size_t slen; 666 size_t align_amount; 667 668 ASSERT32(userlimit == as->a_userlimit); 669 670 base = p->p_brkbase; 671 #if defined(__amd64) 672 /* 673 * XX64 Yes, this needs more work. 
674 */ 675 if (p->p_model == DATAMODEL_NATIVE) { 676 if (userlimit < as->a_userlimit) { 677 /* 678 * This happens when a program wants to map 679 * something in a range that's accessible to a 680 * program in a smaller address space. For example, 681 * a 64-bit program calling mmap32(2) to guarantee 682 * that the returned address is below 4Gbytes. 683 */ 684 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff)); 685 686 if (userlimit > base) 687 slen = userlimit - base; 688 else { 689 *addrp = NULL; 690 return; 691 } 692 } else { 693 /* 694 * XX64 This layout is probably wrong .. but in 695 * the event we make the amd64 address space look 696 * like sparcv9 i.e. with the stack -above- the 697 * heap, this bit of code might even be correct. 698 */ 699 slen = p->p_usrstack - base - 700 (((size_t)rctl_enforced_value( 701 rctlproc_legacy[RLIMIT_STACK], 702 p->p_rctls, p) + PAGEOFFSET) & PAGEMASK); 703 } 704 } else 705 #endif 706 slen = userlimit - base; 707 708 /* Make len be a multiple of PAGESIZE */ 709 len = (len + PAGEOFFSET) & PAGEMASK; 710 711 /* 712 * figure out what the alignment should be 713 * 714 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same???? 715 */ 716 if (len <= ELF_386_MAXPGSZ) { 717 /* 718 * Align virtual addresses to ensure that ELF shared libraries 719 * are mapped with the appropriate alignment constraints by 720 * the run-time linker. 721 */ 722 align_amount = ELF_386_MAXPGSZ; 723 } else { 724 int l = mmu.umax_page_level; 725 726 while (l && len < LEVEL_SIZE(l)) 727 --l; 728 729 align_amount = LEVEL_SIZE(l); 730 } 731 732 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount)) 733 align_amount = (uintptr_t)*addrp; 734 735 ASSERT(ISP2(align_amount)); 736 ASSERT(align_amount == 0 || align_amount >= PAGESIZE); 737 738 off = off & (align_amount - 1); 739 /* 740 * Look for a large enough hole starting below userlimit. 741 * After finding it, use the upper part. 
742 */ 743 if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount, 744 PAGESIZE, off) == 0) { 745 caddr_t as_addr; 746 747 /* 748 * addr is the highest possible address to use since we have 749 * a PAGESIZE redzone at the beginning and end. 750 */ 751 addr = base + slen - (PAGESIZE + len); 752 as_addr = addr; 753 /* 754 * Round address DOWN to the alignment amount and 755 * add the offset in. 756 * If addr is greater than as_addr, len would not be large 757 * enough to include the redzone, so we must adjust down 758 * by the alignment amount. 759 */ 760 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 761 addr += (uintptr_t)off; 762 if (addr > as_addr) { 763 addr -= align_amount; 764 } 765 766 ASSERT(addr > base); 767 ASSERT(addr + len < base + slen); 768 ASSERT(((uintptr_t)addr & (align_amount - 1)) == 769 ((uintptr_t)(off))); 770 *addrp = addr; 771 } else { 772 *addrp = NULL; /* no more virtual space */ 773 } 774 } 775 776 int valid_va_range_aligned_wraparound; 777 778 /* 779 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 780 * addresses at least "minlen" long, where the base of the range is at "off" 781 * phase from an "align" boundary and there is space for a "redzone"-sized 782 * redzone on either side of the range. On success, 1 is returned and *basep 783 * and *lenp are adjusted to describe the acceptable range (including 784 * the redzone). On failure, 0 is returned. 785 */ 786 /*ARGSUSED3*/ 787 int 788 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir, 789 size_t align, size_t redzone, size_t off) 790 { 791 uintptr_t hi, lo; 792 size_t tot_len; 793 794 ASSERT(align == 0 ? off == 0 : off < align); 795 ASSERT(ISP2(align)); 796 ASSERT(align == 0 || align >= PAGESIZE); 797 798 lo = (uintptr_t)*basep; 799 hi = lo + *lenp; 800 tot_len = minlen + 2 * redzone; /* need at least this much space */ 801 802 /* 803 * If hi rolled over the top, try cutting back. 
804 */ 805 if (hi < lo) { 806 *lenp = 0UL - lo - 1UL; 807 /* See if this really happens. If so, then we figure out why */ 808 valid_va_range_aligned_wraparound++; 809 hi = lo + *lenp; 810 } 811 if (*lenp < tot_len) { 812 return (0); 813 } 814 815 #if defined(__amd64) 816 /* 817 * Deal with a possible hole in the address range between 818 * hole_start and hole_end that should never be mapped. 819 */ 820 if (lo < hole_start) { 821 if (hi > hole_start) { 822 if (hi < hole_end) { 823 hi = hole_start; 824 } else { 825 /* lo < hole_start && hi >= hole_end */ 826 if (dir == AH_LO) { 827 /* 828 * prefer lowest range 829 */ 830 if (hole_start - lo >= tot_len) 831 hi = hole_start; 832 else if (hi - hole_end >= tot_len) 833 lo = hole_end; 834 else 835 return (0); 836 } else { 837 /* 838 * prefer highest range 839 */ 840 if (hi - hole_end >= tot_len) 841 lo = hole_end; 842 else if (hole_start - lo >= tot_len) 843 hi = hole_start; 844 else 845 return (0); 846 } 847 } 848 } 849 } else { 850 /* lo >= hole_start */ 851 if (hi < hole_end) 852 return (0); 853 if (lo < hole_end) 854 lo = hole_end; 855 } 856 #endif 857 858 if (hi - lo < tot_len) 859 return (0); 860 861 if (align > 1) { 862 uintptr_t tlo = lo + redzone; 863 uintptr_t thi = hi - redzone; 864 tlo = (uintptr_t)P2PHASEUP(tlo, align, off); 865 if (tlo < lo + redzone) { 866 return (0); 867 } 868 if (thi < tlo || thi - tlo < minlen) { 869 return (0); 870 } 871 } 872 873 *basep = (caddr_t)lo; 874 *lenp = hi - lo; 875 return (1); 876 } 877 878 /* 879 * Determine whether [*basep, *basep + *lenp) contains a mappable range of 880 * addresses at least "minlen" long. On success, 1 is returned and *basep 881 * and *lenp are adjusted to describe the acceptable range. On failure, 0 882 * is returned. 
883 */ 884 int 885 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 886 { 887 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0)); 888 } 889 890 /* 891 * Determine whether [addr, addr+len] are valid user addresses. 892 */ 893 /*ARGSUSED*/ 894 int 895 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 896 caddr_t userlimit) 897 { 898 caddr_t eaddr = addr + len; 899 900 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 901 return (RANGE_BADADDR); 902 903 #if defined(__amd64) 904 /* 905 * Check for the VA hole 906 */ 907 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 908 return (RANGE_BADADDR); 909 #endif 910 911 return (RANGE_OKAY); 912 } 913 914 /* 915 * Return 1 if the page frame is onboard memory, else 0. 916 */ 917 int 918 pf_is_memory(pfn_t pf) 919 { 920 if (pfn_is_foreign(pf)) 921 return (0); 922 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1)); 923 } 924 925 /* 926 * return the memrange containing pfn 927 */ 928 int 929 memrange_num(pfn_t pfn) 930 { 931 int n; 932 933 for (n = 0; n < nranges - 1; ++n) { 934 if (pfn >= memranges[n]) 935 break; 936 } 937 return (n); 938 } 939 940 /* 941 * return the mnoderange containing pfn 942 */ 943 /*ARGSUSED*/ 944 int 945 pfn_2_mtype(pfn_t pfn) 946 { 947 #if defined(__xpv) 948 return (0); 949 #else 950 int n; 951 952 for (n = mnoderangecnt - 1; n >= 0; n--) { 953 if (pfn >= mnoderanges[n].mnr_pfnlo) { 954 break; 955 } 956 } 957 return (n); 958 #endif 959 } 960 961 #if !defined(__xpv) 962 /* 963 * is_contigpage_free: 964 * returns a page list of contiguous pages. It minimally has to return 965 * minctg pages. Caller determines minctg based on the scatter-gather 966 * list length. 967 * 968 * pfnp is set to the next page frame to search on return. 
969 */ 970 static page_t * 971 is_contigpage_free( 972 pfn_t *pfnp, 973 pgcnt_t *pgcnt, 974 pgcnt_t minctg, 975 uint64_t pfnseg, 976 int iolock) 977 { 978 int i = 0; 979 pfn_t pfn = *pfnp; 980 page_t *pp; 981 page_t *plist = NULL; 982 983 /* 984 * fail if pfn + minctg crosses a segment boundary. 985 * Adjust for next starting pfn to begin at segment boundary. 986 */ 987 988 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 989 *pfnp = roundup(*pfnp, pfnseg + 1); 990 return (NULL); 991 } 992 993 do { 994 retry: 995 pp = page_numtopp_nolock(pfn + i); 996 if ((pp == NULL) || 997 (page_trylock(pp, SE_EXCL) == 0)) { 998 (*pfnp)++; 999 break; 1000 } 1001 if (page_pptonum(pp) != pfn + i) { 1002 page_unlock(pp); 1003 goto retry; 1004 } 1005 1006 if (!(PP_ISFREE(pp))) { 1007 page_unlock(pp); 1008 (*pfnp)++; 1009 break; 1010 } 1011 1012 if (!PP_ISAGED(pp)) { 1013 page_list_sub(pp, PG_CACHE_LIST); 1014 page_hashout(pp, (kmutex_t *)NULL); 1015 } else { 1016 page_list_sub(pp, PG_FREE_LIST); 1017 } 1018 1019 if (iolock) 1020 page_io_lock(pp); 1021 page_list_concat(&plist, &pp); 1022 1023 /* 1024 * exit loop when pgcnt satisfied or segment boundary reached. 1025 */ 1026 1027 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 1028 1029 *pfnp += i; /* set to next pfn to search */ 1030 1031 if (i >= minctg) { 1032 *pgcnt -= i; 1033 return (plist); 1034 } 1035 1036 /* 1037 * failure: minctg not satisfied. 1038 * 1039 * if next request crosses segment boundary, set next pfn 1040 * to search from the segment boundary. 
1041 */ 1042 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 1043 *pfnp = roundup(*pfnp, pfnseg + 1); 1044 1045 /* clean up any pages already allocated */ 1046 1047 while (plist) { 1048 pp = plist; 1049 page_sub(&plist, pp); 1050 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 1051 if (iolock) 1052 page_io_unlock(pp); 1053 page_unlock(pp); 1054 } 1055 1056 return (NULL); 1057 } 1058 #endif /* !__xpv */ 1059 1060 /* 1061 * verify that pages being returned from allocator have correct DMA attribute 1062 */ 1063 #ifndef DEBUG 1064 #define check_dma(a, b, c) (0) 1065 #else 1066 static void 1067 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 1068 { 1069 if (dma_attr == NULL) 1070 return; 1071 1072 while (cnt-- > 0) { 1073 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) < 1074 dma_attr->dma_attr_addr_lo) 1075 panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp); 1076 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >= 1077 dma_attr->dma_attr_addr_hi) 1078 panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp); 1079 pp = pp->p_next; 1080 } 1081 } 1082 #endif 1083 1084 #if !defined(__xpv) 1085 static page_t * 1086 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 1087 { 1088 pfn_t pfn; 1089 int sgllen; 1090 uint64_t pfnseg; 1091 pgcnt_t minctg; 1092 page_t *pplist = NULL, *plist; 1093 uint64_t lo, hi; 1094 pgcnt_t pfnalign = 0; 1095 static pfn_t startpfn; 1096 static pgcnt_t lastctgcnt; 1097 uintptr_t align; 1098 1099 CONTIG_LOCK(); 1100 1101 if (mattr) { 1102 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 1103 hi = mmu_btop(mattr->dma_attr_addr_hi); 1104 if (hi >= physmax) 1105 hi = physmax - 1; 1106 sgllen = mattr->dma_attr_sgllen; 1107 pfnseg = mmu_btop(mattr->dma_attr_seg); 1108 1109 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 1110 if (align > MMU_PAGESIZE) 1111 pfnalign = mmu_btop(align); 1112 1113 /* 1114 * in order to satisfy the request, must minimally 1115 * acquire minctg contiguous pages 1116 */ 1117 minctg 
= howmany(*pgcnt, sgllen); 1118 1119 ASSERT(hi >= lo); 1120 1121 /* 1122 * start from where last searched if the minctg >= lastctgcnt 1123 */ 1124 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 1125 startpfn = lo; 1126 } else { 1127 hi = physmax - 1; 1128 lo = 0; 1129 sgllen = 1; 1130 pfnseg = mmu.highest_pfn; 1131 minctg = *pgcnt; 1132 1133 if (minctg < lastctgcnt) 1134 startpfn = lo; 1135 } 1136 lastctgcnt = minctg; 1137 1138 ASSERT(pfnseg + 1 >= (uint64_t)minctg); 1139 1140 /* conserve 16m memory - start search above 16m when possible */ 1141 if (hi > PFN_16M && startpfn < PFN_16M) 1142 startpfn = PFN_16M; 1143 1144 pfn = startpfn; 1145 if (pfnalign) 1146 pfn = P2ROUNDUP(pfn, pfnalign); 1147 1148 while (pfn + minctg - 1 <= hi) { 1149 1150 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1151 if (plist) { 1152 page_list_concat(&pplist, &plist); 1153 sgllen--; 1154 /* 1155 * return when contig pages no longer needed 1156 */ 1157 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1158 startpfn = pfn; 1159 CONTIG_UNLOCK(); 1160 check_dma(mattr, pplist, *pgcnt); 1161 return (pplist); 1162 } 1163 minctg = howmany(*pgcnt, sgllen); 1164 } 1165 if (pfnalign) 1166 pfn = P2ROUNDUP(pfn, pfnalign); 1167 } 1168 1169 /* cannot find contig pages in specified range */ 1170 if (startpfn == lo) { 1171 CONTIG_UNLOCK(); 1172 return (NULL); 1173 } 1174 1175 /* did not start with lo previously */ 1176 pfn = lo; 1177 if (pfnalign) 1178 pfn = P2ROUNDUP(pfn, pfnalign); 1179 1180 /* allow search to go above startpfn */ 1181 while (pfn < startpfn) { 1182 1183 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1184 if (plist != NULL) { 1185 1186 page_list_concat(&pplist, &plist); 1187 sgllen--; 1188 1189 /* 1190 * return when contig pages no longer needed 1191 */ 1192 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1193 startpfn = pfn; 1194 CONTIG_UNLOCK(); 1195 check_dma(mattr, pplist, *pgcnt); 1196 return (pplist); 1197 } 1198 minctg = 
howmany(*pgcnt, sgllen); 1199 } 1200 if (pfnalign) 1201 pfn = P2ROUNDUP(pfn, pfnalign); 1202 } 1203 CONTIG_UNLOCK(); 1204 return (NULL); 1205 } 1206 #endif /* !__xpv */ 1207 1208 /* 1209 * mnode_range_cnt() calculates the number of memory ranges for mnode and 1210 * memranges[]. Used to determine the size of page lists and mnoderanges. 1211 */ 1212 int 1213 mnode_range_cnt(int mnode) 1214 { 1215 #if defined(__xpv) 1216 ASSERT(mnode == 0); 1217 return (1); 1218 #else /* __xpv */ 1219 int mri; 1220 int mnrcnt = 0; 1221 1222 if (mem_node_config[mnode].exists != 0) { 1223 mri = nranges - 1; 1224 1225 /* find the memranges index below contained in mnode range */ 1226 1227 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1228 mri--; 1229 1230 /* 1231 * increment mnode range counter when memranges or mnode 1232 * boundary is reached. 1233 */ 1234 while (mri >= 0 && 1235 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1236 mnrcnt++; 1237 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1238 mri--; 1239 else 1240 break; 1241 } 1242 } 1243 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 1244 return (mnrcnt); 1245 #endif /* __xpv */ 1246 } 1247 1248 /* 1249 * mnode_range_setup() initializes mnoderanges. 
1250 */ 1251 void 1252 mnode_range_setup(mnoderange_t *mnoderanges) 1253 { 1254 int mnode, mri; 1255 1256 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 1257 if (mem_node_config[mnode].exists == 0) 1258 continue; 1259 1260 mri = nranges - 1; 1261 1262 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1263 mri--; 1264 1265 while (mri >= 0 && mem_node_config[mnode].physmax >= 1266 MEMRANGELO(mri)) { 1267 mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri), 1268 mem_node_config[mnode].physbase); 1269 mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri), 1270 mem_node_config[mnode].physmax); 1271 mnoderanges->mnr_mnode = mnode; 1272 mnoderanges->mnr_memrange = mri; 1273 mnoderanges++; 1274 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1275 mri--; 1276 else 1277 break; 1278 } 1279 } 1280 } 1281 1282 /*ARGSUSED*/ 1283 int 1284 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) 1285 { 1286 int mtype = mnoderangecnt - 1; 1287 1288 #if !defined(__xpv) 1289 #if defined(__i386) 1290 /* 1291 * set the mtype range 1292 * - kmem requests needs to be below 4g if restricted_kmemalloc is set. 1293 * - for non kmem requests, set range to above 4g if memory below 4g 1294 * runs low. 
1295 */ 1296 if (restricted_kmemalloc && VN_ISKAS(vp) && 1297 (caddr_t)(vaddr) >= kernelheap && 1298 (caddr_t)(vaddr) < ekernelheap) { 1299 ASSERT(physmax4g); 1300 mtype = mtype4g; 1301 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz), 1302 btop(pgsz), *flags)) { 1303 *flags |= PGI_MT_RANGE16M; 1304 } else { 1305 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1306 VM_STAT_COND_ADD((*flags & PG_PANIC), 1307 vmm_vmstats.pgpanicalloc); 1308 *flags |= PGI_MT_RANGE0; 1309 } 1310 return (mtype); 1311 } 1312 #endif /* __i386 */ 1313 1314 if (RESTRICT4G_ALLOC) { 1315 VM_STAT_ADD(vmm_vmstats.restrict4gcnt); 1316 /* here only for > 4g systems */ 1317 *flags |= PGI_MT_RANGE4G; 1318 } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) { 1319 *flags |= PGI_MT_RANGE16M; 1320 } else { 1321 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1322 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc); 1323 *flags |= PGI_MT_RANGE0; 1324 } 1325 #endif /* !__xpv */ 1326 return (mtype); 1327 } 1328 1329 1330 /* mtype init for page_get_replacement_page */ 1331 /*ARGSUSED*/ 1332 int 1333 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt) 1334 { 1335 int mtype = mnoderangecnt - 1; 1336 #if !defined(__ixpv) 1337 if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) { 1338 *flags |= PGI_MT_RANGE16M; 1339 } else { 1340 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1341 *flags |= PGI_MT_RANGE0; 1342 } 1343 #endif 1344 return (mtype); 1345 } 1346 1347 /* 1348 * Determine if the mnode range specified in mtype contains memory belonging 1349 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1350 * the range of indices from high pfn to 0, 16m or 4g. 1351 * 1352 * Return first mnode range type index found otherwise return -1 if none found. 
1353 */ 1354 int 1355 mtype_func(int mnode, int mtype, uint_t flags) 1356 { 1357 if (flags & PGI_MT_RANGE) { 1358 int mtlim = 0; 1359 1360 if (flags & PGI_MT_NEXT) 1361 mtype--; 1362 if (flags & PGI_MT_RANGE4G) 1363 mtlim = mtype4g + 1; /* exclude 0-4g range */ 1364 else if (flags & PGI_MT_RANGE16M) 1365 mtlim = 1; /* exclude 0-16m range */ 1366 while (mtype >= mtlim) { 1367 if (mnoderanges[mtype].mnr_mnode == mnode) 1368 return (mtype); 1369 mtype--; 1370 } 1371 } else if (mnoderanges[mtype].mnr_mnode == mnode) { 1372 return (mtype); 1373 } 1374 return (-1); 1375 } 1376 1377 /* 1378 * Update the page list max counts with the pfn range specified by the 1379 * input parameters. Called from add_physmem() when physical memory with 1380 * page_t's are initially added to the page lists. 1381 */ 1382 void 1383 mtype_modify_max(pfn_t startpfn, long cnt) 1384 { 1385 int mtype = 0; 1386 pfn_t endpfn = startpfn + cnt, pfn; 1387 pgcnt_t inc; 1388 1389 ASSERT(cnt > 0); 1390 1391 if (!physmax4g) 1392 return; 1393 1394 for (pfn = startpfn; pfn < endpfn; ) { 1395 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1396 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1397 inc = endpfn - pfn; 1398 } else { 1399 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1400 } 1401 if (mtype <= mtype4g) 1402 maxmem4g += inc; 1403 pfn += inc; 1404 } 1405 mtype++; 1406 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1407 } 1408 } 1409 1410 int 1411 mtype_2_mrange(int mtype) 1412 { 1413 return (mnoderanges[mtype].mnr_memrange); 1414 } 1415 1416 void 1417 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 1418 { 1419 ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 1420 *pfnlo = mnoderanges[mtype].mnr_pfnlo; 1421 *pfnhi = mnoderanges[mtype].mnr_pfnhi; 1422 } 1423 1424 size_t 1425 plcnt_sz(size_t ctrs_sz) 1426 { 1427 #ifdef DEBUG 1428 int szc, colors; 1429 1430 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 1431 for (szc = 0; szc < mmu_page_sizes; szc++) { 1432 colors = 
page_get_pagecolors(szc); 1433 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 1434 } 1435 #endif 1436 return (ctrs_sz); 1437 } 1438 1439 caddr_t 1440 plcnt_init(caddr_t addr) 1441 { 1442 #ifdef DEBUG 1443 int mt, szc, colors; 1444 1445 for (mt = 0; mt < mnoderangecnt; mt++) { 1446 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 1447 addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 1448 for (szc = 0; szc < mmu_page_sizes; szc++) { 1449 colors = page_get_pagecolors(szc); 1450 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 1451 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 1452 (pgcnt_t *)addr; 1453 addr += (sizeof (pgcnt_t) * colors); 1454 } 1455 } 1456 #endif 1457 return (addr); 1458 } 1459 1460 void 1461 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 1462 { 1463 #ifdef DEBUG 1464 int bin = PP_2_BIN(pp); 1465 1466 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 1467 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 1468 cnt); 1469 #endif 1470 ASSERT(mtype == PP_2_MTYPE(pp)); 1471 if (physmax4g && mtype <= mtype4g) 1472 atomic_add_long(&freemem4g, cnt); 1473 if (flags & PG_CACHE_LIST) 1474 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 1475 else 1476 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt); 1477 atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt); 1478 } 1479 1480 /* 1481 * Returns the free page count for mnode 1482 */ 1483 int 1484 mnode_pgcnt(int mnode) 1485 { 1486 int mtype = mnoderangecnt - 1; 1487 int flags = PGI_MT_RANGE0; 1488 pgcnt_t pgcnt = 0; 1489 1490 mtype = mtype_func(mnode, mtype, flags); 1491 1492 while (mtype != -1) { 1493 pgcnt += MTYPE_FREEMEM(mtype); 1494 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1495 } 1496 return (pgcnt); 1497 } 1498 1499 /* 1500 * Initialize page coloring variables based on the l2 cache parameters. 1501 * Calculate and return memory needed for page coloring data structures. 
1502 */ 1503 size_t 1504 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 1505 { 1506 size_t colorsz = 0; 1507 int i; 1508 int colors; 1509 1510 #if defined(__xpv) 1511 /* 1512 * Hypervisor domains currently don't have any concept of NUMA. 1513 * Hence we'll act like there is only 1 memrange. 1514 */ 1515 i = memrange_num(1); 1516 #else /* !__xpv */ 1517 /* 1518 * Reduce the memory ranges lists if we don't have large amounts 1519 * of memory. This avoids searching known empty free lists. 1520 */ 1521 i = memrange_num(physmax); 1522 #if defined(__i386) 1523 if (i > 0) 1524 restricted_kmemalloc = 0; 1525 #endif 1526 /* physmax greater than 4g */ 1527 if (i == 0) 1528 physmax4g = 1; 1529 #endif /* !__xpv */ 1530 memranges += i; 1531 nranges -= i; 1532 1533 ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES); 1534 1535 ASSERT(ISP2(l2_linesz)); 1536 ASSERT(l2_sz > MMU_PAGESIZE); 1537 1538 /* l2_assoc is 0 for fully associative l2 cache */ 1539 if (l2_assoc) 1540 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 1541 else 1542 l2_colors = 1; 1543 1544 ASSERT(ISP2(l2_colors)); 1545 1546 /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 1547 page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 1548 1549 /* 1550 * cpu_page_colors is non-zero when a page color may be spread across 1551 * multiple bins. 
1552 */ 1553 if (l2_colors < page_colors) 1554 cpu_page_colors = l2_colors; 1555 1556 ASSERT(ISP2(page_colors)); 1557 1558 page_colors_mask = page_colors - 1; 1559 1560 ASSERT(ISP2(CPUSETSIZE())); 1561 page_coloring_shift = lowbit(CPUSETSIZE()); 1562 1563 /* initialize number of colors per page size */ 1564 for (i = 0; i <= mmu.max_page_level; i++) { 1565 hw_page_array[i].hp_size = LEVEL_SIZE(i); 1566 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 1567 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 1568 hw_page_array[i].hp_colors = (page_colors_mask >> 1569 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 1570 + 1; 1571 colorequivszc[i] = 0; 1572 } 1573 1574 /* 1575 * The value of cpu_page_colors determines if additional color bins 1576 * need to be checked for a particular color in the page_get routines. 1577 */ 1578 if (cpu_page_colors != 0) { 1579 1580 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 1581 ASSERT(a > 0); 1582 ASSERT(a < 16); 1583 1584 for (i = 0; i <= mmu.max_page_level; i++) { 1585 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1586 colorequivszc[i] = 0; 1587 continue; 1588 } 1589 while ((colors >> a) == 0) 1590 a--; 1591 ASSERT(a >= 0); 1592 1593 /* higher 4 bits encodes color equiv mask */ 1594 colorequivszc[i] = (a << 4); 1595 } 1596 } 1597 1598 /* factor in colorequiv to check additional 'equivalent' bins. 
*/ 1599 if (colorequiv > 1) { 1600 1601 int a = lowbit(colorequiv) - 1; 1602 if (a > 15) 1603 a = 15; 1604 1605 for (i = 0; i <= mmu.max_page_level; i++) { 1606 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1607 continue; 1608 } 1609 while ((colors >> a) == 0) 1610 a--; 1611 if ((a << 4) > colorequivszc[i]) { 1612 colorequivszc[i] = (a << 4); 1613 } 1614 } 1615 } 1616 1617 /* size for mnoderanges */ 1618 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 1619 mnoderangecnt += mnode_range_cnt(i); 1620 colorsz = mnoderangecnt * sizeof (mnoderange_t); 1621 1622 /* size for fpc_mutex and cpc_mutex */ 1623 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 1624 1625 /* size of page_freelists */ 1626 colorsz += mnoderangecnt * sizeof (page_t ***); 1627 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 1628 1629 for (i = 0; i < mmu_page_sizes; i++) { 1630 colors = page_get_pagecolors(i); 1631 colorsz += mnoderangecnt * colors * sizeof (page_t *); 1632 } 1633 1634 /* size of page_cachelists */ 1635 colorsz += mnoderangecnt * sizeof (page_t **); 1636 colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 1637 1638 return (colorsz); 1639 } 1640 1641 /* 1642 * Called once at startup to configure page_coloring data structures and 1643 * does the 1st page_free()/page_freelist_add(). 
1644 */ 1645 void 1646 page_coloring_setup(caddr_t pcmemaddr) 1647 { 1648 int i; 1649 int j; 1650 int k; 1651 caddr_t addr; 1652 int colors; 1653 1654 /* 1655 * do page coloring setup 1656 */ 1657 addr = pcmemaddr; 1658 1659 mnoderanges = (mnoderange_t *)addr; 1660 addr += (mnoderangecnt * sizeof (mnoderange_t)); 1661 1662 mnode_range_setup(mnoderanges); 1663 1664 if (physmax4g) 1665 mtype4g = pfn_2_mtype(0xfffff); 1666 1667 for (k = 0; k < NPC_MUTEX; k++) { 1668 fpc_mutex[k] = (kmutex_t *)addr; 1669 addr += (max_mem_nodes * sizeof (kmutex_t)); 1670 } 1671 for (k = 0; k < NPC_MUTEX; k++) { 1672 cpc_mutex[k] = (kmutex_t *)addr; 1673 addr += (max_mem_nodes * sizeof (kmutex_t)); 1674 } 1675 page_freelists = (page_t ****)addr; 1676 addr += (mnoderangecnt * sizeof (page_t ***)); 1677 1678 page_cachelists = (page_t ***)addr; 1679 addr += (mnoderangecnt * sizeof (page_t **)); 1680 1681 for (i = 0; i < mnoderangecnt; i++) { 1682 page_freelists[i] = (page_t ***)addr; 1683 addr += (mmu_page_sizes * sizeof (page_t **)); 1684 1685 for (j = 0; j < mmu_page_sizes; j++) { 1686 colors = page_get_pagecolors(j); 1687 page_freelists[i][j] = (page_t **)addr; 1688 addr += (colors * sizeof (page_t *)); 1689 } 1690 page_cachelists[i] = (page_t **)addr; 1691 addr += (page_colors * sizeof (page_t *)); 1692 } 1693 } 1694 1695 #if defined(__xpv) 1696 /* 1697 * Give back 10% of the io_pool pages to the free list. 1698 * Don't shrink the pool below some absolute minimum. 1699 */ 1700 static void 1701 page_io_pool_shrink() 1702 { 1703 int retcnt; 1704 page_t *pp, *pp_first, *pp_last, **curpool; 1705 mfn_t mfn; 1706 int bothpools = 0; 1707 1708 mutex_enter(&io_pool_lock); 1709 io_pool_shrink_attempts++; /* should be a kstat? */ 1710 retcnt = io_pool_cnt / 10; 1711 if (io_pool_cnt - retcnt < io_pool_cnt_min) 1712 retcnt = io_pool_cnt - io_pool_cnt_min; 1713 if (retcnt <= 0) 1714 goto done; 1715 io_pool_shrinks++; /* should be a kstat? 
*/ 1716 curpool = &io_pool_4g; 1717 domore: 1718 /* 1719 * Loop through taking pages from the end of the list 1720 * (highest mfns) till amount to return reached. 1721 */ 1722 for (pp = *curpool; pp && retcnt > 0; ) { 1723 pp_first = pp_last = pp->p_prev; 1724 if (pp_first == *curpool) 1725 break; 1726 retcnt--; 1727 io_pool_cnt--; 1728 page_io_pool_sub(curpool, pp_first, pp_last); 1729 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn) 1730 start_mfn = mfn; 1731 page_free(pp_first, 1); 1732 pp = *curpool; 1733 } 1734 if (retcnt != 0 && !bothpools) { 1735 /* 1736 * If not enough found in less constrained pool try the 1737 * more constrained one. 1738 */ 1739 curpool = &io_pool_16m; 1740 bothpools = 1; 1741 goto domore; 1742 } 1743 done: 1744 mutex_exit(&io_pool_lock); 1745 } 1746 1747 #endif /* __xpv */ 1748 1749 uint_t 1750 page_create_update_flags_x86(uint_t flags) 1751 { 1752 #if defined(__xpv) 1753 /* 1754 * Check this is an urgent allocation and free pages are depleted. 1755 */ 1756 if (!(flags & PG_WAIT) && freemem < desfree) 1757 page_io_pool_shrink(); 1758 #else /* !__xpv */ 1759 /* 1760 * page_create_get_something may call this because 4g memory may be 1761 * depleted. Set flags to allow for relocation of base page below 1762 * 4g if necessary. 
1763 */ 1764 if (physmax4g) 1765 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1766 #endif /* __xpv */ 1767 return (flags); 1768 } 1769 1770 /*ARGSUSED*/ 1771 int 1772 bp_color(struct buf *bp) 1773 { 1774 return (0); 1775 } 1776 1777 #if defined(__xpv) 1778 1779 /* 1780 * Take pages out of an io_pool 1781 */ 1782 static void 1783 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last) 1784 { 1785 if (*poolp == pp_first) { 1786 *poolp = pp_last->p_next; 1787 if (*poolp == pp_first) 1788 *poolp = NULL; 1789 } 1790 pp_first->p_prev->p_next = pp_last->p_next; 1791 pp_last->p_next->p_prev = pp_first->p_prev; 1792 pp_first->p_prev = pp_last; 1793 pp_last->p_next = pp_first; 1794 } 1795 1796 /* 1797 * Put a page on the io_pool list. The list is ordered by increasing MFN. 1798 */ 1799 static void 1800 page_io_pool_add(page_t **poolp, page_t *pp) 1801 { 1802 page_t *look; 1803 mfn_t mfn = mfn_list[pp->p_pagenum]; 1804 1805 if (*poolp == NULL) { 1806 *poolp = pp; 1807 pp->p_next = pp; 1808 pp->p_prev = pp; 1809 return; 1810 } 1811 1812 /* 1813 * Since we try to take pages from the high end of the pool 1814 * chances are good that the pages to be put on the list will 1815 * go at or near the end of the list. so start at the end and 1816 * work backwards. 1817 */ 1818 look = (*poolp)->p_prev; 1819 while (mfn < mfn_list[look->p_pagenum]) { 1820 look = look->p_prev; 1821 if (look == (*poolp)->p_prev) 1822 break; /* backed all the way to front of list */ 1823 } 1824 1825 /* insert after look */ 1826 pp->p_prev = look; 1827 pp->p_next = look->p_next; 1828 pp->p_next->p_prev = pp; 1829 look->p_next = pp; 1830 if (mfn < mfn_list[(*poolp)->p_pagenum]) { 1831 /* 1832 * we inserted a new first list element 1833 * adjust pool pointer to newly inserted element 1834 */ 1835 *poolp = pp; 1836 } 1837 } 1838 1839 /* 1840 * Add a page to the io_pool. Setting the force flag will force the page 1841 * into the io_pool no matter what. 
1842 */ 1843 static void 1844 add_page_to_pool(page_t *pp, int force) 1845 { 1846 page_t *highest; 1847 page_t *freep = NULL; 1848 1849 mutex_enter(&io_pool_lock); 1850 /* 1851 * Always keep the scarce low memory pages 1852 */ 1853 if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 1854 ++io_pool_cnt; 1855 page_io_pool_add(&io_pool_16m, pp); 1856 goto done; 1857 } 1858 if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) { 1859 ++io_pool_cnt; 1860 page_io_pool_add(&io_pool_4g, pp); 1861 } else { 1862 highest = io_pool_4g->p_prev; 1863 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 1864 page_io_pool_sub(&io_pool_4g, highest, highest); 1865 page_io_pool_add(&io_pool_4g, pp); 1866 freep = highest; 1867 } else { 1868 freep = pp; 1869 } 1870 } 1871 done: 1872 mutex_exit(&io_pool_lock); 1873 if (freep) 1874 page_free(freep, 1); 1875 } 1876 1877 1878 int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 1879 int contig_pfn_max; /* capacity of the contig pfn list */ 1880 int next_alloc_pfn; /* next position in list to start a contig search */ 1881 int contig_pfnlist_updates; /* pfn list update count */ 1882 int contig_pfnlist_builds; /* how many times have we (re)built list */ 1883 int contig_pfnlist_buildfailed; /* how many times has list build failed */ 1884 int create_contig_pending; /* nonzero means taskq creating contig list */ 1885 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 1886 1887 /* 1888 * Function to use in sorting a list of pfns by their underlying mfns. 1889 */ 1890 static int 1891 mfn_compare(const void *pfnp1, const void *pfnp2) 1892 { 1893 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 1894 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 1895 1896 if (mfn1 > mfn2) 1897 return (1); 1898 if (mfn1 < mfn2) 1899 return (-1); 1900 return (0); 1901 } 1902 1903 /* 1904 * Compact the contig_pfn_list by tossing all the non-contiguous 1905 * elements from the list. 
1906 */ 1907 static void 1908 compact_contig_pfn_list(void) 1909 { 1910 pfn_t pfn, lapfn, prev_lapfn; 1911 mfn_t mfn; 1912 int i, newcnt = 0; 1913 1914 prev_lapfn = 0; 1915 for (i = 0; i < contig_pfn_cnt - 1; i++) { 1916 pfn = contig_pfn_list[i]; 1917 lapfn = contig_pfn_list[i + 1]; 1918 mfn = mfn_list[pfn]; 1919 /* 1920 * See if next pfn is for a contig mfn 1921 */ 1922 if (mfn_list[lapfn] != mfn + 1) 1923 continue; 1924 /* 1925 * pfn and lookahead are both put in list 1926 * unless pfn is the previous lookahead. 1927 */ 1928 if (pfn != prev_lapfn) 1929 contig_pfn_list[newcnt++] = pfn; 1930 contig_pfn_list[newcnt++] = lapfn; 1931 prev_lapfn = lapfn; 1932 } 1933 for (i = newcnt; i < contig_pfn_cnt; i++) 1934 contig_pfn_list[i] = 0; 1935 contig_pfn_cnt = newcnt; 1936 } 1937 1938 /*ARGSUSED*/ 1939 static void 1940 call_create_contiglist(void *arg) 1941 { 1942 (void) create_contig_pfnlist(PG_WAIT); 1943 } 1944 1945 /* 1946 * Create list of freelist pfns that have underlying 1947 * contiguous mfns. The list is kept in ascending mfn order. 1948 * returns 1 if list created else 0. 1949 */ 1950 static int 1951 create_contig_pfnlist(uint_t flags) 1952 { 1953 pfn_t pfn; 1954 page_t *pp; 1955 int ret = 1; 1956 1957 mutex_enter(&contig_list_lock); 1958 if (contig_pfn_list != NULL) 1959 goto out; 1960 contig_pfn_max = freemem + (freemem / 10); 1961 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 1962 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 1963 if (contig_pfn_list == NULL) { 1964 /* 1965 * If we could not create the contig list (because 1966 * we could not sleep for memory). Dispatch a taskq that can 1967 * sleep to get the memory. 
1968 */ 1969 if (!create_contig_pending) { 1970 if (taskq_dispatch(system_taskq, call_create_contiglist, 1971 NULL, TQ_NOSLEEP) != NULL) 1972 create_contig_pending = 1; 1973 } 1974 contig_pfnlist_buildfailed++; /* count list build failures */ 1975 ret = 0; 1976 goto out; 1977 } 1978 create_contig_pending = 0; 1979 ASSERT(contig_pfn_cnt == 0); 1980 for (pfn = 0; pfn < mfn_count; pfn++) { 1981 pp = page_numtopp_nolock(pfn); 1982 if (pp == NULL || !PP_ISFREE(pp)) 1983 continue; 1984 contig_pfn_list[contig_pfn_cnt] = pfn; 1985 if (++contig_pfn_cnt == contig_pfn_max) 1986 break; 1987 } 1988 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 1989 compact_contig_pfn_list(); 1990 /* 1991 * Make sure next search of the newly created contiguous pfn 1992 * list starts at the beginning of the list. 1993 */ 1994 next_alloc_pfn = 0; 1995 contig_pfnlist_builds++; /* count list builds */ 1996 out: 1997 mutex_exit(&contig_list_lock); 1998 return (ret); 1999 } 2000 2001 2002 /* 2003 * Toss the current contig pfnlist. Someone is about to do a massive 2004 * update to pfn<->mfn mappings. So we have them destroy the list and lock 2005 * it till they are done with their update. 2006 */ 2007 void 2008 clear_and_lock_contig_pfnlist() 2009 { 2010 pfn_t *listp = NULL; 2011 size_t listsize; 2012 2013 mutex_enter(&contig_list_lock); 2014 if (contig_pfn_list != NULL) { 2015 listp = contig_pfn_list; 2016 listsize = contig_pfn_max * sizeof (pfn_t); 2017 contig_pfn_list = NULL; 2018 contig_pfn_max = contig_pfn_cnt = 0; 2019 } 2020 if (listp != NULL) 2021 kmem_free(listp, listsize); 2022 } 2023 2024 /* 2025 * Unlock the contig_pfn_list. The next attempted use of it will cause 2026 * it to be re-created. 
2027 */ 2028 void 2029 unlock_contig_pfnlist() 2030 { 2031 mutex_exit(&contig_list_lock); 2032 } 2033 2034 /* 2035 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 2036 */ 2037 void 2038 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 2039 { 2040 int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 2041 pfn_t probe_pfn; 2042 mfn_t probe_mfn; 2043 int drop_lock = 0; 2044 2045 if (mutex_owner(&contig_list_lock) != curthread) { 2046 drop_lock = 1; 2047 mutex_enter(&contig_list_lock); 2048 } 2049 if (contig_pfn_list == NULL) 2050 goto done; 2051 contig_pfnlist_updates++; 2052 /* 2053 * Find the pfn in the current list. Use a binary chop to locate it. 2054 */ 2055 probe_hi = contig_pfn_cnt - 1; 2056 probe_lo = 0; 2057 probe_pos = (probe_hi + probe_lo) / 2; 2058 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 2059 if (probe_pos == probe_lo) { /* pfn not in list */ 2060 probe_pos = -1; 2061 break; 2062 } 2063 if (pfn_to_mfn(probe_pfn) <= oldmfn) 2064 probe_lo = probe_pos; 2065 else 2066 probe_hi = probe_pos; 2067 probe_pos = (probe_hi + probe_lo) / 2; 2068 } 2069 if (probe_pos >= 0) { /* remove pfn fom list */ 2070 contig_pfn_cnt--; 2071 ovbcopy(&contig_pfn_list[probe_pos + 1], 2072 &contig_pfn_list[probe_pos], 2073 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 2074 } 2075 if (newmfn == MFN_INVALID) 2076 goto done; 2077 /* 2078 * Check if new mfn has adjacent mfns in the list 2079 */ 2080 probe_hi = contig_pfn_cnt - 1; 2081 probe_lo = 0; 2082 insert_after = -2; 2083 do { 2084 probe_pos = (probe_hi + probe_lo) / 2; 2085 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 2086 if (newmfn == probe_mfn + 1) 2087 insert_after = probe_pos; 2088 else if (newmfn == probe_mfn - 1) 2089 insert_after = probe_pos - 1; 2090 if (probe_pos == probe_lo) 2091 break; 2092 if (probe_mfn <= newmfn) 2093 probe_lo = probe_pos; 2094 else 2095 probe_hi = probe_pos; 2096 } while (insert_after == -2); 2097 /* 2098 * If there is space in 
the list and there are adjacent mfns 2099 * insert the pfn in to its proper place in the list. 2100 */ 2101 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 2102 insert_point = insert_after + 1; 2103 ovbcopy(&contig_pfn_list[insert_point], 2104 &contig_pfn_list[insert_point + 1], 2105 (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 2106 contig_pfn_list[insert_point] = pfn; 2107 contig_pfn_cnt++; 2108 } 2109 done: 2110 if (drop_lock) 2111 mutex_exit(&contig_list_lock); 2112 } 2113 2114 /* 2115 * Called to (re-)populate the io_pool from the free page lists. 2116 */ 2117 long 2118 populate_io_pool(void) 2119 { 2120 pfn_t pfn; 2121 mfn_t mfn, max_mfn; 2122 page_t *pp; 2123 2124 /* 2125 * Figure out the bounds of the pool on first invocation. 2126 * We use a percentage of memory for the io pool size. 2127 * we allow that to shrink, but not to less than a fixed minimum 2128 */ 2129 if (io_pool_cnt_max == 0) { 2130 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 2131 io_pool_cnt_lowater = io_pool_cnt_max; 2132 /* 2133 * This is the first time in populate_io_pool, grab a va to use 2134 * when we need to allocate pages. 2135 */ 2136 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 2137 } 2138 /* 2139 * If we are out of pages in the pool, then grow the size of the pool 2140 */ 2141 if (io_pool_cnt == 0) { 2142 /* 2143 * Grow the max size of the io pool by 5%, but never more than 2144 * 25% of physical memory. 2145 */ 2146 if (io_pool_cnt_max < physmem / 4) 2147 io_pool_cnt_max += io_pool_cnt_max / 20; 2148 } 2149 io_pool_grows++; /* should be a kstat? */ 2150 2151 /* 2152 * Get highest mfn on this platform, but limit to the 32 bit DMA max. 
2153 */ 2154 (void) mfn_to_pfn(start_mfn); 2155 max_mfn = MIN(cached_max_mfn, PFN_4GIG); 2156 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 2157 pfn = mfn_to_pfn(mfn); 2158 if (pfn & PFN_IS_FOREIGN_MFN) 2159 continue; 2160 /* 2161 * try to allocate it from free pages 2162 */ 2163 pp = page_numtopp_alloc(pfn); 2164 if (pp == NULL) 2165 continue; 2166 PP_CLRFREE(pp); 2167 add_page_to_pool(pp, 1); 2168 if (io_pool_cnt >= io_pool_cnt_max) 2169 break; 2170 } 2171 2172 return (io_pool_cnt); 2173 } 2174 2175 /* 2176 * Destroy a page that was being used for DMA I/O. It may or 2177 * may not actually go back to the io_pool. 2178 */ 2179 void 2180 page_destroy_io(page_t *pp) 2181 { 2182 mfn_t mfn = mfn_list[pp->p_pagenum]; 2183 2184 /* 2185 * When the page was alloc'd a reservation was made, release it now 2186 */ 2187 page_unresv(1); 2188 /* 2189 * Unload translations, if any, then hash out the 2190 * page to erase its identity. 2191 */ 2192 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2193 page_hashout(pp, NULL); 2194 2195 /* 2196 * If the page came from the free lists, just put it back to them. 2197 * DomU pages always go on the free lists as well. 2198 */ 2199 if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 2200 page_free(pp, 1); 2201 return; 2202 } 2203 2204 add_page_to_pool(pp, 0); 2205 } 2206 2207 2208 long contig_searches; /* count of times contig pages requested */ 2209 long contig_search_restarts; /* count of contig ranges tried */ 2210 long contig_search_failed; /* count of contig alloc failures */ 2211 2212 /* 2213 * Look thru the contiguous pfns that are not part of the io_pool for 2214 * contiguous free pages. Return a list of the found pages or NULL. 
2215 */ 2216 page_t * 2217 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg) 2218 { 2219 page_t *pp, *plist = NULL; 2220 mfn_t mfn, prev_mfn, start_mfn; 2221 pfn_t pfn; 2222 int pages_needed, pages_requested; 2223 int search_start; 2224 2225 /* 2226 * create the contig pfn list if not already done 2227 */ 2228 retry: 2229 mutex_enter(&contig_list_lock); 2230 if (contig_pfn_list == NULL) { 2231 mutex_exit(&contig_list_lock); 2232 if (!create_contig_pfnlist(flags)) { 2233 return (NULL); 2234 } 2235 goto retry; 2236 } 2237 contig_searches++; 2238 /* 2239 * Search contiguous pfn list for physically contiguous pages not in 2240 * the io_pool. Start the search where the last search left off. 2241 */ 2242 pages_requested = pages_needed = npages; 2243 search_start = next_alloc_pfn; 2244 start_mfn = prev_mfn = 0; 2245 while (pages_needed) { 2246 pfn = contig_pfn_list[next_alloc_pfn]; 2247 mfn = pfn_to_mfn(pfn); 2248 /* 2249 * Check if mfn is first one or contig to previous one and 2250 * if page corresponding to mfn is free and that mfn 2251 * range is not crossing a segment boundary. 2252 */ 2253 if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 2254 (pp = page_numtopp_alloc(pfn)) != NULL && 2255 !((mfn & pfnseg) < (start_mfn & pfnseg))) { 2256 PP_CLRFREE(pp); 2257 page_io_pool_add(&plist, pp); 2258 pages_needed--; 2259 if (prev_mfn == 0) 2260 start_mfn = mfn; 2261 prev_mfn = mfn; 2262 } else { 2263 contig_search_restarts++; 2264 /* 2265 * free partial page list 2266 */ 2267 while (plist != NULL) { 2268 pp = plist; 2269 page_io_pool_sub(&plist, pp, pp); 2270 page_free(pp, 1); 2271 } 2272 pages_needed = pages_requested; 2273 start_mfn = prev_mfn = 0; 2274 } 2275 if (++next_alloc_pfn == contig_pfn_cnt) 2276 next_alloc_pfn = 0; 2277 if (next_alloc_pfn == search_start) 2278 break; /* all pfns searched */ 2279 } 2280 mutex_exit(&contig_list_lock); 2281 if (pages_needed) { 2282 contig_search_failed++; 2283 /* 2284 * Failed to find enough contig pages. 
2285 * free partial page list 2286 */ 2287 while (plist != NULL) { 2288 pp = plist; 2289 page_io_pool_sub(&plist, pp, pp); 2290 page_free(pp, 1); 2291 } 2292 } 2293 return (plist); 2294 } 2295 2296 /* 2297 * Search the reserved io pool pages for a page range with the 2298 * desired characteristics. 2299 */ 2300 page_t * 2301 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg) 2302 { 2303 page_t *pp_first, *pp_last; 2304 page_t *pp, **poolp; 2305 pgcnt_t nwanted, pfnalign; 2306 uint64_t pfnseg; 2307 mfn_t mfn, tmfn, hi_mfn, lo_mfn; 2308 int align, attempt = 0; 2309 2310 if (minctg == 1) 2311 contig = 0; 2312 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2313 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2314 pfnseg = mmu_btop(mattr->dma_attr_seg); 2315 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2316 if (align > MMU_PAGESIZE) 2317 pfnalign = mmu_btop(align); 2318 else 2319 pfnalign = 0; 2320 2321 try_again: 2322 /* 2323 * See if we want pages for a legacy device 2324 */ 2325 if (hi_mfn < PFN_16MEG) 2326 poolp = &io_pool_16m; 2327 else 2328 poolp = &io_pool_4g; 2329 try_smaller: 2330 /* 2331 * Take pages from I/O pool. We'll use pages from the highest 2332 * MFN range possible. 
2333 */ 2334 pp_first = pp_last = NULL; 2335 mutex_enter(&io_pool_lock); 2336 nwanted = minctg; 2337 for (pp = *poolp; pp && nwanted > 0; ) { 2338 pp = pp->p_prev; 2339 2340 /* 2341 * skip pages above allowable range 2342 */ 2343 mfn = mfn_list[pp->p_pagenum]; 2344 if (hi_mfn < mfn) 2345 goto skip; 2346 2347 /* 2348 * stop at pages below allowable range 2349 */ 2350 if (lo_mfn > mfn) 2351 break; 2352 restart: 2353 if (pp_last == NULL) { 2354 /* 2355 * Check alignment 2356 */ 2357 tmfn = mfn - (minctg - 1); 2358 if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign)) 2359 goto skip; /* not properly aligned */ 2360 /* 2361 * Check segment 2362 */ 2363 if ((mfn & pfnseg) < (tmfn & pfnseg)) 2364 goto skip; /* crosses seg boundary */ 2365 /* 2366 * Start building page list 2367 */ 2368 pp_first = pp_last = pp; 2369 nwanted--; 2370 } else { 2371 /* 2372 * check physical contiguity if required 2373 */ 2374 if (contig && 2375 mfn_list[pp_first->p_pagenum] != mfn + 1) { 2376 /* 2377 * not a contiguous page, restart list. 2378 */ 2379 pp_last = NULL; 2380 nwanted = minctg; 2381 goto restart; 2382 } else { /* add page to list */ 2383 pp_first = pp; 2384 nwanted--; 2385 } 2386 } 2387 skip: 2388 if (pp == *poolp) 2389 break; 2390 } 2391 2392 /* 2393 * If we didn't find memory. Try the more constrained pool, then 2394 * sweep free pages into the DMA pool and try again. 2395 */ 2396 if (nwanted != 0) { 2397 mutex_exit(&io_pool_lock); 2398 /* 2399 * If we were looking in the less constrained pool and 2400 * didn't find pages, try the more constrained pool. 
2401 */ 2402 if (poolp == &io_pool_4g) { 2403 poolp = &io_pool_16m; 2404 goto try_smaller; 2405 } 2406 kmem_reap(); 2407 if (++attempt < 4) { 2408 /* 2409 * Grab some more io_pool pages 2410 */ 2411 (void) populate_io_pool(); 2412 goto try_again; /* go around and retry */ 2413 } 2414 return (NULL); 2415 } 2416 /* 2417 * Found the pages, now snip them from the list 2418 */ 2419 page_io_pool_sub(poolp, pp_first, pp_last); 2420 io_pool_cnt -= minctg; 2421 /* 2422 * reset low water mark 2423 */ 2424 if (io_pool_cnt < io_pool_cnt_lowater) 2425 io_pool_cnt_lowater = io_pool_cnt; 2426 mutex_exit(&io_pool_lock); 2427 return (pp_first); 2428 } 2429 2430 page_t * 2431 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr, 2432 ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg) 2433 { 2434 uint_t kflags; 2435 int order, extra, extpages, i, contig, nbits, extents; 2436 page_t *pp, *expp, *pp_first, **pplist = NULL; 2437 mfn_t *mfnlist = NULL; 2438 2439 contig = flags & PG_PHYSCONTIG; 2440 if (minctg == 1) 2441 contig = 0; 2442 flags &= ~PG_PHYSCONTIG; 2443 kflags = flags & PG_WAIT ? 
KM_SLEEP : KM_NOSLEEP; 2444 /* 2445 * Hypervisor will allocate extents, if we want contig 2446 * pages extent must be >= minctg 2447 */ 2448 if (contig) { 2449 order = highbit(minctg) - 1; 2450 if (minctg & ((1 << order) - 1)) 2451 order++; 2452 extpages = 1 << order; 2453 } else { 2454 order = 0; 2455 extpages = minctg; 2456 } 2457 if (extpages > minctg) { 2458 extra = extpages - minctg; 2459 if (!page_resv(extra, kflags)) 2460 return (NULL); 2461 } 2462 pp_first = NULL; 2463 pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 2464 if (pplist == NULL) 2465 goto balloon_fail; 2466 mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 2467 if (mfnlist == NULL) 2468 goto balloon_fail; 2469 pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr); 2470 if (pp == NULL) 2471 goto balloon_fail; 2472 pp_first = pp; 2473 if (extpages > minctg) { 2474 /* 2475 * fill out the rest of extent pages to swap 2476 * with the hypervisor 2477 */ 2478 for (i = 0; i < extra; i++) { 2479 expp = page_create_va(vp, 2480 (u_offset_t)(uintptr_t)io_pool_kva, 2481 PAGESIZE, flags, &kvseg, io_pool_kva); 2482 if (expp == NULL) 2483 goto balloon_fail; 2484 (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 2485 page_io_unlock(expp); 2486 page_hashout(expp, NULL); 2487 page_io_lock(expp); 2488 /* 2489 * add page to end of list 2490 */ 2491 expp->p_prev = pp_first->p_prev; 2492 expp->p_next = pp_first; 2493 expp->p_prev->p_next = expp; 2494 pp_first->p_prev = expp; 2495 } 2496 2497 } 2498 for (i = 0; i < extpages; i++) { 2499 pplist[i] = pp; 2500 pp = pp->p_next; 2501 } 2502 nbits = highbit(mattr->dma_attr_addr_hi); 2503 extents = contig ? 
1 : minctg; 2504 if (balloon_replace_pages(extents, pplist, nbits, order, 2505 mfnlist) != extents) { 2506 if (ioalloc_dbg) 2507 cmn_err(CE_NOTE, "request to hypervisor" 2508 " for %d pages, maxaddr %" PRIx64 " failed", 2509 extpages, mattr->dma_attr_addr_hi); 2510 goto balloon_fail; 2511 } 2512 2513 kmem_free(pplist, extpages * sizeof (page_t *)); 2514 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2515 /* 2516 * Return any excess pages to free list 2517 */ 2518 if (extpages > minctg) { 2519 for (i = 0; i < extra; i++) { 2520 pp = pp_first->p_prev; 2521 page_sub(&pp_first, pp); 2522 page_io_unlock(pp); 2523 page_unresv(1); 2524 page_free(pp, 1); 2525 } 2526 } 2527 return (pp_first); 2528 balloon_fail: 2529 /* 2530 * Return pages to free list and return failure 2531 */ 2532 while (pp_first != NULL) { 2533 pp = pp_first; 2534 page_sub(&pp_first, pp); 2535 page_io_unlock(pp); 2536 if (pp->p_vnode != NULL) 2537 page_hashout(pp, NULL); 2538 page_free(pp, 1); 2539 } 2540 if (pplist) 2541 kmem_free(pplist, extpages * sizeof (page_t *)); 2542 if (mfnlist) 2543 kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2544 page_unresv(extpages - minctg); 2545 return (NULL); 2546 } 2547 2548 static void 2549 return_partial_alloc(page_t *plist) 2550 { 2551 page_t *pp; 2552 2553 while (plist != NULL) { 2554 pp = plist; 2555 page_sub(&plist, pp); 2556 page_io_unlock(pp); 2557 page_destroy_io(pp); 2558 } 2559 } 2560 2561 static page_t * 2562 page_get_contigpages( 2563 struct vnode *vp, 2564 u_offset_t off, 2565 int *npagesp, 2566 uint_t flags, 2567 caddr_t vaddr, 2568 ddi_dma_attr_t *mattr) 2569 { 2570 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2571 page_t *plist; /* list to return */ 2572 page_t *pp, *mcpl; 2573 int contig, anyaddr, npages, getone = 0; 2574 mfn_t lo_mfn; 2575 mfn_t hi_mfn; 2576 pgcnt_t pfnalign = 0; 2577 int align, sgllen; 2578 uint64_t pfnseg; 2579 pgcnt_t minctg; 2580 2581 npages = *npagesp; 2582 ASSERT(mattr != NULL); 2583 lo_mfn = 
mmu_btop(mattr->dma_attr_addr_lo); 2584 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2585 sgllen = mattr->dma_attr_sgllen; 2586 pfnseg = mmu_btop(mattr->dma_attr_seg); 2587 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2588 if (align > MMU_PAGESIZE) 2589 pfnalign = mmu_btop(align); 2590 2591 /* 2592 * Clear the contig flag if only one page is needed. 2593 */ 2594 contig = flags & PG_PHYSCONTIG; 2595 if (npages == 1) { 2596 getone = 1; 2597 contig = 0; 2598 } 2599 2600 /* 2601 * Check if any page in the system is fine. 2602 */ 2603 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 2604 if (!contig && anyaddr) { 2605 flags &= ~PG_PHYSCONTIG; 2606 plist = page_create_va(vp, off, npages * MMU_PAGESIZE, 2607 flags, &kvseg, vaddr); 2608 if (plist != NULL) { 2609 *npagesp = 0; 2610 return (plist); 2611 } 2612 } 2613 plist = NULL; 2614 minctg = howmany(npages, sgllen); 2615 while (npages > sgllen || getone) { 2616 if (minctg > npages) 2617 minctg = npages; 2618 mcpl = NULL; 2619 /* 2620 * We could just want unconstrained but contig pages. 2621 */ 2622 if (anyaddr && contig) { 2623 /* 2624 * Look for free contig pages to satisfy the request. 
2625 */ 2626 mcpl = find_contig_free(minctg, flags, pfnseg); 2627 } 2628 /* 2629 * Try the reserved io pools next 2630 */ 2631 if (mcpl == NULL) 2632 mcpl = page_io_pool_alloc(mattr, contig, minctg); 2633 if (mcpl != NULL) { 2634 pp = mcpl; 2635 do { 2636 if (!page_hashin(pp, vp, off, NULL)) { 2637 panic("page_get_contigpages:" 2638 " hashin failed" 2639 " pp %p, vp %p, off %llx", 2640 (void *)pp, (void *)vp, off); 2641 } 2642 off += MMU_PAGESIZE; 2643 PP_CLRFREE(pp); 2644 PP_CLRAGED(pp); 2645 page_set_props(pp, P_REF); 2646 page_io_lock(pp); 2647 pp = pp->p_next; 2648 } while (pp != mcpl); 2649 } else { 2650 /* 2651 * Hypervisor exchange doesn't handle segment or 2652 * alignment constraints 2653 */ 2654 if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || 2655 pfnalign) 2656 goto fail; 2657 /* 2658 * Try exchanging pages with the hypervisor 2659 */ 2660 mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr, 2661 flags, minctg); 2662 if (mcpl == NULL) 2663 goto fail; 2664 off += minctg * MMU_PAGESIZE; 2665 } 2666 check_dma(mattr, mcpl, minctg); 2667 /* 2668 * Here with a minctg run of contiguous pages, add them to the 2669 * list we will return for this request. 2670 */ 2671 page_list_concat(&plist, &mcpl); 2672 npages -= minctg; 2673 *npagesp = npages; 2674 sgllen--; 2675 if (getone) 2676 break; 2677 } 2678 return (plist); 2679 fail: 2680 return_partial_alloc(plist); 2681 return (NULL); 2682 } 2683 2684 /* 2685 * Allocator for domain 0 I/O pages. We match the required 2686 * DMA attributes and contiguity constraints. 
2687 */ 2688 /*ARGSUSED*/ 2689 page_t * 2690 page_create_io( 2691 struct vnode *vp, 2692 u_offset_t off, 2693 uint_t bytes, 2694 uint_t flags, 2695 struct as *as, 2696 caddr_t vaddr, 2697 ddi_dma_attr_t *mattr) 2698 { 2699 page_t *plist = NULL, *pp; 2700 int npages = 0, contig, anyaddr, pages_req; 2701 mfn_t lo_mfn; 2702 mfn_t hi_mfn; 2703 pgcnt_t pfnalign = 0; 2704 int align; 2705 int is_domu = 0; 2706 int dummy, bytes_got; 2707 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 2708 2709 ASSERT(mattr != NULL); 2710 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2711 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 2712 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2713 if (align > MMU_PAGESIZE) 2714 pfnalign = mmu_btop(align); 2715 2716 /* 2717 * Clear the contig flag if only one page is needed or the scatter 2718 * gather list length is >= npages. 2719 */ 2720 pages_req = npages = mmu_btopr(bytes); 2721 contig = (flags & PG_PHYSCONTIG); 2722 bytes = P2ROUNDUP(bytes, MMU_PAGESIZE); 2723 if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages) 2724 contig = 0; 2725 2726 /* 2727 * Check if any old page in the system is fine. 2728 * DomU should always go down this path. 
2729 */ 2730 is_domu = !DOMAIN_IS_INITDOMAIN(xen_info); 2731 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign; 2732 if ((!contig && anyaddr) || is_domu) { 2733 flags &= ~PG_PHYSCONTIG; 2734 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr); 2735 if (plist != NULL) 2736 return (plist); 2737 else if (is_domu) 2738 return (NULL); /* no memory available */ 2739 } 2740 /* 2741 * DomU should never reach here 2742 */ 2743 if (contig) { 2744 plist = page_get_contigpages(vp, off, &npages, flags, vaddr, 2745 mattr); 2746 if (plist == NULL) 2747 goto fail; 2748 bytes_got = (pages_req - npages) << MMU_PAGESHIFT; 2749 vaddr += bytes_got; 2750 off += bytes_got; 2751 /* 2752 * We now have all the contiguous pages we need, but 2753 * we may still need additional non-contiguous pages. 2754 */ 2755 } 2756 /* 2757 * now loop collecting the requested number of pages, these do 2758 * not have to be contiguous pages but we will use the contig 2759 * page alloc code to get the pages since it will honor any 2760 * other constraints the pages may have. 2761 */ 2762 while (npages--) { 2763 dummy = 1; 2764 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr); 2765 if (pp == NULL) 2766 goto fail; 2767 page_add(&plist, pp); 2768 vaddr += MMU_PAGESIZE; 2769 off += MMU_PAGESIZE; 2770 } 2771 return (plist); 2772 fail: 2773 /* 2774 * Failed to get enough pages, return ones we did get 2775 */ 2776 return_partial_alloc(plist); 2777 return (NULL); 2778 } 2779 2780 /* 2781 * Lock and return the page with the highest mfn that we can find. last_mfn 2782 * holds the last one found, so the next search can start from there. We 2783 * also keep a counter so that we don't loop forever if the machine has no 2784 * free pages. 2785 * 2786 * This is called from the balloon thread to find pages to give away. new_high 2787 * is used when new mfn's have been added to the system - we will reset our 2788 * search if the new mfn's are higher than our current search position. 
2789 */ 2790 page_t * 2791 page_get_high_mfn(mfn_t new_high) 2792 { 2793 static mfn_t last_mfn = 0; 2794 pfn_t pfn; 2795 page_t *pp; 2796 ulong_t loop_count = 0; 2797 2798 if (new_high > last_mfn) 2799 last_mfn = new_high; 2800 2801 for (; loop_count < mfn_count; loop_count++, last_mfn--) { 2802 if (last_mfn == 0) { 2803 last_mfn = cached_max_mfn; 2804 } 2805 2806 pfn = mfn_to_pfn(last_mfn); 2807 if (pfn & PFN_IS_FOREIGN_MFN) 2808 continue; 2809 2810 /* See if the page is free. If so, lock it. */ 2811 pp = page_numtopp_alloc(pfn); 2812 if (pp == NULL) 2813 continue; 2814 PP_CLRFREE(pp); 2815 2816 ASSERT(PAGE_EXCL(pp)); 2817 ASSERT(pp->p_vnode == NULL); 2818 ASSERT(!hat_page_is_mapped(pp)); 2819 last_mfn--; 2820 return (pp); 2821 } 2822 return (NULL); 2823 } 2824 2825 #else /* !__xpv */ 2826 2827 /* 2828 * get a page from any list with the given mnode 2829 */ 2830 static page_t * 2831 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 2832 int mnode, int mtype, ddi_dma_attr_t *dma_attr) 2833 { 2834 kmutex_t *pcm; 2835 int i; 2836 page_t *pp; 2837 page_t *first_pp; 2838 uint64_t pgaddr; 2839 ulong_t bin; 2840 int mtypestart; 2841 int plw_initialized; 2842 page_list_walker_t plw; 2843 2844 VM_STAT_ADD(pga_vmstats.pgma_alloc); 2845 2846 ASSERT((flags & PG_MATCH_COLOR) == 0); 2847 ASSERT(szc == 0); 2848 ASSERT(dma_attr != NULL); 2849 2850 MTYPE_START(mnode, mtype, flags); 2851 if (mtype < 0) { 2852 VM_STAT_ADD(pga_vmstats.pgma_allocempty); 2853 return (NULL); 2854 } 2855 2856 mtypestart = mtype; 2857 2858 bin = origbin; 2859 2860 /* 2861 * check up to page_colors + 1 bins - origbin may be checked twice 2862 * because of BIN_STEP skip 2863 */ 2864 do { 2865 plw_initialized = 0; 2866 2867 for (plw.plw_count = 0; 2868 plw.plw_count < page_colors; plw.plw_count++) { 2869 2870 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 2871 goto nextfreebin; 2872 2873 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2874 mutex_enter(pcm); 2875 pp = PAGE_FREELISTS(mnode, 
szc, bin, mtype); 2876 first_pp = pp; 2877 while (pp != NULL) { 2878 if (page_trylock(pp, SE_EXCL) == 0) { 2879 pp = pp->p_next; 2880 if (pp == first_pp) { 2881 pp = NULL; 2882 } 2883 continue; 2884 } 2885 2886 ASSERT(PP_ISFREE(pp)); 2887 ASSERT(PP_ISAGED(pp)); 2888 ASSERT(pp->p_vnode == NULL); 2889 ASSERT(pp->p_hash == NULL); 2890 ASSERT(pp->p_offset == (u_offset_t)-1); 2891 ASSERT(pp->p_szc == szc); 2892 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2893 /* check if page within DMA attributes */ 2894 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 2895 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 2896 (pgaddr + MMU_PAGESIZE - 1 <= 2897 dma_attr->dma_attr_addr_hi)) { 2898 break; 2899 } 2900 2901 /* continue looking */ 2902 page_unlock(pp); 2903 pp = pp->p_next; 2904 if (pp == first_pp) 2905 pp = NULL; 2906 2907 } 2908 if (pp != NULL) { 2909 ASSERT(mtype == PP_2_MTYPE(pp)); 2910 ASSERT(pp->p_szc == 0); 2911 2912 /* found a page with specified DMA attributes */ 2913 page_sub(&PAGE_FREELISTS(mnode, szc, bin, 2914 mtype), pp); 2915 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2916 2917 if ((PP_ISFREE(pp) == 0) || 2918 (PP_ISAGED(pp) == 0)) { 2919 cmn_err(CE_PANIC, "page %p is not free", 2920 (void *)pp); 2921 } 2922 2923 mutex_exit(pcm); 2924 check_dma(dma_attr, pp, 1); 2925 VM_STAT_ADD(pga_vmstats.pgma_allocok); 2926 return (pp); 2927 } 2928 mutex_exit(pcm); 2929 nextfreebin: 2930 if (plw_initialized == 0) { 2931 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 2932 ASSERT(plw.plw_ceq_dif == page_colors); 2933 plw_initialized = 1; 2934 } 2935 2936 if (plw.plw_do_split) { 2937 pp = page_freelist_split(szc, bin, mnode, 2938 mtype, 2939 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 2940 &plw); 2941 if (pp != NULL) 2942 return (pp); 2943 } 2944 2945 bin = page_list_walk_next_bin(szc, bin, &plw); 2946 } 2947 2948 MTYPE_NEXT(mnode, mtype, flags); 2949 } while (mtype >= 0); 2950 2951 /* failed to find a page in the freelist; try it in the cachelist */ 2952 2953 /* reset mtype 
start for cachelist search */ 2954 mtype = mtypestart; 2955 ASSERT(mtype >= 0); 2956 2957 /* start with the bin of matching color */ 2958 bin = origbin; 2959 2960 do { 2961 for (i = 0; i <= page_colors; i++) { 2962 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 2963 goto nextcachebin; 2964 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 2965 mutex_enter(pcm); 2966 pp = PAGE_CACHELISTS(mnode, bin, mtype); 2967 first_pp = pp; 2968 while (pp != NULL) { 2969 if (page_trylock(pp, SE_EXCL) == 0) { 2970 pp = pp->p_next; 2971 if (pp == first_pp) 2972 break; 2973 continue; 2974 } 2975 ASSERT(pp->p_vnode); 2976 ASSERT(PP_ISAGED(pp) == 0); 2977 ASSERT(pp->p_szc == 0); 2978 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2979 2980 /* check if page within DMA attributes */ 2981 2982 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 2983 if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 2984 (pgaddr + MMU_PAGESIZE - 1 <= 2985 dma_attr->dma_attr_addr_hi)) { 2986 break; 2987 } 2988 2989 /* continue looking */ 2990 page_unlock(pp); 2991 pp = pp->p_next; 2992 if (pp == first_pp) 2993 pp = NULL; 2994 } 2995 2996 if (pp != NULL) { 2997 ASSERT(mtype == PP_2_MTYPE(pp)); 2998 ASSERT(pp->p_szc == 0); 2999 3000 /* found a page with specified DMA attributes */ 3001 page_sub(&PAGE_CACHELISTS(mnode, bin, 3002 mtype), pp); 3003 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3004 3005 mutex_exit(pcm); 3006 ASSERT(pp->p_vnode); 3007 ASSERT(PP_ISAGED(pp) == 0); 3008 check_dma(dma_attr, pp, 1); 3009 VM_STAT_ADD(pga_vmstats.pgma_allocok); 3010 return (pp); 3011 } 3012 mutex_exit(pcm); 3013 nextcachebin: 3014 bin += (i == 0) ? 
BIN_STEP : 1; 3015 bin &= page_colors_mask; 3016 } 3017 MTYPE_NEXT(mnode, mtype, flags); 3018 } while (mtype >= 0); 3019 3020 VM_STAT_ADD(pga_vmstats.pgma_allocfailed); 3021 return (NULL); 3022 } 3023 3024 /* 3025 * This function is similar to page_get_freelist()/page_get_cachelist() 3026 * but it searches both the lists to find a page with the specified 3027 * color (or no color) and DMA attributes. The search is done in the 3028 * freelist first and then in the cache list within the highest memory 3029 * range (based on DMA attributes) before searching in the lower 3030 * memory ranges. 3031 * 3032 * Note: This function is called only by page_create_io(). 3033 */ 3034 /*ARGSUSED*/ 3035 static page_t * 3036 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, 3037 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp) 3038 { 3039 uint_t bin; 3040 int mtype; 3041 page_t *pp; 3042 int n; 3043 int m; 3044 int szc; 3045 int fullrange; 3046 int mnode; 3047 int local_failed_stat = 0; 3048 lgrp_mnode_cookie_t lgrp_cookie; 3049 3050 VM_STAT_ADD(pga_vmstats.pga_alloc); 3051 3052 /* only base pagesize currently supported */ 3053 if (size != MMU_PAGESIZE) 3054 return (NULL); 3055 3056 /* 3057 * If we're passed a specific lgroup, we use it. Otherwise, 3058 * assume first-touch placement is desired. 3059 */ 3060 if (!LGRP_EXISTS(lgrp)) 3061 lgrp = lgrp_home_lgrp(); 3062 3063 /* LINTED */ 3064 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3065 3066 /* 3067 * Only hold one freelist or cachelist lock at a time, that way we 3068 * can start anywhere and not have to worry about lock 3069 * ordering. 3070 */ 3071 if (dma_attr == NULL) { 3072 n = 0; 3073 m = mnoderangecnt - 1; 3074 fullrange = 1; 3075 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); 3076 } else { 3077 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo); 3078 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi); 3079 3080 /* 3081 * We can guarantee alignment only for page boundary. 
3082 */ 3083 if (dma_attr->dma_attr_align > MMU_PAGESIZE) 3084 return (NULL); 3085 3086 n = pfn_2_mtype(pfnlo); 3087 m = pfn_2_mtype(pfnhi); 3088 3089 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) && 3090 (pfnhi >= mnoderanges[m].mnr_pfnhi)); 3091 } 3092 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); 3093 3094 if (n > m) 3095 return (NULL); 3096 3097 szc = 0; 3098 3099 /* cylcing thru mtype handled by RANGE0 if n == 0 */ 3100 if (n == 0) { 3101 flags |= PGI_MT_RANGE0; 3102 n = m; 3103 } 3104 3105 /* 3106 * Try local memory node first, but try remote if we can't 3107 * get a page of the right color. 3108 */ 3109 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER); 3110 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3111 /* 3112 * allocate pages from high pfn to low. 3113 */ 3114 for (mtype = m; mtype >= n; mtype--) { 3115 if (fullrange != 0) { 3116 pp = page_get_mnode_freelist(mnode, 3117 bin, mtype, szc, flags); 3118 if (pp == NULL) { 3119 pp = page_get_mnode_cachelist( 3120 bin, flags, mnode, mtype); 3121 } 3122 } else { 3123 pp = page_get_mnode_anylist(bin, szc, 3124 flags, mnode, mtype, dma_attr); 3125 } 3126 if (pp != NULL) { 3127 VM_STAT_ADD(pga_vmstats.pga_allocok); 3128 check_dma(dma_attr, pp, 1); 3129 return (pp); 3130 } 3131 } 3132 if (!local_failed_stat) { 3133 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3134 local_failed_stat = 1; 3135 } 3136 } 3137 VM_STAT_ADD(pga_vmstats.pga_allocfailed); 3138 3139 return (NULL); 3140 } 3141 3142 /* 3143 * page_create_io() 3144 * 3145 * This function is a copy of page_create_va() with an additional 3146 * argument 'mattr' that specifies DMA memory requirements to 3147 * the page list functions. This function is used by the segkmem 3148 * allocator so it is only to create new pages (i.e PG_EXCL is 3149 * set). 
3150 * 3151 * Note: This interface is currently used by x86 PSM only and is 3152 * not fully specified so the commitment level is only for 3153 * private interface specific to x86. This interface uses PSM 3154 * specific page_get_anylist() interface. 3155 */ 3156 3157 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 3158 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 3159 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 3160 break; \ 3161 } \ 3162 } 3163 3164 3165 page_t * 3166 page_create_io( 3167 struct vnode *vp, 3168 u_offset_t off, 3169 uint_t bytes, 3170 uint_t flags, 3171 struct as *as, 3172 caddr_t vaddr, 3173 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 3174 { 3175 page_t *plist = NULL; 3176 uint_t plist_len = 0; 3177 pgcnt_t npages; 3178 page_t *npp = NULL; 3179 uint_t pages_req; 3180 page_t *pp; 3181 kmutex_t *phm = NULL; 3182 uint_t index; 3183 3184 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 3185 "page_create_start:vp %p off %llx bytes %u flags %x", 3186 vp, off, bytes, flags); 3187 3188 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 3189 3190 pages_req = npages = mmu_btopr(bytes); 3191 3192 /* 3193 * Do the freemem and pcf accounting. 3194 */ 3195 if (!page_create_wait(npages, flags)) { 3196 return (NULL); 3197 } 3198 3199 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 3200 "page_create_success:vp %p off %llx", vp, off); 3201 3202 /* 3203 * If satisfying this request has left us with too little 3204 * memory, start the wheels turning to get some back. The 3205 * first clause of the test prevents waking up the pageout 3206 * daemon in situations where it would decide that there's 3207 * nothing to do. 
3208 */ 3209 if (nscan < desscan && freemem < minfree) { 3210 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 3211 "pageout_cv_signal:freemem %ld", freemem); 3212 cv_signal(&proc_pageout->p_cv); 3213 } 3214 3215 if (flags & PG_PHYSCONTIG) { 3216 3217 plist = page_get_contigpage(&npages, mattr, 1); 3218 if (plist == NULL) { 3219 page_create_putback(npages); 3220 return (NULL); 3221 } 3222 3223 pp = plist; 3224 3225 do { 3226 if (!page_hashin(pp, vp, off, NULL)) { 3227 panic("pg_creat_io: hashin failed %p %p %llx", 3228 (void *)pp, (void *)vp, off); 3229 } 3230 VM_STAT_ADD(page_create_new); 3231 off += MMU_PAGESIZE; 3232 PP_CLRFREE(pp); 3233 PP_CLRAGED(pp); 3234 page_set_props(pp, P_REF); 3235 pp = pp->p_next; 3236 } while (pp != plist); 3237 3238 if (!npages) { 3239 check_dma(mattr, plist, pages_req); 3240 return (plist); 3241 } else { 3242 vaddr += (pages_req - npages) << MMU_PAGESHIFT; 3243 } 3244 3245 /* 3246 * fall-thru: 3247 * 3248 * page_get_contigpage returns when npages <= sgllen. 3249 * Grab the rest of the non-contig pages below from anylist. 3250 */ 3251 } 3252 3253 /* 3254 * Loop around collecting the requested number of pages. 3255 * Most of the time, we have to `create' a new page. With 3256 * this in mind, pull the page off the free list before 3257 * getting the hash lock. This will minimize the hash 3258 * lock hold time, nesting, and the like. If it turns 3259 * out we don't need the page, we put it back at the end. 3260 */ 3261 while (npages--) { 3262 phm = NULL; 3263 3264 index = PAGE_HASH_FUNC(vp, off); 3265 top: 3266 ASSERT(phm == NULL); 3267 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 3268 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3269 3270 if (npp == NULL) { 3271 /* 3272 * Try to get the page of any color either from 3273 * the freelist or from the cache list. 
3274 */ 3275 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE, 3276 flags & ~PG_MATCH_COLOR, mattr, NULL); 3277 if (npp == NULL) { 3278 if (mattr == NULL) { 3279 /* 3280 * Not looking for a special page; 3281 * panic! 3282 */ 3283 panic("no page found %d", (int)npages); 3284 } 3285 /* 3286 * No page found! This can happen 3287 * if we are looking for a page 3288 * within a specific memory range 3289 * for DMA purposes. If PG_WAIT is 3290 * specified then we wait for a 3291 * while and then try again. The 3292 * wait could be forever if we 3293 * don't get the page(s) we need. 3294 * 3295 * Note: XXX We really need a mechanism 3296 * to wait for pages in the desired 3297 * range. For now, we wait for any 3298 * pages and see if we can use it. 3299 */ 3300 3301 if ((mattr != NULL) && (flags & PG_WAIT)) { 3302 delay(10); 3303 goto top; 3304 } 3305 goto fail; /* undo accounting stuff */ 3306 } 3307 3308 if (PP_ISAGED(npp) == 0) { 3309 /* 3310 * Since this page came from the 3311 * cachelist, we must destroy the 3312 * old vnode association. 3313 */ 3314 page_hashout(npp, (kmutex_t *)NULL); 3315 } 3316 } 3317 3318 /* 3319 * We own this page! 3320 */ 3321 ASSERT(PAGE_EXCL(npp)); 3322 ASSERT(npp->p_vnode == NULL); 3323 ASSERT(!hat_page_is_mapped(npp)); 3324 PP_CLRFREE(npp); 3325 PP_CLRAGED(npp); 3326 3327 /* 3328 * Here we have a page in our hot little mits and are 3329 * just waiting to stuff it on the appropriate lists. 3330 * Get the mutex and check to see if it really does 3331 * not exist. 3332 */ 3333 phm = PAGE_HASH_MUTEX(index); 3334 mutex_enter(phm); 3335 PAGE_HASH_SEARCH(index, pp, vp, off); 3336 if (pp == NULL) { 3337 VM_STAT_ADD(page_create_new); 3338 pp = npp; 3339 npp = NULL; 3340 if (!page_hashin(pp, vp, off, phm)) { 3341 /* 3342 * Since we hold the page hash mutex and 3343 * just searched for this page, page_hashin 3344 * had better not fail. If it does, that 3345 * means somethread did not follow the 3346 * page hash mutex rules. 
Panic now and 3347 * get it over with. As usual, go down 3348 * holding all the locks. 3349 */ 3350 ASSERT(MUTEX_HELD(phm)); 3351 panic("page_create: hashin fail %p %p %llx %p", 3352 (void *)pp, (void *)vp, off, (void *)phm); 3353 3354 } 3355 ASSERT(MUTEX_HELD(phm)); 3356 mutex_exit(phm); 3357 phm = NULL; 3358 3359 /* 3360 * Hat layer locking need not be done to set 3361 * the following bits since the page is not hashed 3362 * and was on the free list (i.e., had no mappings). 3363 * 3364 * Set the reference bit to protect 3365 * against immediate pageout 3366 * 3367 * XXXmh modify freelist code to set reference 3368 * bit so we don't have to do it here. 3369 */ 3370 page_set_props(pp, P_REF); 3371 } else { 3372 ASSERT(MUTEX_HELD(phm)); 3373 mutex_exit(phm); 3374 phm = NULL; 3375 /* 3376 * NOTE: This should not happen for pages associated 3377 * with kernel vnode 'kvp'. 3378 */ 3379 /* XX64 - to debug why this happens! */ 3380 ASSERT(!VN_ISKAS(vp)); 3381 if (VN_ISKAS(vp)) 3382 cmn_err(CE_NOTE, 3383 "page_create: page not expected " 3384 "in hash list for kernel vnode - pp 0x%p", 3385 (void *)pp); 3386 VM_STAT_ADD(page_create_exists); 3387 goto fail; 3388 } 3389 3390 /* 3391 * Got a page! It is locked. Acquire the i/o 3392 * lock since we are going to use the p_next and 3393 * p_prev fields to link the requested pages together. 3394 */ 3395 page_io_lock(pp); 3396 page_add(&plist, pp); 3397 plist = plist->p_next; 3398 off += MMU_PAGESIZE; 3399 vaddr += MMU_PAGESIZE; 3400 } 3401 3402 check_dma(mattr, plist, pages_req); 3403 return (plist); 3404 3405 fail: 3406 if (npp != NULL) { 3407 /* 3408 * Did not need this page after all. 3409 * Put it back on the free list. 3410 */ 3411 VM_STAT_ADD(page_create_putbacks); 3412 PP_SETFREE(npp); 3413 PP_SETAGED(npp); 3414 npp->p_offset = (u_offset_t)-1; 3415 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 3416 page_unlock(npp); 3417 } 3418 3419 /* 3420 * Give up the pages we already got. 
3421 */ 3422 while (plist != NULL) { 3423 pp = plist; 3424 page_sub(&plist, pp); 3425 page_io_unlock(pp); 3426 plist_len++; 3427 /*LINTED: constant in conditional ctx*/ 3428 VN_DISPOSE(pp, B_INVAL, 0, kcred); 3429 } 3430 3431 /* 3432 * VN_DISPOSE does freemem accounting for the pages in plist 3433 * by calling page_free. So, we need to undo the pcf accounting 3434 * for only the remaining pages. 3435 */ 3436 VM_STAT_ADD(page_create_putbacks); 3437 page_create_putback(pages_req - plist_len); 3438 3439 return (NULL); 3440 } 3441 #endif /* !__xpv */ 3442 3443 3444 /* 3445 * Copy the data from the physical page represented by "frompp" to 3446 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 3447 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 3448 * level and no one sleeps with an active mapping there. 3449 * 3450 * Note that the ref/mod bits in the page_t's are not affected by 3451 * this operation, hence it is up to the caller to update them appropriately. 
3452 */ 3453 int 3454 ppcopy(page_t *frompp, page_t *topp) 3455 { 3456 caddr_t pp_addr1; 3457 caddr_t pp_addr2; 3458 hat_mempte_t pte1; 3459 hat_mempte_t pte2; 3460 kmutex_t *ppaddr_mutex; 3461 label_t ljb; 3462 int ret = 1; 3463 3464 ASSERT_STACK_ALIGNED(); 3465 ASSERT(PAGE_LOCKED(frompp)); 3466 ASSERT(PAGE_LOCKED(topp)); 3467 3468 if (kpm_enable) { 3469 pp_addr1 = hat_kpm_page2va(frompp, 0); 3470 pp_addr2 = hat_kpm_page2va(topp, 0); 3471 kpreempt_disable(); 3472 } else { 3473 /* 3474 * disable pre-emption so that CPU can't change 3475 */ 3476 kpreempt_disable(); 3477 3478 pp_addr1 = CPU->cpu_caddr1; 3479 pp_addr2 = CPU->cpu_caddr2; 3480 pte1 = CPU->cpu_caddr1pte; 3481 pte2 = CPU->cpu_caddr2pte; 3482 3483 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3484 mutex_enter(ppaddr_mutex); 3485 3486 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1, 3487 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST); 3488 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2, 3489 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3490 HAT_LOAD_NOCONSIST); 3491 } 3492 3493 if (on_fault(&ljb)) { 3494 ret = 0; 3495 goto faulted; 3496 } 3497 if (use_sse_pagecopy) 3498 #ifdef __xpv 3499 page_copy_no_xmm(pp_addr2, pp_addr1); 3500 #else 3501 hwblkpagecopy(pp_addr1, pp_addr2); 3502 #endif 3503 else 3504 bcopy(pp_addr1, pp_addr2, PAGESIZE); 3505 3506 no_fault(); 3507 faulted: 3508 if (!kpm_enable) { 3509 #ifdef __xpv 3510 /* 3511 * We can't leave unused mappings laying about under the 3512 * hypervisor, so blow them away. 
3513 */ 3514 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0, 3515 UVMF_INVLPG | UVMF_LOCAL) < 0) 3516 panic("HYPERVISOR_update_va_mapping() failed"); 3517 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3518 UVMF_INVLPG | UVMF_LOCAL) < 0) 3519 panic("HYPERVISOR_update_va_mapping() failed"); 3520 #endif 3521 mutex_exit(ppaddr_mutex); 3522 } 3523 kpreempt_enable(); 3524 return (ret); 3525 } 3526 3527 void 3528 pagezero(page_t *pp, uint_t off, uint_t len) 3529 { 3530 ASSERT(PAGE_LOCKED(pp)); 3531 pfnzero(page_pptonum(pp), off, len); 3532 } 3533 3534 /* 3535 * Zero the physical page from off to off + len given by pfn 3536 * without changing the reference and modified bits of page. 3537 * 3538 * We use this using CPU private page address #2, see ppcopy() for more info. 3539 * pfnzero() must not be called at interrupt level. 3540 */ 3541 void 3542 pfnzero(pfn_t pfn, uint_t off, uint_t len) 3543 { 3544 caddr_t pp_addr2; 3545 hat_mempte_t pte2; 3546 kmutex_t *ppaddr_mutex = NULL; 3547 3548 ASSERT_STACK_ALIGNED(); 3549 ASSERT(len <= MMU_PAGESIZE); 3550 ASSERT(off <= MMU_PAGESIZE); 3551 ASSERT(off + len <= MMU_PAGESIZE); 3552 3553 if (kpm_enable && !pfn_is_foreign(pfn)) { 3554 pp_addr2 = hat_kpm_pfn2va(pfn); 3555 kpreempt_disable(); 3556 } else { 3557 kpreempt_disable(); 3558 3559 pp_addr2 = CPU->cpu_caddr2; 3560 pte2 = CPU->cpu_caddr2pte; 3561 3562 ppaddr_mutex = &CPU->cpu_ppaddr_mutex; 3563 mutex_enter(ppaddr_mutex); 3564 3565 hat_mempte_remap(pfn, pp_addr2, pte2, 3566 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK, 3567 HAT_LOAD_NOCONSIST); 3568 } 3569 3570 if (use_sse_pagezero) { 3571 #ifdef __xpv 3572 uint_t rem; 3573 3574 /* 3575 * zero a byte at a time until properly aligned for 3576 * block_zero_no_xmm(). 3577 */ 3578 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0) 3579 pp_addr2[off++] = 0; 3580 3581 /* 3582 * Now use faster block_zero_no_xmm() for any range 3583 * that is properly aligned and sized. 
3584 */ 3585 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN)); 3586 len -= rem; 3587 if (len != 0) { 3588 block_zero_no_xmm(pp_addr2 + off, len); 3589 off += len; 3590 } 3591 3592 /* 3593 * zero remainder with byte stores. 3594 */ 3595 while (rem-- > 0) 3596 pp_addr2[off++] = 0; 3597 #else 3598 hwblkclr(pp_addr2 + off, len); 3599 #endif 3600 } else { 3601 bzero(pp_addr2 + off, len); 3602 } 3603 3604 if (!kpm_enable || pfn_is_foreign(pfn)) { 3605 #ifdef __xpv 3606 /* 3607 * On the hypervisor this page might get used for a page 3608 * table before any intervening change to this mapping, 3609 * so blow it away. 3610 */ 3611 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0, 3612 UVMF_INVLPG) < 0) 3613 panic("HYPERVISOR_update_va_mapping() failed"); 3614 #endif 3615 mutex_exit(ppaddr_mutex); 3616 } 3617 3618 kpreempt_enable(); 3619 } 3620 3621 /* 3622 * Platform-dependent page scrub call. 3623 */ 3624 void 3625 pagescrub(page_t *pp, uint_t off, uint_t len) 3626 { 3627 /* 3628 * For now, we rely on the fact that pagezero() will 3629 * always clear UEs. 
3630 */ 3631 pagezero(pp, off, len); 3632 } 3633 3634 /* 3635 * set up two private addresses for use on a given CPU for use in ppcopy() 3636 */ 3637 void 3638 setup_vaddr_for_ppcopy(struct cpu *cpup) 3639 { 3640 void *addr; 3641 hat_mempte_t pte_pa; 3642 3643 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3644 pte_pa = hat_mempte_setup(addr); 3645 cpup->cpu_caddr1 = addr; 3646 cpup->cpu_caddr1pte = pte_pa; 3647 3648 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP); 3649 pte_pa = hat_mempte_setup(addr); 3650 cpup->cpu_caddr2 = addr; 3651 cpup->cpu_caddr2pte = pte_pa; 3652 3653 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL); 3654 } 3655 3656 /* 3657 * Undo setup_vaddr_for_ppcopy 3658 */ 3659 void 3660 teardown_vaddr_for_ppcopy(struct cpu *cpup) 3661 { 3662 mutex_destroy(&cpup->cpu_ppaddr_mutex); 3663 3664 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte); 3665 cpup->cpu_caddr2pte = 0; 3666 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1)); 3667 cpup->cpu_caddr2 = 0; 3668 3669 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte); 3670 cpup->cpu_caddr1pte = 0; 3671 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1)); 3672 cpup->cpu_caddr1 = 0; 3673 } 3674 3675 /* 3676 * Create the pageout scanner thread. The thread has to 3677 * start at procedure with process pp and priority pri. 3678 */ 3679 void 3680 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) 3681 { 3682 (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); 3683 } 3684 3685 /* 3686 * Function for flushing D-cache when performing module relocations 3687 * to an alternate mapping. Unnecessary on Intel / AMD platforms. 3688 */ 3689 void 3690 dcache_flushall() 3691 {} 3692 3693 size_t 3694 exec_get_spslew(void) 3695 { 3696 return (0); 3697 } 3698 3699 /* 3700 * Allocate a memory page. The argument 'seed' can be any pseudo-random 3701 * number to vary where the pages come from. 
This is quite a hacked up
 * method -- it works for now, but really needs to be fixed up a bit.
 *
 * We currently use page_create_va() on the kvp with fake offsets,
 * segments and virt address.  This is pretty bogus, but was copied from the
 * old hat_i86.c code.  A better approach would be to specify either mnode
 * random or mnode local and takes a page from whatever color has the MOST
 * available - this would have a minimal impact on page coloring.
 *
 * Returns the new page, or NULL if a page could not be reserved or
 * created.  On success the one-page reservation taken here stays
 * outstanding.  On failure no reservation is left behind.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	/*
	 * This code is gross, we really need a simpler page allocator.
	 *
	 * We need to assign an offset for the page to call page_create_va().
	 * To avoid conflicts with other pages, we get creative with the offset.
	 * For 32 bits, we pick an offset > 4Gig
	 * For 64 bits, pick an offset somewhere in the VA hole.
	 */
	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
	offset += mmu.hole_start;	/* something in VA hole */
#else
	offset += 1ULL << 40;		/* something > 4 Gig */
#endif

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", (void *)pp);
#endif

	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
	if (pp == NULL) {
		/*
		 * No page was created, so give back the reservation taken
		 * above; otherwise it would be leaked on every failure.
		 */
		page_unresv(1);
		return (NULL);
	}

	/*
	 * Drop the i/o lock and remove the page from the page hash before
	 * handing it to the caller.
	 */
	page_io_unlock(pp);
	page_hashout(pp, NULL);
	return (pp);
}