1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 /* 37 * UNIX machine dependent virtual memory support. 38 */ 39 40 #include <sys/types.h> 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/user.h> 44 #include <sys/proc.h> 45 #include <sys/kmem.h> 46 #include <sys/vmem.h> 47 #include <sys/buf.h> 48 #include <sys/cpuvar.h> 49 #include <sys/lgrp.h> 50 #include <sys/disp.h> 51 #include <sys/vm.h> 52 #include <sys/mman.h> 53 #include <sys/vnode.h> 54 #include <sys/cred.h> 55 #include <sys/exec.h> 56 #include <sys/exechdr.h> 57 #include <sys/debug.h> 58 #include <sys/vmsystm.h> 59 60 #include <vm/hat.h> 61 #include <vm/as.h> 62 #include <vm/seg.h> 63 #include <vm/seg_kp.h> 64 #include <vm/seg_vn.h> 65 #include <vm/page.h> 66 #include <vm/seg_kmem.h> 67 #include <vm/seg_kpm.h> 68 #include <vm/vm_dep.h> 69 70 #include <sys/cpu.h> 71 #include <sys/vm_machparam.h> 72 #include <sys/memlist.h> 73 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */ 74 #include <vm/hat_i86.h> 75 #include <sys/x86_archext.h> 76 #include <sys/elf_386.h> 77 #include <sys/cmn_err.h> 78 #include <sys/archsystm.h> 79 #include <sys/machsystm.h> 80 81 #include <sys/vtrace.h> 82 #include <sys/ddidmareq.h> 83 #include <sys/promif.h> 84 #include <sys/memnode.h> 85 #include <sys/stack.h> 86 #include <util/qsort.h> 87 #include <sys/taskq.h> 88 89 #ifdef __xpv 90 91 #include <sys/hypervisor.h> 92 #include <sys/xen_mmu.h> 93 #include <sys/balloon_impl.h> 94 95 /* 96 * domain 0 pages usable for DMA are kept pre-allocated and kept in 97 * distinct lists, ordered by increasing mfn. 
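 *
 * Two pools are maintained below: io_pool_16m for legacy devices limited to
 * 24-bit (ISA-style) DMA and io_pool_4g for devices limited to 32-bit DMA.
 * The target pool size is a percentage of physmem (io_pool_physmem_pct,
 * 2% by default) with an absolute floor of io_pool_cnt_min pages.  As an
 * illustrative example only: a 4G domain (1048576 4K pages) yields an
 * initial io_pool_cnt_max of physmem / 50, i.e. roughly 20971 pages (see
 * populate_io_pool() below).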
 */
static kmutex_t io_pool_lock;
static kmutex_t contig_list_lock;
static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
static long io_pool_cnt;
static long io_pool_cnt_max = 0;
#define	DEFAULT_IO_POOL_MIN	128
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
static long io_pool_cnt_lowater = 0;
static long io_pool_shrink_attempts; /* how many times did we try to shrink */
static long io_pool_shrinks;	/* how many times did we really shrink */
static long io_pool_grows;	/* how many times did we grow */
static mfn_t start_mfn = 1;
static caddr_t io_pool_kva;	/* use to alloc pages when needed */

static int create_contig_pfnlist(uint_t);

/*
 * percentage of phys mem to hold in the i/o pool
 */
#define	DEFAULT_IO_POOL_PCT	2
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
static void page_io_pool_sub(page_t **, page_t *, page_t *);
int ioalloc_dbg = 0;

#endif /* __xpv */

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 */
typedef struct {
	pfn_t	mnr_pfnlo;
	pfn_t	mnr_pfnhi;
	int	mnr_mnode;
	int	mnr_memrange;		/* index into memranges[] */
	/* maintain page list stats */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t	mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
	pgcnt_t	mnr_mt_totcnt;		/* sum of cache and free lists */
#ifdef DEBUG
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;
		int	mnr_mts_colors;
		pgcnt_t	*mnr_mtsc_pgcnt;
	}	*mnr_mts;
#endif
} mnoderange_t;

#define	MEMRANGEHI(mtype)						\
	((mtype > 0) ? memranges[mtype - 1] - 1 : physmax)
#define	MEMRANGELO(mtype)	(memranges[mtype])

#define	MTYPE_FREEMEM(mt)	(mnoderanges[mt].mnr_mt_totcnt)

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * These are listed in reverse order, so that we can skip over unused
 * ranges on machines with small memories.
 *
 * For now under the Hypervisor, we'll only ever have one memrange.
 */
#define	PFN_4GIG	0x100000
#define	PFN_16MEG	0x1000
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
	PFN_4GIG,	/* pfn range for 4G and above */
	0x80000,	/* pfn range for 2G-4G */
	PFN_16MEG,	/* pfn range for 16M-2G */
	0x00000,	/* pfn range for 0-16M */
};
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * This combines mem_node_config and memranges into one data
 * structure to be used for page list management.
 */
mnoderange_t	*mnoderanges;
int	mnoderangecnt;
int	mtype4g;

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6.25% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
 */

#define	LOTSFREE4G	(maxmem4g >> lotsfree4gshift)
#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
static int	lotsfree4gshift = 3;

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool. Allocations
 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 * are not restricted.
 */

#define	FREEMEM16M	MTYPE_FREEMEM(0)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
	((freemem != 0) && ((flags & PG_PANIC) == 0) &&		\
	    ((freemem >= (FREEMEM16M)) ||			\
	    (FREEMEM16M < (DESFREE16M + pgcnt))))

static pgcnt_t	desfree16m = 0x380;

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
int restricted_kmemalloc = 0;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/* page sizes that legacy applications can see */
uint_t mmu_legacy_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
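 *
 * Each of these defaults to MMU_PAGESIZE (4K on x86), so large pages are
 * effectively opt-in.  As an illustrative example only, 2M pages could be
 * requested for process heaps with /etc/system entries along the lines of
 * "set max_uheap_lpsize = 0x200000" and "set default_uheap_lpsize =
 * 0x200000".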
298 * These can be patched via /etc/system to allow large pages 299 * to be used for mapping application private and shared anon memory. 300 */ 301 size_t mcntl0_lpsize = MMU_PAGESIZE; 302 size_t max_uheap_lpsize = MMU_PAGESIZE; 303 size_t default_uheap_lpsize = MMU_PAGESIZE; 304 size_t max_ustack_lpsize = MMU_PAGESIZE; 305 size_t default_ustack_lpsize = MMU_PAGESIZE; 306 size_t max_privmap_lpsize = MMU_PAGESIZE; 307 size_t max_uidata_lpsize = MMU_PAGESIZE; 308 size_t max_utext_lpsize = MMU_PAGESIZE; 309 size_t max_shm_lpsize = MMU_PAGESIZE; 310 311 312 /* 313 * initialized by page_coloring_init(). 314 */ 315 uint_t page_colors; 316 uint_t page_colors_mask; 317 uint_t page_coloring_shift; 318 int cpu_page_colors; 319 static uint_t l2_colors; 320 321 /* 322 * Page freelists and cachelists are dynamically allocated once mnoderangecnt 323 * and page_colors are calculated from the l2 cache n-way set size. Within a 324 * mnode range, the page freelist and cachelist are hashed into bins based on 325 * color. This makes it easier to search for a page within a specific memory 326 * range. 327 */ 328 #define PAGE_COLORS_MIN 16 329 330 page_t ****page_freelists; 331 page_t ***page_cachelists; 332 333 334 /* 335 * Used by page layer to know about page sizes 336 */ 337 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; 338 339 kmutex_t *fpc_mutex[NPC_MUTEX]; 340 kmutex_t *cpc_mutex[NPC_MUTEX]; 341 342 /* 343 * Only let one thread at a time try to coalesce large pages, to 344 * prevent them from working against each other. 345 */ 346 static kmutex_t contig_lock; 347 #define CONTIG_LOCK() mutex_enter(&contig_lock); 348 #define CONTIG_UNLOCK() mutex_exit(&contig_lock); 349 350 #define PFN_16M (mmu_btop((uint64_t)0x1000000)) 351 352 /* 353 * Return the optimum page size for a given mapping 354 */ 355 /*ARGSUSED*/ 356 size_t 357 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl) 358 { 359 level_t l = 0; 360 size_t pgsz = MMU_PAGESIZE; 361 size_t max_lpsize; 362 uint_t mszc; 363 364 ASSERT(maptype != MAPPGSZ_VA); 365 366 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) { 367 return (MMU_PAGESIZE); 368 } 369 370 switch (maptype) { 371 case MAPPGSZ_HEAP: 372 case MAPPGSZ_STK: 373 max_lpsize = memcntl ? mcntl0_lpsize : (maptype == 374 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize); 375 if (max_lpsize == MMU_PAGESIZE) { 376 return (MMU_PAGESIZE); 377 } 378 if (len == 0) { 379 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase + 380 p->p_brksize - p->p_bssbase : p->p_stksize; 381 } 382 len = (maptype == MAPPGSZ_HEAP) ? MAX(len, 383 default_uheap_lpsize) : MAX(len, default_ustack_lpsize); 384 385 /* 386 * use the pages size that best fits len 387 */ 388 for (l = mmu.umax_page_level; l > 0; --l) { 389 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) { 390 continue; 391 } else { 392 pgsz = LEVEL_SIZE(l); 393 } 394 break; 395 } 396 397 mszc = (maptype == MAPPGSZ_HEAP ? 
p->p_brkpageszc : 398 p->p_stkpageszc); 399 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) { 400 pgsz = hw_page_array[mszc].hp_size; 401 } 402 return (pgsz); 403 404 case MAPPGSZ_ISM: 405 for (l = mmu.umax_page_level; l > 0; --l) { 406 if (len >= LEVEL_SIZE(l)) 407 return (LEVEL_SIZE(l)); 408 } 409 return (LEVEL_SIZE(0)); 410 } 411 return (pgsz); 412 } 413 414 static uint_t 415 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize, 416 size_t min_physmem) 417 { 418 caddr_t eaddr = addr + size; 419 uint_t szcvec = 0; 420 caddr_t raddr; 421 caddr_t readdr; 422 size_t pgsz; 423 int i; 424 425 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) { 426 return (0); 427 } 428 429 for (i = mmu_exported_page_sizes - 1; i > 0; i--) { 430 pgsz = page_get_pagesize(i); 431 if (pgsz > max_lpsize) { 432 continue; 433 } 434 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 435 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 436 if (raddr < addr || raddr >= readdr) { 437 continue; 438 } 439 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) { 440 continue; 441 } 442 /* 443 * Set szcvec to the remaining page sizes. 444 */ 445 szcvec = ((1 << (i + 1)) - 1) & ~1; 446 break; 447 } 448 return (szcvec); 449 } 450 451 /* 452 * Return a bit vector of large page size codes that 453 * can be used to map [addr, addr + len) region. 454 */ 455 /*ARGSUSED*/ 456 uint_t 457 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type, 458 int memcntl) 459 { 460 size_t max_lpsize = mcntl0_lpsize; 461 462 if (mmu.max_page_level == 0) 463 return (0); 464 465 if (flags & MAP_TEXT) { 466 if (!memcntl) 467 max_lpsize = max_utext_lpsize; 468 return (map_szcvec(addr, size, off, max_lpsize, 469 shm_lpg_min_physmem)); 470 471 } else if (flags & MAP_INITDATA) { 472 if (!memcntl) 473 max_lpsize = max_uidata_lpsize; 474 return (map_szcvec(addr, size, off, max_lpsize, 475 privm_lpg_min_physmem)); 476 477 } else if (type == MAPPGSZC_SHM) { 478 if (!memcntl) 479 max_lpsize = max_shm_lpsize; 480 return (map_szcvec(addr, size, off, max_lpsize, 481 shm_lpg_min_physmem)); 482 483 } else if (type == MAPPGSZC_HEAP) { 484 if (!memcntl) 485 max_lpsize = max_uheap_lpsize; 486 return (map_szcvec(addr, size, off, max_lpsize, 487 privm_lpg_min_physmem)); 488 489 } else if (type == MAPPGSZC_STACK) { 490 if (!memcntl) 491 max_lpsize = max_ustack_lpsize; 492 return (map_szcvec(addr, size, off, max_lpsize, 493 privm_lpg_min_physmem)); 494 495 } else { 496 if (!memcntl) 497 max_lpsize = max_privmap_lpsize; 498 return (map_szcvec(addr, size, off, max_lpsize, 499 privm_lpg_min_physmem)); 500 } 501 } 502 503 /* 504 * Handle a pagefault. 505 */ 506 faultcode_t 507 pagefault( 508 caddr_t addr, 509 enum fault_type type, 510 enum seg_rw rw, 511 int iskernel) 512 { 513 struct as *as; 514 struct hat *hat; 515 struct proc *p; 516 kthread_t *t; 517 faultcode_t res; 518 caddr_t base; 519 size_t len; 520 int err; 521 int mapped_red; 522 uintptr_t ea; 523 524 ASSERT_STACK_ALIGNED(); 525 526 if (INVALID_VADDR(addr)) 527 return (FC_NOMAP); 528 529 mapped_red = segkp_map_red(); 530 531 if (iskernel) { 532 as = &kas; 533 hat = as->a_hat; 534 } else { 535 t = curthread; 536 p = ttoproc(t); 537 as = p->p_as; 538 hat = as->a_hat; 539 } 540 541 /* 542 * Dispatch pagefault. 543 */ 544 res = as_fault(hat, as, addr, 1, type, rw); 545 546 /* 547 * If this isn't a potential unmapped hole in the user's 548 * UNIX data or stack segments, just return status info. 
 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments. If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above. We just fall
			 * through to as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user. We will pick an address
 * range which is the highest available below userlimit.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion. We decide to completely ignore this hint.
 *
 *	On output it is NULL if no address can be found in the current
 *	process's address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 *	align is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space. For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
674 */ 675 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff)); 676 677 if (userlimit > base) 678 slen = userlimit - base; 679 else { 680 *addrp = NULL; 681 return; 682 } 683 } else { 684 /* 685 * XX64 This layout is probably wrong .. but in 686 * the event we make the amd64 address space look 687 * like sparcv9 i.e. with the stack -above- the 688 * heap, this bit of code might even be correct. 689 */ 690 slen = p->p_usrstack - base - 691 (((size_t)rctl_enforced_value( 692 rctlproc_legacy[RLIMIT_STACK], 693 p->p_rctls, p) + PAGEOFFSET) & PAGEMASK); 694 } 695 } else 696 #endif 697 slen = userlimit - base; 698 699 len = (len + PAGEOFFSET) & PAGEMASK; 700 701 /* 702 * Redzone for each side of the request. This is done to leave 703 * one page unmapped between segments. This is not required, but 704 * it's useful for the user because if their program strays across 705 * a segment boundary, it will catch a fault immediately making 706 * debugging a little easier. 707 */ 708 len += 2 * MMU_PAGESIZE; 709 710 /* 711 * figure out what the alignment should be 712 * 713 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same???? 714 */ 715 if (len <= ELF_386_MAXPGSZ) { 716 /* 717 * Align virtual addresses to ensure that ELF shared libraries 718 * are mapped with the appropriate alignment constraints by 719 * the run-time linker. 720 */ 721 align_amount = ELF_386_MAXPGSZ; 722 } else { 723 int l = mmu.umax_page_level; 724 725 while (l && len < LEVEL_SIZE(l)) 726 --l; 727 728 align_amount = LEVEL_SIZE(l); 729 } 730 731 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount)) 732 align_amount = (uintptr_t)*addrp; 733 734 len += align_amount; 735 736 /* 737 * Look for a large enough hole starting below userlimit. 738 * After finding it, use the upper part. Addition of PAGESIZE 739 * is for the redzone as described above. 740 */ 741 if (as_gap(as, len, &base, &slen, AH_HI, NULL) == 0) { 742 caddr_t as_addr; 743 744 addr = base + slen - len + MMU_PAGESIZE; 745 as_addr = addr; 746 /* 747 * Round address DOWN to the alignment amount, 748 * add the offset, and if this address is less 749 * than the original address, add alignment amount. 750 */ 751 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1))); 752 addr += (uintptr_t)(off & (align_amount - 1)); 753 if (addr < as_addr) 754 addr += align_amount; 755 756 ASSERT(addr <= (as_addr + align_amount)); 757 ASSERT(((uintptr_t)addr & (align_amount - 1)) == 758 ((uintptr_t)(off & (align_amount - 1)))); 759 *addrp = addr; 760 } else { 761 *addrp = NULL; /* no more virtual space */ 762 } 763 } 764 765 /* 766 * Determine whether [base, base+len] contains a valid range of 767 * addresses at least minlen long. base and len are adjusted if 768 * required to provide a valid range. 769 */ 770 /*ARGSUSED3*/ 771 int 772 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir) 773 { 774 uintptr_t hi, lo; 775 776 lo = (uintptr_t)*basep; 777 hi = lo + *lenp; 778 779 /* 780 * If hi rolled over the top, try cutting back. 781 */ 782 if (hi < lo) { 783 if (0 - lo + hi < minlen) 784 return (0); 785 if (0 - lo < minlen) 786 return (0); 787 *lenp = 0 - lo; 788 } else if (hi - lo < minlen) { 789 return (0); 790 } 791 #if defined(__amd64) 792 /* 793 * Deal with a possible hole in the address range between 794 * hole_start and hole_end that should never be mapped. 
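	 *
	 * Illustrative reading of the logic below: if [lo, hi) straddles the
	 * hole, the range is clipped to whichever side still satisfies
	 * minlen; with dir == AH_LO the low piece [lo, hole_start) is
	 * preferred and [hole_end, hi) is only used as a fallback.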
795 */ 796 if (lo < hole_start) { 797 if (hi > hole_start) { 798 if (hi < hole_end) { 799 hi = hole_start; 800 } else { 801 /* lo < hole_start && hi >= hole_end */ 802 if (dir == AH_LO) { 803 /* 804 * prefer lowest range 805 */ 806 if (hole_start - lo >= minlen) 807 hi = hole_start; 808 else if (hi - hole_end >= minlen) 809 lo = hole_end; 810 else 811 return (0); 812 } else { 813 /* 814 * prefer highest range 815 */ 816 if (hi - hole_end >= minlen) 817 lo = hole_end; 818 else if (hole_start - lo >= minlen) 819 hi = hole_start; 820 else 821 return (0); 822 } 823 } 824 } 825 } else { 826 /* lo >= hole_start */ 827 if (hi < hole_end) 828 return (0); 829 if (lo < hole_end) 830 lo = hole_end; 831 } 832 833 if (hi - lo < minlen) 834 return (0); 835 836 *basep = (caddr_t)lo; 837 *lenp = hi - lo; 838 #endif 839 return (1); 840 } 841 842 /* 843 * Determine whether [addr, addr+len] are valid user addresses. 844 */ 845 /*ARGSUSED*/ 846 int 847 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as, 848 caddr_t userlimit) 849 { 850 caddr_t eaddr = addr + len; 851 852 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit) 853 return (RANGE_BADADDR); 854 855 #if defined(__amd64) 856 /* 857 * Check for the VA hole 858 */ 859 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end) 860 return (RANGE_BADADDR); 861 #endif 862 863 return (RANGE_OKAY); 864 } 865 866 /* 867 * Return 1 if the page frame is onboard memory, else 0. 868 */ 869 int 870 pf_is_memory(pfn_t pf) 871 { 872 if (pfn_is_foreign(pf)) 873 return (0); 874 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1)); 875 } 876 877 /* 878 * return the memrange containing pfn 879 */ 880 int 881 memrange_num(pfn_t pfn) 882 { 883 int n; 884 885 for (n = 0; n < nranges - 1; ++n) { 886 if (pfn >= memranges[n]) 887 break; 888 } 889 return (n); 890 } 891 892 /* 893 * return the mnoderange containing pfn 894 */ 895 /*ARGSUSED*/ 896 int 897 pfn_2_mtype(pfn_t pfn) 898 { 899 #if defined(__xpv) 900 return (0); 901 #else 902 int n; 903 904 for (n = mnoderangecnt - 1; n >= 0; n--) { 905 if (pfn >= mnoderanges[n].mnr_pfnlo) { 906 break; 907 } 908 } 909 return (n); 910 #endif 911 } 912 913 #if !defined(__xpv) 914 /* 915 * is_contigpage_free: 916 * returns a page list of contiguous pages. It minimally has to return 917 * minctg pages. Caller determines minctg based on the scatter-gather 918 * list length. 919 * 920 * pfnp is set to the next page frame to search on return. 921 */ 922 static page_t * 923 is_contigpage_free( 924 pfn_t *pfnp, 925 pgcnt_t *pgcnt, 926 pgcnt_t minctg, 927 uint64_t pfnseg, 928 int iolock) 929 { 930 int i = 0; 931 pfn_t pfn = *pfnp; 932 page_t *pp; 933 page_t *plist = NULL; 934 935 /* 936 * fail if pfn + minctg crosses a segment boundary. 937 * Adjust for next starting pfn to begin at segment boundary. 
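	 *
	 * Illustrative example: with pfnseg == 0xFFFFF (4G DMA segments), a
	 * request for minctg == 16 pages starting at pfn 0xFFFF8 would cross
	 * a segment boundary, so *pfnp is advanced to
	 * roundup(0xFFFF8, 0x100000) == 0x100000 and NULL is returned.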
938 */ 939 940 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) { 941 *pfnp = roundup(*pfnp, pfnseg + 1); 942 return (NULL); 943 } 944 945 do { 946 retry: 947 pp = page_numtopp_nolock(pfn + i); 948 if ((pp == NULL) || 949 (page_trylock(pp, SE_EXCL) == 0)) { 950 (*pfnp)++; 951 break; 952 } 953 if (page_pptonum(pp) != pfn + i) { 954 page_unlock(pp); 955 goto retry; 956 } 957 958 if (!(PP_ISFREE(pp))) { 959 page_unlock(pp); 960 (*pfnp)++; 961 break; 962 } 963 964 if (!PP_ISAGED(pp)) { 965 page_list_sub(pp, PG_CACHE_LIST); 966 page_hashout(pp, (kmutex_t *)NULL); 967 } else { 968 page_list_sub(pp, PG_FREE_LIST); 969 } 970 971 if (iolock) 972 page_io_lock(pp); 973 page_list_concat(&plist, &pp); 974 975 /* 976 * exit loop when pgcnt satisfied or segment boundary reached. 977 */ 978 979 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg)); 980 981 *pfnp += i; /* set to next pfn to search */ 982 983 if (i >= minctg) { 984 *pgcnt -= i; 985 return (plist); 986 } 987 988 /* 989 * failure: minctg not satisfied. 990 * 991 * if next request crosses segment boundary, set next pfn 992 * to search from the segment boundary. 993 */ 994 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) 995 *pfnp = roundup(*pfnp, pfnseg + 1); 996 997 /* clean up any pages already allocated */ 998 999 while (plist) { 1000 pp = plist; 1001 page_sub(&plist, pp); 1002 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 1003 if (iolock) 1004 page_io_unlock(pp); 1005 page_unlock(pp); 1006 } 1007 1008 return (NULL); 1009 } 1010 #endif /* !__xpv */ 1011 1012 /* 1013 * verify that pages being returned from allocator have correct DMA attribute 1014 */ 1015 #ifndef DEBUG 1016 #define check_dma(a, b, c) (0) 1017 #else 1018 static void 1019 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) 1020 { 1021 if (dma_attr == NULL) 1022 return; 1023 1024 while (cnt-- > 0) { 1025 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) < 1026 dma_attr->dma_attr_addr_lo) 1027 panic("PFN (pp=%p) below dma_attr_addr_lo", pp); 1028 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >= 1029 dma_attr->dma_attr_addr_hi) 1030 panic("PFN (pp=%p) above dma_attr_addr_hi", pp); 1031 pp = pp->p_next; 1032 } 1033 } 1034 #endif 1035 1036 #if !defined(__xpv) 1037 static page_t * 1038 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock) 1039 { 1040 pfn_t pfn; 1041 int sgllen; 1042 uint64_t pfnseg; 1043 pgcnt_t minctg; 1044 page_t *pplist = NULL, *plist; 1045 uint64_t lo, hi; 1046 pgcnt_t pfnalign = 0; 1047 static pfn_t startpfn; 1048 static pgcnt_t lastctgcnt; 1049 uintptr_t align; 1050 1051 CONTIG_LOCK(); 1052 1053 if (mattr) { 1054 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET)); 1055 hi = mmu_btop(mattr->dma_attr_addr_hi); 1056 if (hi >= physmax) 1057 hi = physmax - 1; 1058 sgllen = mattr->dma_attr_sgllen; 1059 pfnseg = mmu_btop(mattr->dma_attr_seg); 1060 1061 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 1062 if (align > MMU_PAGESIZE) 1063 pfnalign = mmu_btop(align); 1064 1065 /* 1066 * in order to satisfy the request, must minimally 1067 * acquire minctg contiguous pages 1068 */ 1069 minctg = howmany(*pgcnt, sgllen); 1070 1071 ASSERT(hi >= lo); 1072 1073 /* 1074 * start from where last searched if the minctg >= lastctgcnt 1075 */ 1076 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi) 1077 startpfn = lo; 1078 } else { 1079 hi = physmax - 1; 1080 lo = 0; 1081 sgllen = 1; 1082 pfnseg = mmu.highest_pfn; 1083 minctg = *pgcnt; 1084 1085 if (minctg < lastctgcnt) 1086 startpfn = lo; 1087 } 1088 lastctgcnt = minctg; 1089 1090 
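	/*
	 * Illustrative example of the arithmetic above: a 64K request
	 * (*pgcnt == 16) with dma_attr_sgllen == 4 only needs
	 * minctg = howmany(16, 4) == 4 physically contiguous pages per
	 * scatter/gather element.
	 */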
ASSERT(pfnseg + 1 >= (uint64_t)minctg); 1091 1092 /* conserve 16m memory - start search above 16m when possible */ 1093 if (hi > PFN_16M && startpfn < PFN_16M) 1094 startpfn = PFN_16M; 1095 1096 pfn = startpfn; 1097 if (pfnalign) 1098 pfn = P2ROUNDUP(pfn, pfnalign); 1099 1100 while (pfn + minctg - 1 <= hi) { 1101 1102 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1103 if (plist) { 1104 page_list_concat(&pplist, &plist); 1105 sgllen--; 1106 /* 1107 * return when contig pages no longer needed 1108 */ 1109 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1110 startpfn = pfn; 1111 CONTIG_UNLOCK(); 1112 check_dma(mattr, pplist, *pgcnt); 1113 return (pplist); 1114 } 1115 minctg = howmany(*pgcnt, sgllen); 1116 } 1117 if (pfnalign) 1118 pfn = P2ROUNDUP(pfn, pfnalign); 1119 } 1120 1121 /* cannot find contig pages in specified range */ 1122 if (startpfn == lo) { 1123 CONTIG_UNLOCK(); 1124 return (NULL); 1125 } 1126 1127 /* did not start with lo previously */ 1128 pfn = lo; 1129 if (pfnalign) 1130 pfn = P2ROUNDUP(pfn, pfnalign); 1131 1132 /* allow search to go above startpfn */ 1133 while (pfn < startpfn) { 1134 1135 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 1136 if (plist != NULL) { 1137 1138 page_list_concat(&pplist, &plist); 1139 sgllen--; 1140 1141 /* 1142 * return when contig pages no longer needed 1143 */ 1144 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 1145 startpfn = pfn; 1146 CONTIG_UNLOCK(); 1147 check_dma(mattr, pplist, *pgcnt); 1148 return (pplist); 1149 } 1150 minctg = howmany(*pgcnt, sgllen); 1151 } 1152 if (pfnalign) 1153 pfn = P2ROUNDUP(pfn, pfnalign); 1154 } 1155 CONTIG_UNLOCK(); 1156 return (NULL); 1157 } 1158 #endif /* !__xpv */ 1159 1160 /* 1161 * mnode_range_cnt() calculates the number of memory ranges for mnode and 1162 * memranges[]. Used to determine the size of page lists and mnoderanges. 1163 */ 1164 int 1165 mnode_range_cnt(int mnode) 1166 { 1167 #if defined(__xpv) 1168 ASSERT(mnode == 0); 1169 return (1); 1170 #else /* __xpv */ 1171 int mri; 1172 int mnrcnt = 0; 1173 1174 if (mem_node_config[mnode].exists != 0) { 1175 mri = nranges - 1; 1176 1177 /* find the memranges index below contained in mnode range */ 1178 1179 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1180 mri--; 1181 1182 /* 1183 * increment mnode range counter when memranges or mnode 1184 * boundary is reached. 1185 */ 1186 while (mri >= 0 && 1187 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1188 mnrcnt++; 1189 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1190 mri--; 1191 else 1192 break; 1193 } 1194 } 1195 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 1196 return (mnrcnt); 1197 #endif /* __xpv */ 1198 } 1199 1200 /* 1201 * mnode_range_setup() initializes mnoderanges. 
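 *
 * Illustrative example: a single memory node spanning physical addresses
 * 0 through 8G intersects all four arch_memranges and so produces four
 * mnoderange entries (0-16M, 16M-2G, 2G-4G and 4G-8G), each clipped to
 * the node's [physbase, physmax] span.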
 */
void
mnode_range_setup(mnoderange_t *mnoderanges)
{
	int	mnode, mri;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
		if (mem_node_config[mnode].exists == 0)
			continue;

		mri = nranges - 1;

		while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
			mri--;

		while (mri >= 0 && mem_node_config[mnode].physmax >=
		    MEMRANGELO(mri)) {
			mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
			    mem_node_config[mnode].physbase);
			mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
			    mem_node_config[mnode].physmax);
			mnoderanges->mnr_mnode = mnode;
			mnoderanges->mnr_memrange = mri;
			mnoderanges++;
			if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
				mri--;
			else
				break;
		}
	}
}

/*ARGSUSED*/
int
mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
{
	int mtype = mnoderangecnt - 1;

#if !defined(__xpv)
#if defined(__i386)
	/*
	 * set the mtype range
	 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
	 * - for non kmem requests, set range to above 4g if memory below 4g
	 * runs low.
	 */
	if (restricted_kmemalloc && VN_ISKAS(vp) &&
	    (caddr_t)(vaddr) >= kernelheap &&
	    (caddr_t)(vaddr) < ekernelheap) {
		ASSERT(physmax4g);
		mtype = mtype4g;
		if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
		    btop(pgsz), *flags)) {
			*flags |= PGI_MT_RANGE16M;
		} else {
			VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
			VM_STAT_COND_ADD((*flags & PG_PANIC),
			    vmm_vmstats.pgpanicalloc);
			*flags |= PGI_MT_RANGE0;
		}
		return (mtype);
	}
#endif /* __i386 */

	if (RESTRICT4G_ALLOC) {
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
		/* here only for > 4g systems */
		*flags |= PGI_MT_RANGE4G;
	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
		*flags |= PGI_MT_RANGE0;
	}
#endif /* !__xpv */
	return (mtype);
}


/* mtype init for page_get_replacement_page */
/*ARGSUSED*/
int
mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
{
	int mtype = mnoderangecnt - 1;
#if !defined(__xpv)
	if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
		*flags |= PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		*flags |= PGI_MT_RANGE0;
	}
#endif
	return (mtype);
}

/*
 * Determine if the mnode range specified in mtype contains memory belonging
 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
 * the range of indices from high pfn to 0, 16m or 4g.
 *
 * Return the first mnode range type index found; otherwise return -1.
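 *
 * Illustrative example: a caller passing PGI_MT_RANGE16M walks mtype
 * downward but never descends to index 0, leaving the 0-16M range to
 * callers such as page_get_anylist() that ask for specific pfn ranges.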
1305 */ 1306 int 1307 mtype_func(int mnode, int mtype, uint_t flags) 1308 { 1309 if (flags & PGI_MT_RANGE) { 1310 int mtlim = 0; 1311 1312 if (flags & PGI_MT_NEXT) 1313 mtype--; 1314 if (flags & PGI_MT_RANGE4G) 1315 mtlim = mtype4g + 1; /* exclude 0-4g range */ 1316 else if (flags & PGI_MT_RANGE16M) 1317 mtlim = 1; /* exclude 0-16m range */ 1318 while (mtype >= mtlim) { 1319 if (mnoderanges[mtype].mnr_mnode == mnode) 1320 return (mtype); 1321 mtype--; 1322 } 1323 } else if (mnoderanges[mtype].mnr_mnode == mnode) { 1324 return (mtype); 1325 } 1326 return (-1); 1327 } 1328 1329 /* 1330 * Update the page list max counts with the pfn range specified by the 1331 * input parameters. Called from add_physmem() when physical memory with 1332 * page_t's are initially added to the page lists. 1333 */ 1334 void 1335 mtype_modify_max(pfn_t startpfn, long cnt) 1336 { 1337 int mtype = 0; 1338 pfn_t endpfn = startpfn + cnt, pfn; 1339 pgcnt_t inc; 1340 1341 ASSERT(cnt > 0); 1342 1343 if (!physmax4g) 1344 return; 1345 1346 for (pfn = startpfn; pfn < endpfn; ) { 1347 if (pfn <= mnoderanges[mtype].mnr_pfnhi) { 1348 if (endpfn < mnoderanges[mtype].mnr_pfnhi) { 1349 inc = endpfn - pfn; 1350 } else { 1351 inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; 1352 } 1353 if (mtype <= mtype4g) 1354 maxmem4g += inc; 1355 pfn += inc; 1356 } 1357 mtype++; 1358 ASSERT(mtype < mnoderangecnt || pfn >= endpfn); 1359 } 1360 } 1361 1362 int 1363 mtype_2_mrange(int mtype) 1364 { 1365 return (mnoderanges[mtype].mnr_memrange); 1366 } 1367 1368 void 1369 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 1370 { 1371 ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 1372 *pfnlo = mnoderanges[mtype].mnr_pfnlo; 1373 *pfnhi = mnoderanges[mtype].mnr_pfnhi; 1374 } 1375 1376 size_t 1377 plcnt_sz(size_t ctrs_sz) 1378 { 1379 #ifdef DEBUG 1380 int szc, colors; 1381 1382 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 1383 for (szc = 0; szc < mmu_page_sizes; szc++) { 1384 colors = page_get_pagecolors(szc); 1385 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 1386 } 1387 #endif 1388 return (ctrs_sz); 1389 } 1390 1391 caddr_t 1392 plcnt_init(caddr_t addr) 1393 { 1394 #ifdef DEBUG 1395 int mt, szc, colors; 1396 1397 for (mt = 0; mt < mnoderangecnt; mt++) { 1398 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 1399 addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 1400 for (szc = 0; szc < mmu_page_sizes; szc++) { 1401 colors = page_get_pagecolors(szc); 1402 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 1403 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 1404 (pgcnt_t *)addr; 1405 addr += (sizeof (pgcnt_t) * colors); 1406 } 1407 } 1408 #endif 1409 return (addr); 1410 } 1411 1412 void 1413 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 1414 { 1415 #ifdef DEBUG 1416 int bin = PP_2_BIN(pp); 1417 1418 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 1419 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 1420 cnt); 1421 #endif 1422 ASSERT(mtype == PP_2_MTYPE(pp)); 1423 if (physmax4g && mtype <= mtype4g) 1424 atomic_add_long(&freemem4g, cnt); 1425 if (flags & PG_CACHE_LIST) 1426 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 1427 else 1428 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt); 1429 atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt); 1430 } 1431 1432 /* 1433 * Returns the free page count for mnode 1434 */ 1435 int 1436 mnode_pgcnt(int mnode) 1437 { 1438 int mtype = mnoderangecnt - 1; 1439 int flags = 
PGI_MT_RANGE0; 1440 pgcnt_t pgcnt = 0; 1441 1442 mtype = mtype_func(mnode, mtype, flags); 1443 1444 while (mtype != -1) { 1445 pgcnt += MTYPE_FREEMEM(mtype); 1446 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1447 } 1448 return (pgcnt); 1449 } 1450 1451 /* 1452 * Initialize page coloring variables based on the l2 cache parameters. 1453 * Calculate and return memory needed for page coloring data structures. 1454 */ 1455 size_t 1456 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 1457 { 1458 size_t colorsz = 0; 1459 int i; 1460 int colors; 1461 1462 #if defined(__xpv) 1463 /* 1464 * Hypervisor domains currently don't have any concept of NUMA. 1465 * Hence we'll act like there is only 1 memrange. 1466 */ 1467 i = memrange_num(1); 1468 #else /* !__xpv */ 1469 /* 1470 * Reduce the memory ranges lists if we don't have large amounts 1471 * of memory. This avoids searching known empty free lists. 1472 */ 1473 i = memrange_num(physmax); 1474 #if defined(__i386) 1475 if (i > 0) 1476 restricted_kmemalloc = 0; 1477 #endif 1478 /* physmax greater than 4g */ 1479 if (i == 0) 1480 physmax4g = 1; 1481 #endif /* !__xpv */ 1482 memranges += i; 1483 nranges -= i; 1484 1485 ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES); 1486 1487 ASSERT(ISP2(l2_sz)); 1488 ASSERT(ISP2(l2_linesz)); 1489 ASSERT(l2_sz > MMU_PAGESIZE); 1490 1491 /* l2_assoc is 0 for fully associative l2 cache */ 1492 if (l2_assoc) 1493 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 1494 else 1495 l2_colors = 1; 1496 1497 /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 1498 page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 1499 1500 /* 1501 * cpu_page_colors is non-zero when a page color may be spread across 1502 * multiple bins. 1503 */ 1504 if (l2_colors < page_colors) 1505 cpu_page_colors = l2_colors; 1506 1507 ASSERT(ISP2(page_colors)); 1508 1509 page_colors_mask = page_colors - 1; 1510 1511 ASSERT(ISP2(CPUSETSIZE())); 1512 page_coloring_shift = lowbit(CPUSETSIZE()); 1513 1514 /* initialize number of colors per page size */ 1515 for (i = 0; i <= mmu.max_page_level; i++) { 1516 hw_page_array[i].hp_size = LEVEL_SIZE(i); 1517 hw_page_array[i].hp_shift = LEVEL_SHIFT(i); 1518 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0); 1519 hw_page_array[i].hp_colors = (page_colors_mask >> 1520 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift)) 1521 + 1; 1522 colorequivszc[i] = 0; 1523 } 1524 1525 /* 1526 * The value of cpu_page_colors determines if additional color bins 1527 * need to be checked for a particular color in the page_get routines. 1528 */ 1529 if (cpu_page_colors != 0) { 1530 1531 int a = lowbit(page_colors) - lowbit(cpu_page_colors); 1532 ASSERT(a > 0); 1533 ASSERT(a < 16); 1534 1535 for (i = 0; i <= mmu.max_page_level; i++) { 1536 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1537 colorequivszc[i] = 0; 1538 continue; 1539 } 1540 while ((colors >> a) == 0) 1541 a--; 1542 ASSERT(a >= 0); 1543 1544 /* higher 4 bits encodes color equiv mask */ 1545 colorequivszc[i] = (a << 4); 1546 } 1547 } 1548 1549 /* factor in colorequiv to check additional 'equivalent' bins. 
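	 *
	 * Illustrative example of the (a << 4) encoding used above and below:
	 * with page_colors == 16 and cpu_page_colors == 4, a == lowbit(16) -
	 * lowbit(4) == 2, so colorequivszc[0] becomes 0x20, i.e. the low two
	 * color bits are treated as equivalent when searching bins.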
*/ 1550 if (colorequiv > 1) { 1551 1552 int a = lowbit(colorequiv) - 1; 1553 if (a > 15) 1554 a = 15; 1555 1556 for (i = 0; i <= mmu.max_page_level; i++) { 1557 if ((colors = hw_page_array[i].hp_colors) <= 1) { 1558 continue; 1559 } 1560 while ((colors >> a) == 0) 1561 a--; 1562 if ((a << 4) > colorequivszc[i]) { 1563 colorequivszc[i] = (a << 4); 1564 } 1565 } 1566 } 1567 1568 /* size for mnoderanges */ 1569 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) 1570 mnoderangecnt += mnode_range_cnt(i); 1571 colorsz = mnoderangecnt * sizeof (mnoderange_t); 1572 1573 /* size for fpc_mutex and cpc_mutex */ 1574 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 1575 1576 /* size of page_freelists */ 1577 colorsz += mnoderangecnt * sizeof (page_t ***); 1578 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 1579 1580 for (i = 0; i < mmu_page_sizes; i++) { 1581 colors = page_get_pagecolors(i); 1582 colorsz += mnoderangecnt * colors * sizeof (page_t *); 1583 } 1584 1585 /* size of page_cachelists */ 1586 colorsz += mnoderangecnt * sizeof (page_t **); 1587 colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 1588 1589 return (colorsz); 1590 } 1591 1592 /* 1593 * Called once at startup to configure page_coloring data structures and 1594 * does the 1st page_free()/page_freelist_add(). 1595 */ 1596 void 1597 page_coloring_setup(caddr_t pcmemaddr) 1598 { 1599 int i; 1600 int j; 1601 int k; 1602 caddr_t addr; 1603 int colors; 1604 1605 /* 1606 * do page coloring setup 1607 */ 1608 addr = pcmemaddr; 1609 1610 mnoderanges = (mnoderange_t *)addr; 1611 addr += (mnoderangecnt * sizeof (mnoderange_t)); 1612 1613 mnode_range_setup(mnoderanges); 1614 1615 if (physmax4g) 1616 mtype4g = pfn_2_mtype(0xfffff); 1617 1618 for (k = 0; k < NPC_MUTEX; k++) { 1619 fpc_mutex[k] = (kmutex_t *)addr; 1620 addr += (max_mem_nodes * sizeof (kmutex_t)); 1621 } 1622 for (k = 0; k < NPC_MUTEX; k++) { 1623 cpc_mutex[k] = (kmutex_t *)addr; 1624 addr += (max_mem_nodes * sizeof (kmutex_t)); 1625 } 1626 page_freelists = (page_t ****)addr; 1627 addr += (mnoderangecnt * sizeof (page_t ***)); 1628 1629 page_cachelists = (page_t ***)addr; 1630 addr += (mnoderangecnt * sizeof (page_t **)); 1631 1632 for (i = 0; i < mnoderangecnt; i++) { 1633 page_freelists[i] = (page_t ***)addr; 1634 addr += (mmu_page_sizes * sizeof (page_t **)); 1635 1636 for (j = 0; j < mmu_page_sizes; j++) { 1637 colors = page_get_pagecolors(j); 1638 page_freelists[i][j] = (page_t **)addr; 1639 addr += (colors * sizeof (page_t *)); 1640 } 1641 page_cachelists[i] = (page_t **)addr; 1642 addr += (page_colors * sizeof (page_t *)); 1643 } 1644 } 1645 1646 #if defined(__xpv) 1647 /* 1648 * Give back 10% of the io_pool pages to the free list. 1649 * Don't shrink the pool below some absolute minimum. 1650 */ 1651 static void 1652 page_io_pool_shrink() 1653 { 1654 int retcnt; 1655 page_t *pp, *pp_first, *pp_last, **curpool; 1656 mfn_t mfn; 1657 int bothpools = 0; 1658 1659 mutex_enter(&io_pool_lock); 1660 io_pool_shrink_attempts++; /* should be a kstat? */ 1661 retcnt = io_pool_cnt / 10; 1662 if (io_pool_cnt - retcnt < io_pool_cnt_min) 1663 retcnt = io_pool_cnt - io_pool_cnt_min; 1664 if (retcnt <= 0) 1665 goto done; 1666 io_pool_shrinks++; /* should be a kstat? */ 1667 curpool = &io_pool_4g; 1668 domore: 1669 /* 1670 * Loop through taking pages from the end of the list 1671 * (highest mfns) till amount to return reached. 
1672 */ 1673 for (pp = *curpool; pp && retcnt > 0; ) { 1674 pp_first = pp_last = pp->p_prev; 1675 if (pp_first == *curpool) 1676 break; 1677 retcnt--; 1678 io_pool_cnt--; 1679 page_io_pool_sub(curpool, pp_first, pp_last); 1680 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn) 1681 start_mfn = mfn; 1682 page_free(pp_first, 1); 1683 pp = *curpool; 1684 } 1685 if (retcnt != 0 && !bothpools) { 1686 /* 1687 * If not enough found in less constrained pool try the 1688 * more constrained one. 1689 */ 1690 curpool = &io_pool_16m; 1691 bothpools = 1; 1692 goto domore; 1693 } 1694 done: 1695 mutex_exit(&io_pool_lock); 1696 } 1697 1698 #endif /* __xpv */ 1699 1700 uint_t 1701 page_create_update_flags_x86(uint_t flags) 1702 { 1703 #if defined(__xpv) 1704 /* 1705 * Check this is an urgent allocation and free pages are depleted. 1706 */ 1707 if (!(flags & PG_WAIT) && freemem < desfree) 1708 page_io_pool_shrink(); 1709 #else /* !__xpv */ 1710 /* 1711 * page_create_get_something may call this because 4g memory may be 1712 * depleted. Set flags to allow for relocation of base page below 1713 * 4g if necessary. 1714 */ 1715 if (physmax4g) 1716 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1717 #endif /* __xpv */ 1718 return (flags); 1719 } 1720 1721 /*ARGSUSED*/ 1722 int 1723 bp_color(struct buf *bp) 1724 { 1725 return (0); 1726 } 1727 1728 #if defined(__xpv) 1729 1730 /* 1731 * Take pages out of an io_pool 1732 */ 1733 static void 1734 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last) 1735 { 1736 if (*poolp == pp_first) { 1737 *poolp = pp_last->p_next; 1738 if (*poolp == pp_first) 1739 *poolp = NULL; 1740 } 1741 pp_first->p_prev->p_next = pp_last->p_next; 1742 pp_last->p_next->p_prev = pp_first->p_prev; 1743 pp_first->p_prev = pp_last; 1744 pp_last->p_next = pp_first; 1745 } 1746 1747 /* 1748 * Put a page on the io_pool list. The list is ordered by increasing MFN. 1749 */ 1750 static void 1751 page_io_pool_add(page_t **poolp, page_t *pp) 1752 { 1753 page_t *look; 1754 mfn_t mfn = mfn_list[pp->p_pagenum]; 1755 1756 if (*poolp == NULL) { 1757 *poolp = pp; 1758 pp->p_next = pp; 1759 pp->p_prev = pp; 1760 return; 1761 } 1762 1763 /* 1764 * Since we try to take pages from the high end of the pool 1765 * chances are good that the pages to be put on the list will 1766 * go at or near the end of the list. so start at the end and 1767 * work backwards. 1768 */ 1769 look = (*poolp)->p_prev; 1770 while (mfn < mfn_list[look->p_pagenum]) { 1771 look = look->p_prev; 1772 if (look == (*poolp)->p_prev) 1773 break; /* backed all the way to front of list */ 1774 } 1775 1776 /* insert after look */ 1777 pp->p_prev = look; 1778 pp->p_next = look->p_next; 1779 pp->p_next->p_prev = pp; 1780 look->p_next = pp; 1781 if (mfn < mfn_list[(*poolp)->p_pagenum]) { 1782 /* 1783 * we inserted a new first list element 1784 * adjust pool pointer to newly inserted element 1785 */ 1786 *poolp = pp; 1787 } 1788 } 1789 1790 /* 1791 * Add a page to the io_pool. Setting the force flag will force the page 1792 * into the io_pool no matter what. 
1793 */ 1794 static void 1795 add_page_to_pool(page_t *pp, int force) 1796 { 1797 page_t *highest; 1798 page_t *freep = NULL; 1799 1800 mutex_enter(&io_pool_lock); 1801 /* 1802 * Always keep the scarce low memory pages 1803 */ 1804 if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 1805 ++io_pool_cnt; 1806 page_io_pool_add(&io_pool_16m, pp); 1807 goto done; 1808 } 1809 if (io_pool_cnt < io_pool_cnt_max || force) { 1810 ++io_pool_cnt; 1811 page_io_pool_add(&io_pool_4g, pp); 1812 } else { 1813 highest = io_pool_4g->p_prev; 1814 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 1815 page_io_pool_sub(&io_pool_4g, highest, highest); 1816 page_io_pool_add(&io_pool_4g, pp); 1817 freep = highest; 1818 } else { 1819 freep = pp; 1820 } 1821 } 1822 done: 1823 mutex_exit(&io_pool_lock); 1824 if (freep) 1825 page_free(freep, 1); 1826 } 1827 1828 1829 int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 1830 int contig_pfn_max; /* capacity of the contig pfn list */ 1831 int next_alloc_pfn; /* next position in list to start a contig search */ 1832 int contig_pfnlist_updates; /* pfn list update count */ 1833 int contig_pfnlist_builds; /* how many times have we (re)built list */ 1834 int contig_pfnlist_buildfailed; /* how many times has list build failed */ 1835 int create_contig_pending; /* nonzero means taskq creating contig list */ 1836 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 1837 1838 /* 1839 * Function to use in sorting a list of pfns by their underlying mfns. 1840 */ 1841 static int 1842 mfn_compare(const void *pfnp1, const void *pfnp2) 1843 { 1844 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1]; 1845 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2]; 1846 1847 if (mfn1 > mfn2) 1848 return (1); 1849 if (mfn1 < mfn2) 1850 return (-1); 1851 return (0); 1852 } 1853 1854 /* 1855 * Compact the contig_pfn_list by tossing all the non-contiguous 1856 * elements from the list. 1857 */ 1858 static void 1859 compact_contig_pfn_list(void) 1860 { 1861 pfn_t pfn, lapfn, prev_lapfn; 1862 mfn_t mfn; 1863 int i, newcnt = 0; 1864 1865 prev_lapfn = 0; 1866 for (i = 0; i < contig_pfn_cnt - 1; i++) { 1867 pfn = contig_pfn_list[i]; 1868 lapfn = contig_pfn_list[i + 1]; 1869 mfn = mfn_list[pfn]; 1870 /* 1871 * See if next pfn is for a contig mfn 1872 */ 1873 if (mfn_list[lapfn] != mfn + 1) 1874 continue; 1875 /* 1876 * pfn and lookahead are both put in list 1877 * unless pfn is the previous lookahead. 1878 */ 1879 if (pfn != prev_lapfn) 1880 contig_pfn_list[newcnt++] = pfn; 1881 contig_pfn_list[newcnt++] = lapfn; 1882 prev_lapfn = lapfn; 1883 } 1884 for (i = newcnt; i < contig_pfn_cnt; i++) 1885 contig_pfn_list[i] = 0; 1886 contig_pfn_cnt = newcnt; 1887 } 1888 1889 /*ARGSUSED*/ 1890 static void 1891 call_create_contiglist(void *arg) 1892 { 1893 (void) create_contig_pfnlist(PG_WAIT); 1894 } 1895 1896 /* 1897 * Create list of freelist pfns that have underlying 1898 * contiguous mfns. The list is kept in ascending mfn order. 1899 * returns 1 if list created else 0. 1900 */ 1901 static int 1902 create_contig_pfnlist(uint_t flags) 1903 { 1904 pfn_t pfn; 1905 page_t *pp; 1906 int ret = 1; 1907 1908 mutex_enter(&contig_list_lock); 1909 if (contig_pfn_list != NULL) 1910 goto out; 1911 contig_pfn_max = freemem + (freemem / 10); 1912 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t), 1913 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP); 1914 if (contig_pfn_list == NULL) { 1915 /* 1916 * If we could not create the contig list (because 1917 * we could not sleep for memory). 
Dispatch a taskq that can 1918 * sleep to get the memory. 1919 */ 1920 if (!create_contig_pending) { 1921 if (taskq_dispatch(system_taskq, call_create_contiglist, 1922 NULL, TQ_NOSLEEP) != NULL) 1923 create_contig_pending = 1; 1924 } 1925 contig_pfnlist_buildfailed++; /* count list build failures */ 1926 ret = 0; 1927 goto out; 1928 } 1929 create_contig_pending = 0; 1930 ASSERT(contig_pfn_cnt == 0); 1931 for (pfn = 0; pfn < mfn_count; pfn++) { 1932 pp = page_numtopp_nolock(pfn); 1933 if (pp == NULL || !PP_ISFREE(pp)) 1934 continue; 1935 contig_pfn_list[contig_pfn_cnt] = pfn; 1936 if (++contig_pfn_cnt == contig_pfn_max) 1937 break; 1938 } 1939 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 1940 compact_contig_pfn_list(); 1941 /* 1942 * Make sure next search of the newly created contiguous pfn 1943 * list starts at the beginning of the list. 1944 */ 1945 next_alloc_pfn = 0; 1946 contig_pfnlist_builds++; /* count list builds */ 1947 out: 1948 mutex_exit(&contig_list_lock); 1949 return (ret); 1950 } 1951 1952 1953 /* 1954 * Toss the current contig pfnlist. Someone is about to do a massive 1955 * update to pfn<->mfn mappings. So we have them destroy the list and lock 1956 * it till they are done with their update. 1957 */ 1958 void 1959 clear_and_lock_contig_pfnlist() 1960 { 1961 pfn_t *listp = NULL; 1962 size_t listsize; 1963 1964 mutex_enter(&contig_list_lock); 1965 if (contig_pfn_list != NULL) { 1966 listp = contig_pfn_list; 1967 listsize = contig_pfn_max * sizeof (pfn_t); 1968 contig_pfn_list = NULL; 1969 contig_pfn_max = contig_pfn_cnt = 0; 1970 } 1971 if (listp != NULL) 1972 kmem_free(listp, listsize); 1973 } 1974 1975 /* 1976 * Unlock the contig_pfn_list. The next attempted use of it will cause 1977 * it to be re-created. 1978 */ 1979 void 1980 unlock_contig_pfnlist() 1981 { 1982 mutex_exit(&contig_list_lock); 1983 } 1984 1985 /* 1986 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 1987 */ 1988 void 1989 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 1990 { 1991 int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 1992 pfn_t probe_pfn; 1993 mfn_t probe_mfn; 1994 int drop_lock = 0; 1995 1996 if (mutex_owner(&contig_list_lock) != curthread) { 1997 drop_lock = 1; 1998 mutex_enter(&contig_list_lock); 1999 } 2000 if (contig_pfn_list == NULL) 2001 goto done; 2002 contig_pfnlist_updates++; 2003 /* 2004 * Find the pfn in the current list. Use a binary chop to locate it. 
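	 *
	 * Note that the list is sorted by underlying mfn (see mfn_compare()),
	 * not by pfn, which is why the probes below compare
	 * pfn_to_mfn(probe_pfn) against oldmfn rather than comparing pfn
	 * values directly.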
2005 */ 2006 probe_hi = contig_pfn_cnt - 1; 2007 probe_lo = 0; 2008 probe_pos = (probe_hi + probe_lo) / 2; 2009 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 2010 if (probe_pos == probe_lo) { /* pfn not in list */ 2011 probe_pos = -1; 2012 break; 2013 } 2014 if (pfn_to_mfn(probe_pfn) <= oldmfn) 2015 probe_lo = probe_pos; 2016 else 2017 probe_hi = probe_pos; 2018 probe_pos = (probe_hi + probe_lo) / 2; 2019 } 2020 if (probe_pos >= 0) { /* remove pfn fom list */ 2021 contig_pfn_cnt--; 2022 ovbcopy(&contig_pfn_list[probe_pos + 1], 2023 &contig_pfn_list[probe_pos], 2024 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 2025 } 2026 if (newmfn == MFN_INVALID) 2027 goto done; 2028 /* 2029 * Check if new mfn has adjacent mfns in the list 2030 */ 2031 probe_hi = contig_pfn_cnt - 1; 2032 probe_lo = 0; 2033 insert_after = -2; 2034 do { 2035 probe_pos = (probe_hi + probe_lo) / 2; 2036 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 2037 if (newmfn == probe_mfn + 1) 2038 insert_after = probe_pos; 2039 else if (newmfn == probe_mfn - 1) 2040 insert_after = probe_pos - 1; 2041 if (probe_pos == probe_lo) 2042 break; 2043 if (probe_mfn <= newmfn) 2044 probe_lo = probe_pos; 2045 else 2046 probe_hi = probe_pos; 2047 } while (insert_after == -2); 2048 /* 2049 * If there is space in the list and there are adjacent mfns 2050 * insert the pfn in to its proper place in the list. 2051 */ 2052 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 2053 insert_point = insert_after + 1; 2054 ovbcopy(&contig_pfn_list[insert_point], 2055 &contig_pfn_list[insert_point + 1], 2056 (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 2057 contig_pfn_list[insert_point] = pfn; 2058 contig_pfn_cnt++; 2059 } 2060 done: 2061 if (drop_lock) 2062 mutex_exit(&contig_list_lock); 2063 } 2064 2065 /* 2066 * Called to (re-)populate the io_pool from the free page lists. 2067 */ 2068 long 2069 populate_io_pool(void) 2070 { 2071 pfn_t pfn; 2072 mfn_t mfn, max_mfn; 2073 page_t *pp; 2074 2075 /* 2076 * Figure out the bounds of the pool on first invocation. 2077 * We use a percentage of memory for the io pool size. 2078 * we allow that to shrink, but not to less than a fixed minimum 2079 */ 2080 if (io_pool_cnt_max == 0) { 2081 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 2082 io_pool_cnt_lowater = io_pool_cnt_max; 2083 /* 2084 * This is the first time in populate_io_pool, grab a va to use 2085 * when we need to allocate pages. 2086 */ 2087 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 2088 } 2089 /* 2090 * If we are out of pages in the pool, then grow the size of the pool 2091 */ 2092 if (io_pool_cnt == 0) 2093 io_pool_cnt_max += io_pool_cnt_max / 20; /* grow by 5% */ 2094 io_pool_grows++; /* should be a kstat? */ 2095 2096 /* 2097 * Get highest mfn on this platform, but limit to the 32 bit DMA max. 2098 */ 2099 (void) mfn_to_pfn(start_mfn); 2100 max_mfn = MIN(cached_max_mfn, PFN_4GIG); 2101 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 2102 pfn = mfn_to_pfn(mfn); 2103 if (pfn & PFN_IS_FOREIGN_MFN) 2104 continue; 2105 /* 2106 * try to allocate it from free pages 2107 */ 2108 pp = page_numtopp_alloc(pfn); 2109 if (pp == NULL) 2110 continue; 2111 PP_CLRFREE(pp); 2112 add_page_to_pool(pp, 1); 2113 if (io_pool_cnt >= io_pool_cnt_max) 2114 break; 2115 } 2116 2117 return (io_pool_cnt); 2118 } 2119 2120 /* 2121 * Destroy a page that was being used for DMA I/O. It may or 2122 * may not actually go back to the io_pool. 
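 *
 * As the code below shows, domU pages and pages whose mfn is at or above
 * 4G always go back to the regular free lists; only dom0 pages below 4G
 * are offered back to the io_pool, and even then add_page_to_pool() may
 * decline them when the pool is already full.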
/*
 * Destroy a page that was being used for DMA I/O.  It may or
 * may not actually go back to the io_pool.
 */
void
page_destroy_io(page_t *pp)
{
	mfn_t mfn = mfn_list[pp->p_pagenum];

	/*
	 * When the page was alloc'd, a reservation was made; release it now.
	 */
	page_unresv(1);
	/*
	 * Unload translations, if any, then hash out the
	 * page to erase its identity.
	 */
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	page_hashout(pp, NULL);

	/*
	 * If the page came from the free lists, just put it back to them.
	 * DomU pages always go on the free lists as well.
	 */
	if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
		page_free(pp, 1);
		return;
	}

	add_page_to_pool(pp, 0);
}


long contig_searches;		/* count of times contig pages requested */
long contig_search_restarts;	/* count of contig ranges tried */
long contig_search_failed;	/* count of contig alloc failures */

/*
 * Look through the contiguous pfns that are not part of the io_pool for
 * contiguous free pages.  Return a list of the found pages or NULL.
 */
page_t *
find_contig_free(uint_t bytes, uint_t flags)
{
	page_t *pp, *plist = NULL;
	mfn_t mfn, prev_mfn;
	pfn_t pfn;
	int pages_needed, pages_requested;
	int search_start;

	/*
	 * create the contig pfn list if not already done
	 */
retry:
	mutex_enter(&contig_list_lock);
	if (contig_pfn_list == NULL) {
		mutex_exit(&contig_list_lock);
		if (!create_contig_pfnlist(flags)) {
			return (NULL);
		}
		goto retry;
	}
	contig_searches++;
	/*
	 * Search contiguous pfn list for physically contiguous pages not in
	 * the io_pool.  Start the search where the last search left off.
	 */
	pages_requested = pages_needed = mmu_btop(bytes);
	search_start = next_alloc_pfn;
	prev_mfn = 0;
	while (pages_needed) {
		pfn = contig_pfn_list[next_alloc_pfn];
		mfn = pfn_to_mfn(pfn);
		if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
		    (pp = page_numtopp_alloc(pfn)) != NULL) {
			PP_CLRFREE(pp);
			page_io_pool_add(&plist, pp);
			pages_needed--;
			prev_mfn = mfn;
		} else {
			contig_search_restarts++;
			/*
			 * free partial page list
			 */
			while (plist != NULL) {
				pp = plist;
				page_io_pool_sub(&plist, pp, pp);
				page_free(pp, 1);
			}
			pages_needed = pages_requested;
			prev_mfn = 0;
		}
		if (++next_alloc_pfn == contig_pfn_cnt)
			next_alloc_pfn = 0;
		if (next_alloc_pfn == search_start)
			break; /* all pfns searched */
	}
	mutex_exit(&contig_list_lock);
	if (pages_needed) {
		contig_search_failed++;
		/*
		 * Failed to find enough contig pages.
		 * free partial page list
		 */
		while (plist != NULL) {
			pp = plist;
			page_io_pool_sub(&plist, pp, pp);
			page_free(pp, 1);
		}
	}
	return (plist);
}

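/*
 * Illustrative sketch of a caller of page_create_io() below (hypothetical
 * values, not taken from any real driver).  Only the DMA attributes that
 * this allocator actually inspects are shown:
 *
 *	ddi_dma_attr_t attr;
 *
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffffffULL;	(32 bit DMA limited device)
 *	attr.dma_attr_align = MMU_PAGESIZE;	(no extra alignment needed)
 *	attr.dma_attr_minxfer = 1;
 *	attr.dma_attr_seg = 0xffffffffULL;	(must not cross a 4G boundary)
 *
 *	pp = page_create_io(&kvp, off, len, PG_WAIT | PG_PHYSCONTIG,
 *	    &kas, vaddr, &attr);
 */
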
/*
 * Allocator for domain 0 I/O pages. We match the required
 * DMA attributes and contiguity constraints.
 */
/*ARGSUSED*/
page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)
{
	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
	page_t	*pp_first;	/* list to return */
	page_t	*pp_last;	/* last in list to return */
	page_t	*pp, **poolp, **pplist = NULL, *expp;
	int	i, extpages = 0, npages = 0, contig, anyaddr, extra;
	mfn_t	lo_mfn;
	mfn_t	hi_mfn;
	mfn_t	mfn, tmfn;
	mfn_t	*mfnlist = 0;
	pgcnt_t	pfnalign = 0;
	int	align, order, nbits, extents;
	uint64_t pfnseg;
	int	attempt = 0, is_domu = 0;
	int	asked_hypervisor = 0;
	uint_t	kflags;

	ASSERT(mattr != NULL);
	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
	if (align > MMU_PAGESIZE)
		pfnalign = mmu_btop(align);
	pfnseg = mmu_btop(mattr->dma_attr_seg);

	/*
	 * Clear the contig flag if only one page is needed.
	 */
	contig = (flags & PG_PHYSCONTIG);
	flags &= ~PG_PHYSCONTIG;
	bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
	if (bytes == MMU_PAGESIZE)
		contig = 0;

	/*
	 * Check if any old page in the system is fine.
	 * DomU should always go down this path.
	 */
	is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
	if ((!contig && anyaddr) || is_domu) {
		pp = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
		if (pp)
			return (pp);
		else if (is_domu)
			return (NULL); /* no memory available */
	}
	/*
	 * DomU should never reach here
	 */
try_again:
	/*
	 * We may just want contiguous pages with no address constraints.
	 */
	if (anyaddr && contig && pfnseg >= max_mfn) {
		/*
		 * Look for free contig pages to satisfy the request.
		 */
		pp_first = find_contig_free(bytes, flags);
		if (pp_first != NULL)
			goto done;
	}
	/*
	 * See if we want pages for a legacy device
	 */
	if (hi_mfn < PFN_16MEG)
		poolp = &io_pool_16m;
	else
		poolp = &io_pool_4g;
try_smaller:
	/*
	 * Take pages from I/O pool. We'll use pages from the highest MFN
	 * range possible.
	 */
	pp_first = pp_last = NULL;
	npages = mmu_btop(bytes);
	mutex_enter(&io_pool_lock);
	for (pp = *poolp; pp && npages > 0; ) {
		pp = pp->p_prev;

		/*
		 * skip pages above allowable range
		 */
		mfn = mfn_list[pp->p_pagenum];
		if (hi_mfn < mfn)
			goto skip;

		/*
		 * stop at pages below allowable range
		 */
		if (lo_mfn > mfn)
			break;
restart:
		if (pp_last == NULL) {
			/*
			 * Check alignment
			 */
			tmfn = mfn - (npages - 1);
			if (pfnalign) {
				if (tmfn != P2ROUNDUP(tmfn, pfnalign))
					goto skip; /* not properly aligned */
			}
			/*
			 * Check segment
			 */
			if ((mfn & pfnseg) < (tmfn & pfnseg))
				goto skip; /* crosses segment boundary */
			/*
			 * Start building page list
			 */
			pp_first = pp_last = pp;
			npages--;
		} else {
			/*
			 * check physical contiguity if required
			 */
			if (contig &&
			    mfn_list[pp_first->p_pagenum] != mfn + 1) {
				/*
				 * not a contiguous page, restart list.
				 */
				pp_last = NULL;
				npages = mmu_btop(bytes);
				goto restart;
			} else { /* add page to list */
				pp_first = pp;
				--npages;
			}
		}
skip:
		if (pp == *poolp)
			break;
	}

	/*
	 * If we didn't find memory, try the more constrained pool, then
	 * sweep free pages into the DMA pool and try again.  If we fail
	 * repeatedly, ask the Hypervisor for help.
	 */
	if (npages != 0) {
		mutex_exit(&io_pool_lock);
		/*
		 * If we were looking in the less constrained pool and didn't
		 * find pages, try the more constrained pool.
		 */
		if (poolp == &io_pool_4g) {
			poolp = &io_pool_16m;
			goto try_smaller;
		}
		kmem_reap();
		if (++attempt < 4) {
			/*
			 * Grab some more io_pool pages
			 */
			(void) populate_io_pool();
			goto try_again;
		}

		if (asked_hypervisor++)
			return (NULL);	/* really out of luck */
		/*
		 * Hypervisor exchange doesn't handle segment or alignment
		 * constraints
		 */
		if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || pfnalign)
			return (NULL);
		/*
		 * Try exchanging pages with the hypervisor.
		 */
		npages = mmu_btop(bytes);
		kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
		/*
		 * The hypervisor will allocate extents; if we want contig
		 * pages, the extent must be >= npages.
		 */
		if (contig) {
			order = highbit(npages) - 1;
			if (npages & ((1 << order) - 1))
				order++;
			extpages = 1 << order;
		} else {
			order = 0;
			extpages = npages;
		}
		if (extpages > npages) {
			extra = extpages - npages;
			if (!page_resv(extra, kflags))
				return (NULL);
		}
		pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
		if (pplist == NULL)
			goto fail;
		mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
		if (mfnlist == NULL)
			goto fail;
		pp = page_create_va(vp, off, npages * PAGESIZE, flags,
		    &kvseg, vaddr);
		if (pp == NULL)
			goto fail;
		pp_first = pp;
		if (extpages > npages) {
			/*
			 * fill out the rest of extent pages to swap with the
			 * hypervisor
			 */
			for (i = 0; i < extra; i++) {
				expp = page_create_va(vp,
				    (u_offset_t)(uintptr_t)io_pool_kva,
				    PAGESIZE, flags, &kvseg, io_pool_kva);
				if (expp == NULL)
					goto balloon_fail;
				(void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
				page_io_unlock(expp);
				page_hashout(expp, NULL);
				page_io_lock(expp);
				/*
				 * add page to end of list
				 */
				expp->p_prev = pp_first->p_prev;
				expp->p_next = pp_first;
				expp->p_prev->p_next = expp;
				pp_first->p_prev = expp;
			}

		}
		for (i = 0; i < extpages; i++) {
			pplist[i] = pp;
			pp = pp->p_next;
		}
		nbits = highbit(mattr->dma_attr_addr_hi);
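		/*
		 * balloon_replace_pages() below is handed our extpages pages
		 * in pplist and asked for replacement machine pages below
		 * 2^nbits.  For a physically contiguous request we pass a
		 * single extent of 2^order pages (extpages was rounded up to
		 * a power of two above); otherwise every page is its own
		 * order-0 extent.  (Descriptive note; see balloon_impl.h for
		 * the authoritative interface.)
		 */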
		extents = contig ? 1 : npages;
		if (balloon_replace_pages(extents, pplist, nbits, order,
		    mfnlist) != extents) {
			if (ioalloc_dbg)
				cmn_err(CE_NOTE, "request to hypervisor for"
				    " %d pages, maxaddr %" PRIx64 " failed",
				    extpages, mattr->dma_attr_addr_hi);
			goto balloon_fail;
		}

		kmem_free(pplist, extpages * sizeof (page_t *));
		kmem_free(mfnlist, extpages * sizeof (mfn_t));
		/*
		 * Return any excess pages to free list
		 */
		if (extpages > npages) {
			for (i = 0; i < extra; i++) {
				pp = pp_first->p_prev;
				page_sub(&pp_first, pp);
				page_io_unlock(pp);
				page_unresv(1);
				page_free(pp, 1);
			}
		}
		check_dma(mattr, pp_first, mmu_btop(bytes));
		return (pp_first);
	}

	/*
	 * Found the pages, now snip them from the list
	 */
	page_io_pool_sub(poolp, pp_first, pp_last);
	io_pool_cnt -= mmu_btop(bytes);
	if (io_pool_cnt < io_pool_cnt_lowater)
		io_pool_cnt_lowater = io_pool_cnt; /* io pool low water mark */
	mutex_exit(&io_pool_lock);
done:
	check_dma(mattr, pp_first, mmu_btop(bytes));
	pp = pp_first;
	do {
		if (!page_hashin(pp, vp, off, NULL)) {
			panic("pg_create_io: hashin failed pp %p, vp %p,"
			    " off %llx",
			    (void *)pp, (void *)vp, off);
		}
		off += MMU_PAGESIZE;
		PP_CLRFREE(pp);
		PP_CLRAGED(pp);
		page_set_props(pp, P_REF);
		page_io_lock(pp);
		pp = pp->p_next;
	} while (pp != pp_first);
	return (pp_first);
balloon_fail:
	/*
	 * Return pages to free list and return failure
	 */
	while (pp_first != NULL) {
		pp = pp_first;
		page_sub(&pp_first, pp);
		page_io_unlock(pp);
		if (pp->p_vnode != NULL)
			page_hashout(pp, NULL);
		page_free(pp, 1);
	}
fail:
	if (pplist)
		kmem_free(pplist, extpages * sizeof (page_t *));
	if (mfnlist)
		kmem_free(mfnlist, extpages * sizeof (mfn_t));
	page_unresv(extpages - npages);
	return (NULL);
}

/*
 * Lock and return the page with the highest mfn that we can find.  last_mfn
 * holds the last one found, so the next search can start from there.  We
 * also keep a counter so that we don't loop forever if the machine has no
 * free pages.
 *
 * This is called from the balloon thread to find pages to give away.  new_high
 * is used when new mfns have been added to the system; we will reset our
 * search if the new mfns are higher than our current search position.
 */
page_t *
page_get_high_mfn(mfn_t new_high)
{
	static mfn_t last_mfn = 0;
	pfn_t pfn;
	page_t *pp;
	ulong_t loop_count = 0;

	if (new_high > last_mfn)
		last_mfn = new_high;

	for (; loop_count < mfn_count; loop_count++, last_mfn--) {
		if (last_mfn == 0) {
			last_mfn = cached_max_mfn;
		}

		pfn = mfn_to_pfn(last_mfn);
		if (pfn & PFN_IS_FOREIGN_MFN)
			continue;

		/* See if the page is free.  If so, lock it. */
		pp = page_numtopp_alloc(pfn);
		if (pp == NULL)
			continue;
		PP_CLRFREE(pp);

		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(pp));
		last_mfn--;
		return (pp);
	}
	return (NULL);
}

#else /* !__xpv */

/*
 * get a page from any list with the given mnode
 */
static page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
    int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint64_t	pgaddr;
	ulong_t		bin;
	int		mtypestart;
	int		plw_initialized;
	page_list_walker_t plw;

	VM_STAT_ADD(pga_vmstats.pgma_alloc);

	ASSERT((flags & PG_MATCH_COLOR) == 0);
	ASSERT(szc == 0);
	ASSERT(dma_attr != NULL);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {
		VM_STAT_ADD(pga_vmstats.pgma_allocempty);
		return (NULL);
	}

	mtypestart = mtype;

	bin = origbin;

	/*
	 * check up to page_colors + 1 bins - origbin may be checked twice
	 * because of BIN_STEP skip
	 */
	do {
		plw_initialized = 0;

		for (plw.plw_count = 0;
		    plw.plw_count < page_colors; plw.plw_count++) {

			if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
				goto nextfreebin;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp) {
						pp = NULL;
					}
					continue;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				/* check if page within DMA attributes */
				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;

			}
			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_FREELISTS(mnode, szc, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

				if ((PP_ISFREE(pp) == 0) ||
				    (PP_ISAGED(pp) == 0)) {
					cmn_err(CE_PANIC, "page %p is not free",
					    (void *)pp);
				}

				mutex_exit(pcm);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextfreebin:
			if (plw_initialized == 0) {
				page_list_walk_init(szc, 0, bin, 1, 0, &plw);
				ASSERT(plw.plw_ceq_dif == page_colors);
				plw_initialized = 1;
			}

			if (plw.plw_do_split) {
				pp = page_freelist_split(szc, bin, mnode,
				    mtype,
				    mmu_btop(dma_attr->dma_attr_addr_hi + 1),
				    &plw);
				if (pp != NULL)
					return (pp);
			}

			bin = page_list_walk_next_bin(szc, bin, &plw);
		}

		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	/* failed to find a page in the freelist; try it in the cachelist */

	/* reset mtype start for cachelist search */
	mtype = mtypestart;
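	/*
	 * The cachelist pass below applies the same physical address test
	 * as the freelist pass above: a page is usable only if the range
	 * [pgaddr, pgaddr + MMU_PAGESIZE - 1] lies entirely within
	 * [dma_attr_addr_lo, dma_attr_addr_hi].
	 */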
	ASSERT(mtype >= 0);

	/* start with the bin of matching color */
	bin = origbin;

	do {
		for (i = 0; i <= page_colors; i++) {
			if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
				goto nextcachebin;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			first_pp = pp;
			while (pp != NULL) {
				if (page_trylock(pp, SE_EXCL) == 0) {
					pp = pp->p_next;
					if (pp == first_pp)
						break;
					continue;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* check if page within DMA attributes */

				pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
				if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
				    (pgaddr + MMU_PAGESIZE - 1 <=
				    dma_attr->dma_attr_addr_hi)) {
					break;
				}

				/* continue looking */
				page_unlock(pp);
				pp = pp->p_next;
				if (pp == first_pp)
					pp = NULL;
			}

			if (pp != NULL) {
				ASSERT(mtype == PP_2_MTYPE(pp));
				ASSERT(pp->p_szc == 0);

				/* found a page with specified DMA attributes */
				page_sub(&PAGE_CACHELISTS(mnode, bin,
				    mtype), pp);
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);

				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				check_dma(dma_attr, pp, 1);
				VM_STAT_ADD(pga_vmstats.pgma_allocok);
				return (pp);
			}
			mutex_exit(pcm);
nextcachebin:
			bin += (i == 0) ? BIN_STEP : 1;
			bin &= page_colors_mask;
		}
		MTYPE_NEXT(mnode, mtype, flags);
	} while (mtype >= 0);

	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
	return (NULL);
}

/*
 * This function is similar to page_get_freelist()/page_get_cachelist()
 * but it searches both the lists to find a page with the specified
 * color (or no color) and DMA attributes. The search is done in the
 * freelist first and then in the cache list within the highest memory
 * range (based on DMA attributes) before searching in the lower
 * memory ranges.
 *
 * Note: This function is called only by page_create_io().
 */
/*ARGSUSED*/
static page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
    size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
	uint_t		bin;
	int		mtype;
	page_t		*pp;
	int		n;
	int		m;
	int		szc;
	int		fullrange;
	int		mnode;
	int		local_failed_stat = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;

	VM_STAT_ADD(pga_vmstats.pga_alloc);

	/* only base pagesize currently supported */
	if (size != MMU_PAGESIZE)
		return (NULL);

	/*
	 * If we're passed a specific lgroup, we use it.  Otherwise,
	 * assume first-touch placement is desired.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	/*
	 * Only hold one freelist or cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	if (dma_attr == NULL) {
		n = 0;
		m = mnoderangecnt - 1;
		fullrange = 1;
		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
	} else {
		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);

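		/*
		 * pfn_2_mtype() below maps the ends of the device's
		 * addressable pfn window onto memory type ranges; fullrange
		 * is set only when the window completely covers those end
		 * ranges, in which case the plain freelist/cachelist
		 * allocators can be used instead of page_get_mnode_anylist().
		 */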
		/*
		 * We can only guarantee alignment to a page boundary.
		 */
		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
			return (NULL);

		n = pfn_2_mtype(pfnlo);
		m = pfn_2_mtype(pfnhi);

		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
	}
	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);

	if (n > m)
		return (NULL);

	szc = 0;

	/* cycling through mtype handled by RANGE0 if n == 0 */
	if (n == 0) {
		flags |= PGI_MT_RANGE0;
		n = m;
	}

	/*
	 * Try local memory node first, but try remote if we can't
	 * get a page of the right color.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/*
		 * allocate pages from high pfn to low.
		 */
		for (mtype = m; mtype >= n; mtype--) {
			if (fullrange != 0) {
				pp = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
				if (pp == NULL) {
					pp = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
			} else {
				pp = page_get_mnode_anylist(bin, szc,
				    flags, mnode, mtype, dma_attr);
			}
			if (pp != NULL) {
				VM_STAT_ADD(pga_vmstats.pga_allocok);
				check_dma(dma_attr, pp, 1);
				return (pp);
			}
		}
		if (!local_failed_stat) {
			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
			local_failed_stat = 1;
		}
	}
	VM_STAT_ADD(pga_vmstats.pga_allocfailed);

	return (NULL);
}

/*
 * page_create_io()
 *
 * This function is a copy of page_create_va() with an additional
 * argument 'mattr' that specifies DMA memory requirements to
 * the page list functions. This function is used by the segkmem
 * allocator, so it is used only to create new pages (i.e., PG_EXCL
 * is set).
 *
 * Note: This interface is currently used by x86 PSM only and is
 * not fully specified, so the commitment level is Private, specific
 * to x86. This interface uses the PSM specific page_get_anylist()
 * interface.
 */

#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}


page_t *
page_create_io(
	struct vnode	*vp,
	u_offset_t	off,
	uint_t		bytes,
	uint_t		flags,
	struct as	*as,
	caddr_t		vaddr,
	ddi_dma_attr_t	*mattr)	/* DMA memory attributes if any */
{
	page_t		*plist = NULL;
	uint_t		plist_len = 0;
	pgcnt_t		npages;
	page_t		*npp = NULL;
	uint_t		pages_req;
	page_t		*pp;
	kmutex_t	*phm = NULL;
	uint_t		index;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %u flags %x",
	    vp, off, bytes, flags);

	ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);

	pages_req = npages = mmu_btopr(bytes);

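	/*
	 * mmu_btopr() rounds the byte count up to a whole number of pages;
	 * for example, a 10000 byte request with 4K pages yields
	 * pages_req = npages = 3.
	 */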
	/*
	 * Do the freemem and pcf accounting.
	 */
	if (!page_create_wait(npages, flags)) {
		return (NULL);
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx", vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back.  The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	if (flags & PG_PHYSCONTIG) {

		plist = page_get_contigpage(&npages, mattr, 1);
		if (plist == NULL) {
			page_create_putback(npages);
			return (NULL);
		}

		pp = plist;

		do {
			if (!page_hashin(pp, vp, off, NULL)) {
				panic("pg_create_io: hashin failed %p %p %llx",
				    (void *)pp, (void *)vp, off);
			}
			VM_STAT_ADD(page_create_new);
			off += MMU_PAGESIZE;
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			page_set_props(pp, P_REF);
			pp = pp->p_next;
		} while (pp != plist);

		if (!npages) {
			check_dma(mattr, plist, pages_req);
			return (plist);
		} else {
			vaddr += (pages_req - npages) << MMU_PAGESHIFT;
		}

		/*
		 * fall-thru:
		 *
		 * page_get_contigpage returns when npages <= sgllen.
		 * Grab the rest of the non-contig pages below from anylist.
		 */
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock.  This will minimize the hash
	 * lock hold time, nesting, and the like.  If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		phm = NULL;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		if (npp == NULL) {
			/*
			 * Try to get the page of any color either from
			 * the freelist or from the cache list.
			 */
			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
			    flags & ~PG_MATCH_COLOR, mattr, NULL);
			if (npp == NULL) {
				if (mattr == NULL) {
					/*
					 * Not looking for a special page;
					 * panic!
					 */
					panic("no page found %d", (int)npages);
				}
				/*
				 * No page found! This can happen
				 * if we are looking for a page
				 * within a specific memory range
				 * for DMA purposes. If PG_WAIT is
				 * specified then we wait for a
				 * while and then try again. The
				 * wait could be forever if we
				 * don't get the page(s) we need.
				 *
				 * Note: XXX We really need a mechanism
				 * to wait for pages in the desired
				 * range. For now, we wait for any
				 * pages and see if we can use them.
				 */

				if ((mattr != NULL) && (flags & PG_WAIT)) {
					delay(10);
					goto top;
				}
				goto fail; /* undo accounting stuff */
			}

			if (PP_ISAGED(npp) == 0) {
				/*
				 * Since this page came from the
				 * cachelist, we must destroy the
				 * old vnode association.
				 */
				page_hashout(npp, (kmutex_t *)NULL);
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail.  If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules.  Panic now and
				 * get it over with.  As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: hashin fail %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);

			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
		} else {
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;
			/*
			 * NOTE: This should not happen for pages associated
			 * with kernel vnode 'kvp'.
			 */
			/* XX64 - to debug why this happens! */
			ASSERT(!VN_ISKAS(vp));
			if (VN_ISKAS(vp))
				cmn_err(CE_NOTE,
				    "page_create: page not expected "
				    "in hash list for kernel vnode - pp 0x%p",
				    (void *)pp);
			VM_STAT_ADD(page_create_exists);
			goto fail;
		}

		/*
		 * Got a page!  It is locked.  Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += MMU_PAGESIZE;
		vaddr += MMU_PAGESIZE;
	}

	check_dma(mattr, plist, pages_req);
	return (plist);

fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);
	}

	/*
	 * Give up the pages we already got.
	 */
	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		plist_len++;
		/*LINTED: constant in conditional ctx*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
	}

	/*
	 * VN_DISPOSE does freemem accounting for the pages in plist
	 * by calling page_free. So, we need to undo the pcf accounting
	 * for only the remaining pages.
	 */
	VM_STAT_ADD(page_create_putbacks);
	page_create_putback(pages_req - plist_len);

	return (NULL);
}
#endif /* !__xpv */

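/*
 * A note on ppcopy() below: the caller must hold both pages locked (see
 * the ASSERTs at the top of the function), and a return value of 0 means
 * the copy took a fault (e.g. an uncorrectable error on the source page),
 * in which case the destination contents should be treated as undefined.
 */
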
/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them appropriately.
 */
int
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	hat_mempte_t	pte1;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex;
	label_t		ljb;
	int		ret = 1;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * disable preemption so that the CPU can't change underneath us
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = CPU->cpu_caddr1pte;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	if (use_sse_pagecopy)
#ifdef __xpv
		page_copy_no_xmm(pp_addr2, pp_addr1);
#else
		hwblkpagecopy(pp_addr1, pp_addr2);
#endif
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	no_fault();
faulted:
	if (!kpm_enable) {
#ifdef __xpv
		/*
		 * We can't leave unused mappings laying about under the
		 * hypervisor, so blow them away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}
	kpreempt_enable();
	return (ret);
}

void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	ASSERT(PAGE_LOCKED(pp));
	pfnzero(page_pptonum(pp), off, len);
}

/*
 * Zero the physical page from off to off + len given by pfn
 * without changing the reference and modified bits of the page.
 *
 * We do this using CPU private page address #2; see ppcopy() for more info.
 * pfnzero() must not be called at interrupt level.
 */
void
pfnzero(pfn_t pfn, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex = NULL;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);

	if (kpm_enable && !pfn_is_foreign(pfn)) {
		pp_addr2 = hat_kpm_pfn2va(pfn);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(pfn, pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero) {
#ifdef __xpv
		uint_t rem;

		/*
		 * zero a byte at a time until properly aligned for
		 * block_zero_no_xmm().
		 */
		while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
			pp_addr2[off++] = 0;

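		/*
		 * P2NPHASE(off, align) is the number of bytes from off up to
		 * the next align boundary (zero when off is already aligned)
		 * and P2PHASE(len, align), used below, is len modulo align.
		 */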
		/*
		 * Now use faster block_zero_no_xmm() for any range
		 * that is properly aligned and sized.
		 */
		rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
		len -= rem;
		if (len != 0) {
			block_zero_no_xmm(pp_addr2 + off, len);
			off += len;
		}

		/*
		 * zero remainder with byte stores.
		 */
		while (rem-- > 0)
			pp_addr2[off++] = 0;
#else
		hwblkclr(pp_addr2 + off, len);
#endif
	} else {
		bzero(pp_addr2 + off, len);
	}

	if (!kpm_enable || pfn_is_foreign(pfn)) {
#ifdef __xpv
		/*
		 * On the hypervisor this page might get used for a page
		 * table before any intervening change to this mapping,
		 * so blow it away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}

	kpreempt_enable();
}

/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}

/*
 * Set up two private addresses on a given CPU for use in ppcopy().
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	hat_mempte_t pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = pte_pa;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}

/*
 * Undo setup_vaddr_for_ppcopy
 */
void
teardown_vaddr_for_ppcopy(struct cpu *cpup)
{
	mutex_destroy(&cpup->cpu_ppaddr_mutex);

	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
	cpup->cpu_caddr2pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
	cpup->cpu_caddr2 = 0;

	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
	cpup->cpu_caddr1pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
	cpup->cpu_caddr1 = 0;
}

/*
 * Create the pageout scanner thread. The thread starts at 'procedure',
 * in process pp, at priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}

size_t
exec_get_spslew(void)
{
	return (0);
}

/*
 * Allocate a memory page.  The argument 'seed' can be any pseudo-random
 * number to vary where the pages come from.  This is quite a hacked up
 * method -- it works for now, but really needs to be fixed up a bit.
 *
 * We currently use page_create_va() on the kvp with fake offsets,
 * segments and virt address.  This is pretty bogus, but was copied from the
 * old hat_i86.c code. A better approach would be to specify either mnode
 * random or mnode local and take a page from whatever color has the MOST
 * available - this would have a minimal impact on page coloring.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	/*
	 * This code is gross; we really need a simpler page allocator.
	 *
	 * We need to assign an offset for the page to call page_create_va().
	 * To avoid conflicts with other pages, we get creative with the offset.
	 * For 32 bits, we pick an offset > 4Gig.
	 * For 64 bits, we pick an offset somewhere in the VA hole.
	 */
	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
	offset += mmu.hole_start;	/* something in VA hole */
#else
	offset += 1ULL << 40;	/* something > 4 Gig */
#endif

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", pp);
#endif

	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
	if (pp == NULL)
		return (NULL);
	page_io_unlock(pp);
	page_hashout(pp, NULL);
	return (pp);
}
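
/*
 * For example (illustrative only): in page_get_physical() above, on amd64 a
 * seed of 0x1f3 that is below kernelbase becomes
 * offset = (0x1f3 << MMU_PAGESHIFT) + mmu.hole_start, i.e. a page aligned
 * kvp offset inside the VA hole that ordinary mappings will never use, which
 * is what keeps these fake offsets from colliding with real pages.
 */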