1 /*- 2 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * The Mach Operating System project at Carnegie-Mellon University. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 36 */ 37 38 /*- 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 */ 64 65 /* 66 * GENERAL RULES ON VM_PAGE MANIPULATION 67 * 68 * - A page queue lock is required when adding or removing a page from a 69 * page queue regardless of other locks or the busy state of a page. 70 * 71 * * In general, no thread besides the page daemon can acquire or 72 * hold more than one page queue lock at a time. 73 * 74 * * The page daemon can acquire and hold any pair of page queue 75 * locks in any order. 
76 * 77 * - The object lock is required when inserting or removing 78 * pages from an object (vm_page_insert() or vm_page_remove()). 79 * 80 */ 81 82 /* 83 * Resident memory management module. 84 */ 85 86 #include <sys/cdefs.h> 87 __FBSDID("$FreeBSD$"); 88 89 #include "opt_vm.h" 90 91 #include <sys/param.h> 92 #include <sys/systm.h> 93 #include <sys/lock.h> 94 #include <sys/domainset.h> 95 #include <sys/kernel.h> 96 #include <sys/limits.h> 97 #include <sys/linker.h> 98 #include <sys/malloc.h> 99 #include <sys/mman.h> 100 #include <sys/msgbuf.h> 101 #include <sys/mutex.h> 102 #include <sys/proc.h> 103 #include <sys/rwlock.h> 104 #include <sys/sbuf.h> 105 #include <sys/smp.h> 106 #include <sys/sysctl.h> 107 #include <sys/vmmeter.h> 108 #include <sys/vnode.h> 109 110 #include <vm/vm.h> 111 #include <vm/pmap.h> 112 #include <vm/vm_param.h> 113 #include <vm/vm_domainset.h> 114 #include <vm/vm_kern.h> 115 #include <vm/vm_map.h> 116 #include <vm/vm_object.h> 117 #include <vm/vm_page.h> 118 #include <vm/vm_pageout.h> 119 #include <vm/vm_phys.h> 120 #include <vm/vm_pagequeue.h> 121 #include <vm/vm_pager.h> 122 #include <vm/vm_radix.h> 123 #include <vm/vm_reserv.h> 124 #include <vm/vm_extern.h> 125 #include <vm/uma.h> 126 #include <vm/uma_int.h> 127 128 #include <machine/md_var.h> 129 130 extern int uma_startup_count(int); 131 extern void uma_startup(void *, int); 132 extern int vmem_startup_count(void); 133 134 /* 135 * Associated with page of user-allocatable memory is a 136 * page structure. 137 */ 138 139 struct vm_domain vm_dom[MAXMEMDOM]; 140 141 struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; 142 143 struct mtx_padalign __exclusive_cache_line vm_domainset_lock; 144 /* The following fields are protected by the domainset lock. */ 145 domainset_t __exclusive_cache_line vm_min_domains; 146 domainset_t __exclusive_cache_line vm_severe_domains; 147 static int vm_min_waiters; 148 static int vm_severe_waiters; 149 static int vm_pageproc_waiters; 150 151 /* 152 * bogus page -- for I/O to/from partially complete buffers, 153 * or for paging into sparsely invalid regions. 
154 */ 155 vm_page_t bogus_page; 156 157 vm_page_t vm_page_array; 158 long vm_page_array_size; 159 long first_page; 160 161 static int boot_pages; 162 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 163 &boot_pages, 0, 164 "number of pages allocated for bootstrapping the VM system"); 165 166 static int pa_tryrelock_restart; 167 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, 168 &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); 169 170 static TAILQ_HEAD(, vm_page) blacklist_head; 171 static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); 172 SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | 173 CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); 174 175 static uma_zone_t fakepg_zone; 176 177 static void vm_page_alloc_check(vm_page_t m); 178 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); 179 static void vm_page_enqueue(uint8_t queue, vm_page_t m); 180 static void vm_page_init(void *dummy); 181 static int vm_page_insert_after(vm_page_t m, vm_object_t object, 182 vm_pindex_t pindex, vm_page_t mpred); 183 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, 184 vm_page_t mpred); 185 static int vm_page_reclaim_run(int req_class, int domain, u_long npages, 186 vm_page_t m_run, vm_paddr_t high); 187 static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, 188 int req); 189 static int vm_page_import(void *arg, void **store, int cnt, int domain, 190 int flags); 191 static void vm_page_release(void *arg, void **store, int cnt); 192 193 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); 194 195 static void 196 vm_page_init(void *dummy) 197 { 198 199 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, 200 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 201 bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | 202 VM_ALLOC_NORMAL | VM_ALLOC_WIRED); 203 } 204 205 /* 206 * The cache page zone is initialized later since we need to be able to allocate 207 * pages before UMA is fully initialized. 208 */ 209 static void 210 vm_page_init_cache_zones(void *dummy __unused) 211 { 212 struct vm_domain *vmd; 213 int i; 214 215 for (i = 0; i < vm_ndomains; i++) { 216 vmd = VM_DOMAIN(i); 217 /* 218 * Don't allow the page cache to take up more than .25% of 219 * memory. 220 */ 221 if (vmd->vmd_page_count / 400 < 256 * mp_ncpus) 222 continue; 223 vmd->vmd_pgcache = uma_zcache_create("vm pgcache", 224 sizeof(struct vm_page), NULL, NULL, NULL, NULL, 225 vm_page_import, vm_page_release, vmd, 226 UMA_ZONE_NOBUCKETCACHE | UMA_ZONE_MAXBUCKET | UMA_ZONE_VM); 227 } 228 } 229 SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); 230 231 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ 232 #if PAGE_SIZE == 32768 233 #ifdef CTASSERT 234 CTASSERT(sizeof(u_long) >= 8); 235 #endif 236 #endif 237 238 /* 239 * Try to acquire a physical address lock while a pmap is locked. If we 240 * fail to trylock we unlock and lock the pmap directly and cache the 241 * locked pa in *locked. The caller should then restart their loop in case 242 * the virtual to physical mapping has changed. 
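 *
 * A hedged usage sketch (loosely modeled on a pmap_extract_and_hold()-style
 * caller; the pte lookup step is elided and "locked_pa" is an illustrative
 * local, not part of this API):
 *
 *	vm_paddr_t locked_pa = 0;
 *
 *	PMAP_LOCK(pmap);
 * retry:
 *	pa = ...;	// physical address read from the pte under the pmap lock
 *	if (vm_page_pa_tryrelock(pmap, pa, &locked_pa))
 *		goto retry;	// pmap was unlocked; the mapping may have changed
 *	// here both the pmap lock and the pa lock are held
 *	PA_UNLOCK_COND(locked_pa);
 *	PMAP_UNLOCK(pmap);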
243 */ 244 int 245 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) 246 { 247 vm_paddr_t lockpa; 248 249 lockpa = *locked; 250 *locked = pa; 251 if (lockpa) { 252 PA_LOCK_ASSERT(lockpa, MA_OWNED); 253 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) 254 return (0); 255 PA_UNLOCK(lockpa); 256 } 257 if (PA_TRYLOCK(pa)) 258 return (0); 259 PMAP_UNLOCK(pmap); 260 atomic_add_int(&pa_tryrelock_restart, 1); 261 PA_LOCK(pa); 262 PMAP_LOCK(pmap); 263 return (EAGAIN); 264 } 265 266 /* 267 * vm_set_page_size: 268 * 269 * Sets the page size, perhaps based upon the memory 270 * size. Must be called before any use of page-size 271 * dependent functions. 272 */ 273 void 274 vm_set_page_size(void) 275 { 276 if (vm_cnt.v_page_size == 0) 277 vm_cnt.v_page_size = PAGE_SIZE; 278 if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) 279 panic("vm_set_page_size: page size not a power of two"); 280 } 281 282 /* 283 * vm_page_blacklist_next: 284 * 285 * Find the next entry in the provided string of blacklist 286 * addresses. Entries are separated by space, comma, or newline. 287 * If an invalid integer is encountered then the rest of the 288 * string is skipped. Updates the list pointer to the next 289 * character, or NULL if the string is exhausted or invalid. 290 */ 291 static vm_paddr_t 292 vm_page_blacklist_next(char **list, char *end) 293 { 294 vm_paddr_t bad; 295 char *cp, *pos; 296 297 if (list == NULL || *list == NULL) 298 return (0); 299 if (**list =='\0') { 300 *list = NULL; 301 return (0); 302 } 303 304 /* 305 * If there's no end pointer then the buffer is coming from 306 * the kenv and we know it's null-terminated. 307 */ 308 if (end == NULL) 309 end = *list + strlen(*list); 310 311 /* Ensure that strtoq() won't walk off the end */ 312 if (*end != '\0') { 313 if (*end == '\n' || *end == ' ' || *end == ',') 314 *end = '\0'; 315 else { 316 printf("Blacklist not terminated, skipping\n"); 317 *list = NULL; 318 return (0); 319 } 320 } 321 322 for (pos = *list; *pos != '\0'; pos = cp) { 323 bad = strtoq(pos, &cp, 0); 324 if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { 325 if (bad == 0) { 326 if (++cp < end) 327 continue; 328 else 329 break; 330 } 331 } else 332 break; 333 if (*cp == '\0' || ++cp >= end) 334 *list = NULL; 335 else 336 *list = cp; 337 return (trunc_page(bad)); 338 } 339 printf("Garbage in RAM blacklist, skipping\n"); 340 *list = NULL; 341 return (0); 342 } 343 344 bool 345 vm_page_blacklist_add(vm_paddr_t pa, bool verbose) 346 { 347 struct vm_domain *vmd; 348 vm_page_t m; 349 int ret; 350 351 m = vm_phys_paddr_to_vm_page(pa); 352 if (m == NULL) 353 return (true); /* page does not exist, no failure */ 354 355 vmd = vm_pagequeue_domain(m); 356 vm_domain_free_lock(vmd); 357 ret = vm_phys_unfree_page(m); 358 vm_domain_free_unlock(vmd); 359 if (ret) { 360 TAILQ_INSERT_TAIL(&blacklist_head, m, listq); 361 if (verbose) 362 printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); 363 } 364 return (ret); 365 } 366 367 /* 368 * vm_page_blacklist_check: 369 * 370 * Iterate through the provided string of blacklist addresses, pulling 371 * each entry out of the physical allocator free list and putting it 372 * onto a list for reporting via the vm.page_blacklist sysctl. 
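 *
 *	As a concrete (illustrative) example of the accepted format, a
 *	loader.conf(5) setting such as
 *
 *		vm.blacklist="0x7d3f1000,0x7d3f3000 0x81a24000"
 *
 *	causes each listed physical address to be truncated to a page
 *	boundary and that page to be withheld from the free lists.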
373 */ 374 static void 375 vm_page_blacklist_check(char *list, char *end) 376 { 377 vm_paddr_t pa; 378 char *next; 379 380 next = list; 381 while (next != NULL) { 382 if ((pa = vm_page_blacklist_next(&next, end)) == 0) 383 continue; 384 vm_page_blacklist_add(pa, bootverbose); 385 } 386 } 387 388 /* 389 * vm_page_blacklist_load: 390 * 391 * Search for a special module named "ram_blacklist". It'll be a 392 * plain text file provided by the user via the loader directive 393 * of the same name. 394 */ 395 static void 396 vm_page_blacklist_load(char **list, char **end) 397 { 398 void *mod; 399 u_char *ptr; 400 u_int len; 401 402 mod = NULL; 403 ptr = NULL; 404 405 mod = preload_search_by_type("ram_blacklist"); 406 if (mod != NULL) { 407 ptr = preload_fetch_addr(mod); 408 len = preload_fetch_size(mod); 409 } 410 *list = ptr; 411 if (ptr != NULL) 412 *end = ptr + len; 413 else 414 *end = NULL; 415 return; 416 } 417 418 static int 419 sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) 420 { 421 vm_page_t m; 422 struct sbuf sbuf; 423 int error, first; 424 425 first = 1; 426 error = sysctl_wire_old_buffer(req, 0); 427 if (error != 0) 428 return (error); 429 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 430 TAILQ_FOREACH(m, &blacklist_head, listq) { 431 sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", 432 (uintmax_t)m->phys_addr); 433 first = 0; 434 } 435 error = sbuf_finish(&sbuf); 436 sbuf_delete(&sbuf); 437 return (error); 438 } 439 440 static void 441 vm_page_domain_init(int domain) 442 { 443 struct vm_domain *vmd; 444 struct vm_pagequeue *pq; 445 int i; 446 447 vmd = VM_DOMAIN(domain); 448 bzero(vmd, sizeof(*vmd)); 449 *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = 450 "vm inactive pagequeue"; 451 *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = 452 "vm active pagequeue"; 453 *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = 454 "vm laundry pagequeue"; 455 *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = 456 "vm unswappable pagequeue"; 457 vmd->vmd_domain = domain; 458 vmd->vmd_page_count = 0; 459 vmd->vmd_free_count = 0; 460 vmd->vmd_segs = 0; 461 vmd->vmd_oom = FALSE; 462 for (i = 0; i < PQ_COUNT; i++) { 463 pq = &vmd->vmd_pagequeues[i]; 464 TAILQ_INIT(&pq->pq_pl); 465 mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", 466 MTX_DEF | MTX_DUPOK); 467 } 468 mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); 469 mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF); 470 snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain); 471 } 472 473 /* 474 * Initialize a physical page in preparation for adding it to the free 475 * lists. 476 */ 477 static void 478 vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) 479 { 480 481 m->object = NULL; 482 m->wire_count = 0; 483 m->busy_lock = VPB_UNBUSIED; 484 m->hold_count = 0; 485 m->flags = 0; 486 m->phys_addr = pa; 487 m->queue = PQ_NONE; 488 m->psind = 0; 489 m->segind = segind; 490 m->order = VM_NFREEORDER; 491 m->pool = VM_FREEPOOL_DEFAULT; 492 m->valid = m->dirty = 0; 493 pmap_page_init(m); 494 } 495 496 /* 497 * vm_page_startup: 498 * 499 * Initializes the resident memory module. Allocates physical memory for 500 * bootstrapping UMA and some data structures that are used to manage 501 * physical pages. Initializes these structures, and populates the free 502 * page queues. 
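 *
 *	The bootstrap page estimate computed below can be overridden with the
 *	loader tunable vm.boot_pages, e.g. (illustrative value only):
 *
 *		set vm.boot_pages=64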
503 */ 504 vm_offset_t 505 vm_page_startup(vm_offset_t vaddr) 506 { 507 struct vm_phys_seg *seg; 508 vm_page_t m; 509 char *list, *listend; 510 vm_offset_t mapped; 511 vm_paddr_t end, high_avail, low_avail, new_end, page_range, size; 512 vm_paddr_t biggestsize, last_pa, pa; 513 u_long pagecount; 514 int biggestone, i, segind; 515 516 biggestsize = 0; 517 biggestone = 0; 518 vaddr = round_page(vaddr); 519 520 for (i = 0; phys_avail[i + 1]; i += 2) { 521 phys_avail[i] = round_page(phys_avail[i]); 522 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 523 } 524 for (i = 0; phys_avail[i + 1]; i += 2) { 525 size = phys_avail[i + 1] - phys_avail[i]; 526 if (size > biggestsize) { 527 biggestone = i; 528 biggestsize = size; 529 } 530 } 531 532 end = phys_avail[biggestone+1]; 533 534 /* 535 * Initialize the page and queue locks. 536 */ 537 mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); 538 for (i = 0; i < PA_LOCK_COUNT; i++) 539 mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); 540 for (i = 0; i < vm_ndomains; i++) 541 vm_page_domain_init(i); 542 543 /* 544 * Allocate memory for use when boot strapping the kernel memory 545 * allocator. Tell UMA how many zones we are going to create 546 * before going fully functional. UMA will add its zones. 547 * 548 * VM startup zones: vmem, vmem_btag, VM OBJECT, RADIX NODE, MAP, 549 * KMAP ENTRY, MAP ENTRY, VMSPACE. 550 */ 551 boot_pages = uma_startup_count(8); 552 553 #ifndef UMA_MD_SMALL_ALLOC 554 /* vmem_startup() calls uma_prealloc(). */ 555 boot_pages += vmem_startup_count(); 556 /* vm_map_startup() calls uma_prealloc(). */ 557 boot_pages += howmany(MAX_KMAP, 558 UMA_SLAB_SPACE / sizeof(struct vm_map)); 559 560 /* 561 * Before going fully functional kmem_init() does allocation 562 * from "KMAP ENTRY" and vmem_create() does allocation from "vmem". 563 */ 564 boot_pages += 2; 565 #endif 566 /* 567 * CTFLAG_RDTUN doesn't work during the early boot process, so we must 568 * manually fetch the value. 569 */ 570 TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); 571 new_end = end - (boot_pages * UMA_SLAB_SIZE); 572 new_end = trunc_page(new_end); 573 mapped = pmap_map(&vaddr, new_end, end, 574 VM_PROT_READ | VM_PROT_WRITE); 575 bzero((void *)mapped, end - new_end); 576 uma_startup((void *)mapped, boot_pages); 577 578 #ifdef WITNESS 579 end = new_end; 580 new_end = end - round_page(witness_startup_count()); 581 mapped = pmap_map(&vaddr, new_end, end, 582 VM_PROT_READ | VM_PROT_WRITE); 583 bzero((void *)mapped, end - new_end); 584 witness_startup((void *)mapped); 585 #endif 586 587 #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ 588 defined(__i386__) || defined(__mips__) 589 /* 590 * Allocate a bitmap to indicate that a random physical page 591 * needs to be included in a minidump. 592 * 593 * The amd64 port needs this to indicate which direct map pages 594 * need to be dumped, via calls to dump_add_page()/dump_drop_page(). 595 * 596 * However, i386 still needs this workspace internally within the 597 * minidump code. In theory, they are not needed on i386, but are 598 * included should the sf_buf code decide to use them. 
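 *
 * As a rough worked example (assuming 4 KB pages; the actual numbers depend
 * on the memory layout): a highest dumpable address of 4 GB gives
 * page_range = 1048576 pages, so the bitmap allocated below occupies
 * roundup2(1048576, NBBY) / NBBY = 131072 bytes, i.e. 128 KB, rounded up
 * to a page boundary.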
599 */ 600 last_pa = 0; 601 for (i = 0; dump_avail[i + 1] != 0; i += 2) 602 if (dump_avail[i + 1] > last_pa) 603 last_pa = dump_avail[i + 1]; 604 page_range = last_pa / PAGE_SIZE; 605 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); 606 new_end -= vm_page_dump_size; 607 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, 608 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); 609 bzero((void *)vm_page_dump, vm_page_dump_size); 610 #else 611 (void)last_pa; 612 #endif 613 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) 614 /* 615 * Include the UMA bootstrap pages and vm_page_dump in a crash dump. 616 * When pmap_map() uses the direct map, they are not automatically 617 * included. 618 */ 619 for (pa = new_end; pa < end; pa += PAGE_SIZE) 620 dump_add_page(pa); 621 #endif 622 phys_avail[biggestone + 1] = new_end; 623 #ifdef __amd64__ 624 /* 625 * Request that the physical pages underlying the message buffer be 626 * included in a crash dump. Since the message buffer is accessed 627 * through the direct map, they are not automatically included. 628 */ 629 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); 630 last_pa = pa + round_page(msgbufsize); 631 while (pa < last_pa) { 632 dump_add_page(pa); 633 pa += PAGE_SIZE; 634 } 635 #endif 636 /* 637 * Compute the number of pages of memory that will be available for 638 * use, taking into account the overhead of a page structure per page. 639 * In other words, solve 640 * "available physical memory" - round_page(page_range * 641 * sizeof(struct vm_page)) = page_range * PAGE_SIZE 642 * for page_range. 643 */ 644 low_avail = phys_avail[0]; 645 high_avail = phys_avail[1]; 646 for (i = 0; i < vm_phys_nsegs; i++) { 647 if (vm_phys_segs[i].start < low_avail) 648 low_avail = vm_phys_segs[i].start; 649 if (vm_phys_segs[i].end > high_avail) 650 high_avail = vm_phys_segs[i].end; 651 } 652 /* Skip the first chunk. It is already accounted for. */ 653 for (i = 2; phys_avail[i + 1] != 0; i += 2) { 654 if (phys_avail[i] < low_avail) 655 low_avail = phys_avail[i]; 656 if (phys_avail[i + 1] > high_avail) 657 high_avail = phys_avail[i + 1]; 658 } 659 first_page = low_avail / PAGE_SIZE; 660 #ifdef VM_PHYSSEG_SPARSE 661 size = 0; 662 for (i = 0; i < vm_phys_nsegs; i++) 663 size += vm_phys_segs[i].end - vm_phys_segs[i].start; 664 for (i = 0; phys_avail[i + 1] != 0; i += 2) 665 size += phys_avail[i + 1] - phys_avail[i]; 666 #elif defined(VM_PHYSSEG_DENSE) 667 size = high_avail - low_avail; 668 #else 669 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." 670 #endif 671 672 #ifdef VM_PHYSSEG_DENSE 673 /* 674 * In the VM_PHYSSEG_DENSE case, the number of pages can account for 675 * the overhead of a page structure per page only if vm_page_array is 676 * allocated from the last physical memory chunk. Otherwise, we must 677 * allocate page structures representing the physical memory 678 * underlying vm_page_array, even though they will not be used. 679 */ 680 if (new_end != high_avail) 681 page_range = size / PAGE_SIZE; 682 else 683 #endif 684 { 685 page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); 686 687 /* 688 * If the partial bytes remaining are large enough for 689 * a page (PAGE_SIZE) without a corresponding 690 * 'struct vm_page', then new_end will contain an 691 * extra page after subtracting the length of the VM 692 * page array. Compensate by subtracting an extra 693 * page from new_end. 
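 *
 * Numerically (illustrative only, assuming PAGE_SIZE = 4096 and
 * sizeof(struct vm_page) = 104, which differs across architectures and
 * kernel options): each usable page costs 4096 + 104 bytes, so 1 GB of
 * remaining memory splits into roughly 255650 usable pages plus about
 * 25 MB of page structures; the check below handles the leftover bytes
 * that could hold a page but not its vm_page.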
694 */ 695 if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { 696 if (new_end == high_avail) 697 high_avail -= PAGE_SIZE; 698 new_end -= PAGE_SIZE; 699 } 700 } 701 end = new_end; 702 703 /* 704 * Reserve an unmapped guard page to trap access to vm_page_array[-1]. 705 * However, because this page is allocated from KVM, out-of-bounds 706 * accesses using the direct map will not be trapped. 707 */ 708 vaddr += PAGE_SIZE; 709 710 /* 711 * Allocate physical memory for the page structures, and map it. 712 */ 713 new_end = trunc_page(end - page_range * sizeof(struct vm_page)); 714 mapped = pmap_map(&vaddr, new_end, end, 715 VM_PROT_READ | VM_PROT_WRITE); 716 vm_page_array = (vm_page_t)mapped; 717 vm_page_array_size = page_range; 718 719 #if VM_NRESERVLEVEL > 0 720 /* 721 * Allocate physical memory for the reservation management system's 722 * data structures, and map it. 723 */ 724 if (high_avail == end) 725 high_avail = new_end; 726 new_end = vm_reserv_startup(&vaddr, new_end, high_avail); 727 #endif 728 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) 729 /* 730 * Include vm_page_array and vm_reserv_array in a crash dump. 731 */ 732 for (pa = new_end; pa < end; pa += PAGE_SIZE) 733 dump_add_page(pa); 734 #endif 735 phys_avail[biggestone + 1] = new_end; 736 737 /* 738 * Add physical memory segments corresponding to the available 739 * physical pages. 740 */ 741 for (i = 0; phys_avail[i + 1] != 0; i += 2) 742 vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); 743 744 /* 745 * Initialize the physical memory allocator. 746 */ 747 vm_phys_init(); 748 749 /* 750 * Initialize the page structures and add every available page to the 751 * physical memory allocator's free lists. 752 */ 753 vm_cnt.v_page_count = 0; 754 for (segind = 0; segind < vm_phys_nsegs; segind++) { 755 seg = &vm_phys_segs[segind]; 756 for (m = seg->first_page, pa = seg->start; pa < seg->end; 757 m++, pa += PAGE_SIZE) 758 vm_page_init_page(m, pa, segind); 759 760 /* 761 * Add the segment to the free lists only if it is covered by 762 * one of the ranges in phys_avail. Because we've added the 763 * ranges to the vm_phys_segs array, we can assume that each 764 * segment is either entirely contained in one of the ranges, 765 * or doesn't overlap any of them. 766 */ 767 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 768 struct vm_domain *vmd; 769 770 if (seg->start < phys_avail[i] || 771 seg->end > phys_avail[i + 1]) 772 continue; 773 774 m = seg->first_page; 775 pagecount = (u_long)atop(seg->end - seg->start); 776 777 vmd = VM_DOMAIN(seg->domain); 778 vm_domain_free_lock(vmd); 779 vm_phys_free_contig(m, pagecount); 780 vm_domain_free_unlock(vmd); 781 vm_domain_freecnt_inc(vmd, pagecount); 782 vm_cnt.v_page_count += (u_int)pagecount; 783 784 vmd = VM_DOMAIN(seg->domain); 785 vmd->vmd_page_count += (u_int)pagecount; 786 vmd->vmd_segs |= 1UL << m->segind; 787 break; 788 } 789 } 790 791 /* 792 * Remove blacklisted pages from the physical memory allocator. 793 */ 794 TAILQ_INIT(&blacklist_head); 795 vm_page_blacklist_load(&list, &listend); 796 vm_page_blacklist_check(list, listend); 797 798 list = kern_getenv("vm.blacklist"); 799 vm_page_blacklist_check(list, NULL); 800 801 freeenv(list); 802 #if VM_NRESERVLEVEL > 0 803 /* 804 * Initialize the reservation management system. 805 */ 806 vm_reserv_init(); 807 #endif 808 /* 809 * Set an initial domain policy for thread0 so that allocations 810 * can work. 
811 */ 812 domainset_zero(); 813 814 return (vaddr); 815 } 816 817 void 818 vm_page_reference(vm_page_t m) 819 { 820 821 vm_page_aflag_set(m, PGA_REFERENCED); 822 } 823 824 /* 825 * vm_page_busy_downgrade: 826 * 827 * Downgrade an exclusive busy page into a single shared busy page. 828 */ 829 void 830 vm_page_busy_downgrade(vm_page_t m) 831 { 832 u_int x; 833 bool locked; 834 835 vm_page_assert_xbusied(m); 836 locked = mtx_owned(vm_page_lockptr(m)); 837 838 for (;;) { 839 x = m->busy_lock; 840 x &= VPB_BIT_WAITERS; 841 if (x != 0 && !locked) 842 vm_page_lock(m); 843 if (atomic_cmpset_rel_int(&m->busy_lock, 844 VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) 845 break; 846 if (x != 0 && !locked) 847 vm_page_unlock(m); 848 } 849 if (x != 0) { 850 wakeup(m); 851 if (!locked) 852 vm_page_unlock(m); 853 } 854 } 855 856 /* 857 * vm_page_sbusied: 858 * 859 * Return a positive value if the page is shared busied, 0 otherwise. 860 */ 861 int 862 vm_page_sbusied(vm_page_t m) 863 { 864 u_int x; 865 866 x = m->busy_lock; 867 return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); 868 } 869 870 /* 871 * vm_page_sunbusy: 872 * 873 * Shared unbusy a page. 874 */ 875 void 876 vm_page_sunbusy(vm_page_t m) 877 { 878 u_int x; 879 880 vm_page_lock_assert(m, MA_NOTOWNED); 881 vm_page_assert_sbusied(m); 882 883 for (;;) { 884 x = m->busy_lock; 885 if (VPB_SHARERS(x) > 1) { 886 if (atomic_cmpset_int(&m->busy_lock, x, 887 x - VPB_ONE_SHARER)) 888 break; 889 continue; 890 } 891 if ((x & VPB_BIT_WAITERS) == 0) { 892 KASSERT(x == VPB_SHARERS_WORD(1), 893 ("vm_page_sunbusy: invalid lock state")); 894 if (atomic_cmpset_int(&m->busy_lock, 895 VPB_SHARERS_WORD(1), VPB_UNBUSIED)) 896 break; 897 continue; 898 } 899 KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), 900 ("vm_page_sunbusy: invalid lock state for waiters")); 901 902 vm_page_lock(m); 903 if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { 904 vm_page_unlock(m); 905 continue; 906 } 907 wakeup(m); 908 vm_page_unlock(m); 909 break; 910 } 911 } 912 913 /* 914 * vm_page_busy_sleep: 915 * 916 * Sleep and release the page lock, using the page pointer as wchan. 917 * This is used to implement the hard-path of busying mechanism. 918 * 919 * The given page must be locked. 920 * 921 * If nonshared is true, sleep only if the page is xbusy. 922 */ 923 void 924 vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) 925 { 926 u_int x; 927 928 vm_page_assert_locked(m); 929 930 x = m->busy_lock; 931 if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || 932 ((x & VPB_BIT_WAITERS) == 0 && 933 !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { 934 vm_page_unlock(m); 935 return; 936 } 937 msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); 938 } 939 940 /* 941 * vm_page_trysbusy: 942 * 943 * Try to shared busy a page. 944 * If the operation succeeds 1 is returned otherwise 0. 945 * The operation never sleeps. 946 */ 947 int 948 vm_page_trysbusy(vm_page_t m) 949 { 950 u_int x; 951 952 for (;;) { 953 x = m->busy_lock; 954 if ((x & VPB_BIT_SHARED) == 0) 955 return (0); 956 if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) 957 return (1); 958 } 959 } 960 961 static void 962 vm_page_xunbusy_locked(vm_page_t m) 963 { 964 965 vm_page_assert_xbusied(m); 966 vm_page_assert_locked(m); 967 968 atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); 969 /* There is a waiter, do wakeup() instead of vm_page_flash(). 
*/ 970 wakeup(m); 971 } 972 973 void 974 vm_page_xunbusy_maybelocked(vm_page_t m) 975 { 976 bool lockacq; 977 978 vm_page_assert_xbusied(m); 979 980 /* 981 * Fast path for unbusy. If it succeeds, we know that there 982 * are no waiters, so we do not need a wakeup. 983 */ 984 if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, 985 VPB_UNBUSIED)) 986 return; 987 988 lockacq = !mtx_owned(vm_page_lockptr(m)); 989 if (lockacq) 990 vm_page_lock(m); 991 vm_page_xunbusy_locked(m); 992 if (lockacq) 993 vm_page_unlock(m); 994 } 995 996 /* 997 * vm_page_xunbusy_hard: 998 * 999 * Called after the first try the exclusive unbusy of a page failed. 1000 * It is assumed that the waiters bit is on. 1001 */ 1002 void 1003 vm_page_xunbusy_hard(vm_page_t m) 1004 { 1005 1006 vm_page_assert_xbusied(m); 1007 1008 vm_page_lock(m); 1009 vm_page_xunbusy_locked(m); 1010 vm_page_unlock(m); 1011 } 1012 1013 /* 1014 * vm_page_flash: 1015 * 1016 * Wakeup anyone waiting for the page. 1017 * The ownership bits do not change. 1018 * 1019 * The given page must be locked. 1020 */ 1021 void 1022 vm_page_flash(vm_page_t m) 1023 { 1024 u_int x; 1025 1026 vm_page_lock_assert(m, MA_OWNED); 1027 1028 for (;;) { 1029 x = m->busy_lock; 1030 if ((x & VPB_BIT_WAITERS) == 0) 1031 return; 1032 if (atomic_cmpset_int(&m->busy_lock, x, 1033 x & (~VPB_BIT_WAITERS))) 1034 break; 1035 } 1036 wakeup(m); 1037 } 1038 1039 /* 1040 * Avoid releasing and reacquiring the same page lock. 1041 */ 1042 void 1043 vm_page_change_lock(vm_page_t m, struct mtx **mtx) 1044 { 1045 struct mtx *mtx1; 1046 1047 mtx1 = vm_page_lockptr(m); 1048 if (*mtx == mtx1) 1049 return; 1050 if (*mtx != NULL) 1051 mtx_unlock(*mtx); 1052 *mtx = mtx1; 1053 mtx_lock(mtx1); 1054 } 1055 1056 /* 1057 * Keep page from being freed by the page daemon 1058 * much of the same effect as wiring, except much lower 1059 * overhead and should be used only for *very* temporary 1060 * holding ("wiring"). 1061 */ 1062 void 1063 vm_page_hold(vm_page_t mem) 1064 { 1065 1066 vm_page_lock_assert(mem, MA_OWNED); 1067 mem->hold_count++; 1068 } 1069 1070 void 1071 vm_page_unhold(vm_page_t mem) 1072 { 1073 1074 vm_page_lock_assert(mem, MA_OWNED); 1075 KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); 1076 --mem->hold_count; 1077 if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) 1078 vm_page_free_toq(mem); 1079 } 1080 1081 /* 1082 * vm_page_unhold_pages: 1083 * 1084 * Unhold each of the pages that is referenced by the given array. 1085 */ 1086 void 1087 vm_page_unhold_pages(vm_page_t *ma, int count) 1088 { 1089 struct mtx *mtx; 1090 1091 mtx = NULL; 1092 for (; count != 0; count--) { 1093 vm_page_change_lock(*ma, &mtx); 1094 vm_page_unhold(*ma); 1095 ma++; 1096 } 1097 if (mtx != NULL) 1098 mtx_unlock(mtx); 1099 } 1100 1101 vm_page_t 1102 PHYS_TO_VM_PAGE(vm_paddr_t pa) 1103 { 1104 vm_page_t m; 1105 1106 #ifdef VM_PHYSSEG_SPARSE 1107 m = vm_phys_paddr_to_vm_page(pa); 1108 if (m == NULL) 1109 m = vm_phys_fictitious_to_vm_page(pa); 1110 return (m); 1111 #elif defined(VM_PHYSSEG_DENSE) 1112 long pi; 1113 1114 pi = atop(pa); 1115 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 1116 m = &vm_page_array[pi - first_page]; 1117 return (m); 1118 } 1119 return (vm_phys_fictitious_to_vm_page(pa)); 1120 #else 1121 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." 1122 #endif 1123 } 1124 1125 /* 1126 * vm_page_getfake: 1127 * 1128 * Create a fictitious page with the specified physical address and 1129 * memory attribute. 
The memory attribute is the only machine-dependent aspect of a
 *	fictitious page that must be initialized.
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
	vm_page_t m;

	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
	vm_page_initfake(m, paddr, memattr);
	return (m);
}

void
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	if ((m->flags & PG_FICTITIOUS) != 0) {
		/*
		 * The page's memattr might have changed since the
		 * previous initialization.  Update the pmap to the
		 * new memattr.
		 */
		goto memattr;
	}
	m->phys_addr = paddr;
	m->queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	m->flags = PG_FICTITIOUS;
	/* Fictitious pages don't use "order" or "pool". */
	m->oflags = VPO_UNMANAGED;
	m->busy_lock = VPB_SINGLE_EXCLUSIVER;
	m->wire_count = 1;
	pmap_page_init(m);
memattr:
	pmap_page_set_memattr(m, memattr);
}

/*
 * vm_page_putfake:
 *
 *	Release a fictitious page.
 */
void
vm_page_putfake(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_putfake: bad page %p", m));
	uma_zfree(fakepg_zone, m);
}

/*
 * vm_page_updatefake:
 *
 *	Update the given fictitious page to the specified physical address and
 *	memory attribute.
 */
void
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_updatefake: bad page %p", m));
	m->phys_addr = paddr;
	pmap_page_set_memattr(m, memattr);
}

/*
 * vm_page_free:
 *
 *	Free a page.
 */
void
vm_page_free(vm_page_t m)
{

	m->flags &= ~PG_ZERO;
	vm_page_free_toq(m);
}

/*
 * vm_page_free_zero:
 *
 *	Free a page to the zeroed-pages queue.
 */
void
vm_page_free_zero(vm_page_t m)
{

	m->flags |= PG_ZERO;
	vm_page_free_toq(m);
}

/*
 * Unbusy and handle the page queueing for a page from a getpages request that
 * was optionally read ahead or behind.
 */
void
vm_page_readahead_finish(vm_page_t m)
{

	/* We shouldn't put invalid pages on queues. */
	KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));

	/*
	 * Since the page is not the one actually needed, whether it should
	 * be activated or deactivated is not obvious.  Empirical results
	 * have shown that deactivating the page is usually the best choice,
	 * unless the page is wanted by another thread.
	 */
	vm_page_lock(m);
	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
		vm_page_activate(m);
	else
		vm_page_deactivate(m);
	vm_page_unlock(m);
	vm_page_xunbusy(m);
}

/*
 * vm_page_sleep_if_busy:
 *
 *	Sleep if the page is busied, temporarily releasing the object lock.
 *	Returns TRUE if the thread slept.
 *
 *	The given page must be unlocked and the object containing it must
 *	be locked.
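 *
 *	A typical (hypothetical) caller pattern, with the object write-locked
 *	and "pgwait" standing in for whatever wait message the caller uses:
 *
 *	retrylookup:
 *		m = vm_page_lookup(object, pindex);
 *		if (m != NULL && vm_page_sleep_if_busy(m, "pgwait"))
 *			goto retrylookup;	// the object lock was dropped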
1258 */ 1259 int 1260 vm_page_sleep_if_busy(vm_page_t m, const char *msg) 1261 { 1262 vm_object_t obj; 1263 1264 vm_page_lock_assert(m, MA_NOTOWNED); 1265 VM_OBJECT_ASSERT_WLOCKED(m->object); 1266 1267 if (vm_page_busied(m)) { 1268 /* 1269 * The page-specific object must be cached because page 1270 * identity can change during the sleep, causing the 1271 * re-lock of a different object. 1272 * It is assumed that a reference to the object is already 1273 * held by the callers. 1274 */ 1275 obj = m->object; 1276 vm_page_lock(m); 1277 VM_OBJECT_WUNLOCK(obj); 1278 vm_page_busy_sleep(m, msg, false); 1279 VM_OBJECT_WLOCK(obj); 1280 return (TRUE); 1281 } 1282 return (FALSE); 1283 } 1284 1285 /* 1286 * vm_page_dirty_KBI: [ internal use only ] 1287 * 1288 * Set all bits in the page's dirty field. 1289 * 1290 * The object containing the specified page must be locked if the 1291 * call is made from the machine-independent layer. 1292 * 1293 * See vm_page_clear_dirty_mask(). 1294 * 1295 * This function should only be called by vm_page_dirty(). 1296 */ 1297 void 1298 vm_page_dirty_KBI(vm_page_t m) 1299 { 1300 1301 /* Refer to this operation by its public name. */ 1302 KASSERT(m->valid == VM_PAGE_BITS_ALL, 1303 ("vm_page_dirty: page is invalid!")); 1304 m->dirty = VM_PAGE_BITS_ALL; 1305 } 1306 1307 /* 1308 * vm_page_insert: [ internal use only ] 1309 * 1310 * Inserts the given mem entry into the object and object list. 1311 * 1312 * The object must be locked. 1313 */ 1314 int 1315 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) 1316 { 1317 vm_page_t mpred; 1318 1319 VM_OBJECT_ASSERT_WLOCKED(object); 1320 mpred = vm_radix_lookup_le(&object->rtree, pindex); 1321 return (vm_page_insert_after(m, object, pindex, mpred)); 1322 } 1323 1324 /* 1325 * vm_page_insert_after: 1326 * 1327 * Inserts the page "m" into the specified object at offset "pindex". 1328 * 1329 * The page "mpred" must immediately precede the offset "pindex" within 1330 * the specified object. 1331 * 1332 * The object must be locked. 1333 */ 1334 static int 1335 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, 1336 vm_page_t mpred) 1337 { 1338 vm_page_t msucc; 1339 1340 VM_OBJECT_ASSERT_WLOCKED(object); 1341 KASSERT(m->object == NULL, 1342 ("vm_page_insert_after: page already inserted")); 1343 if (mpred != NULL) { 1344 KASSERT(mpred->object == object, 1345 ("vm_page_insert_after: object doesn't contain mpred")); 1346 KASSERT(mpred->pindex < pindex, 1347 ("vm_page_insert_after: mpred doesn't precede pindex")); 1348 msucc = TAILQ_NEXT(mpred, listq); 1349 } else 1350 msucc = TAILQ_FIRST(&object->memq); 1351 if (msucc != NULL) 1352 KASSERT(msucc->pindex > pindex, 1353 ("vm_page_insert_after: msucc doesn't succeed pindex")); 1354 1355 /* 1356 * Record the object/offset pair in this page 1357 */ 1358 m->object = object; 1359 m->pindex = pindex; 1360 1361 /* 1362 * Now link into the object's ordered list of backed pages. 1363 */ 1364 if (vm_radix_insert(&object->rtree, m)) { 1365 m->object = NULL; 1366 m->pindex = 0; 1367 return (1); 1368 } 1369 vm_page_insert_radixdone(m, object, mpred); 1370 return (0); 1371 } 1372 1373 /* 1374 * vm_page_insert_radixdone: 1375 * 1376 * Complete page "m" insertion into the specified object after the 1377 * radix trie hooking. 1378 * 1379 * The page "mpred" must precede the offset "m->pindex" within the 1380 * specified object. 1381 * 1382 * The object must be locked. 
1383 */ 1384 static void 1385 vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) 1386 { 1387 1388 VM_OBJECT_ASSERT_WLOCKED(object); 1389 KASSERT(object != NULL && m->object == object, 1390 ("vm_page_insert_radixdone: page %p has inconsistent object", m)); 1391 if (mpred != NULL) { 1392 KASSERT(mpred->object == object, 1393 ("vm_page_insert_after: object doesn't contain mpred")); 1394 KASSERT(mpred->pindex < m->pindex, 1395 ("vm_page_insert_after: mpred doesn't precede pindex")); 1396 } 1397 1398 if (mpred != NULL) 1399 TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); 1400 else 1401 TAILQ_INSERT_HEAD(&object->memq, m, listq); 1402 1403 /* 1404 * Show that the object has one more resident page. 1405 */ 1406 object->resident_page_count++; 1407 1408 /* 1409 * Hold the vnode until the last page is released. 1410 */ 1411 if (object->resident_page_count == 1 && object->type == OBJT_VNODE) 1412 vhold(object->handle); 1413 1414 /* 1415 * Since we are inserting a new and possibly dirty page, 1416 * update the object's OBJ_MIGHTBEDIRTY flag. 1417 */ 1418 if (pmap_page_is_write_mapped(m)) 1419 vm_object_set_writeable_dirty(object); 1420 } 1421 1422 /* 1423 * vm_page_remove: 1424 * 1425 * Removes the specified page from its containing object, but does not 1426 * invalidate any backing storage. 1427 * 1428 * The object must be locked. The page must be locked if it is managed. 1429 */ 1430 void 1431 vm_page_remove(vm_page_t m) 1432 { 1433 vm_object_t object; 1434 vm_page_t mrem; 1435 1436 if ((m->oflags & VPO_UNMANAGED) == 0) 1437 vm_page_assert_locked(m); 1438 if ((object = m->object) == NULL) 1439 return; 1440 VM_OBJECT_ASSERT_WLOCKED(object); 1441 if (vm_page_xbusied(m)) 1442 vm_page_xunbusy_maybelocked(m); 1443 mrem = vm_radix_remove(&object->rtree, m->pindex); 1444 KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); 1445 1446 /* 1447 * Now remove from the object's list of backed pages. 1448 */ 1449 TAILQ_REMOVE(&object->memq, m, listq); 1450 1451 /* 1452 * And show that the object has one fewer resident page. 1453 */ 1454 object->resident_page_count--; 1455 1456 /* 1457 * The vnode may now be recycled. 1458 */ 1459 if (object->resident_page_count == 0 && object->type == OBJT_VNODE) 1460 vdrop(object->handle); 1461 1462 m->object = NULL; 1463 } 1464 1465 /* 1466 * vm_page_lookup: 1467 * 1468 * Returns the page associated with the object/offset 1469 * pair specified; if none is found, NULL is returned. 1470 * 1471 * The object must be locked. 1472 */ 1473 vm_page_t 1474 vm_page_lookup(vm_object_t object, vm_pindex_t pindex) 1475 { 1476 1477 VM_OBJECT_ASSERT_LOCKED(object); 1478 return (vm_radix_lookup(&object->rtree, pindex)); 1479 } 1480 1481 /* 1482 * vm_page_find_least: 1483 * 1484 * Returns the page associated with the object with least pindex 1485 * greater than or equal to the parameter pindex, or NULL. 1486 * 1487 * The object must be locked. 1488 */ 1489 vm_page_t 1490 vm_page_find_least(vm_object_t object, vm_pindex_t pindex) 1491 { 1492 vm_page_t m; 1493 1494 VM_OBJECT_ASSERT_LOCKED(object); 1495 if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) 1496 m = vm_radix_lookup_ge(&object->rtree, pindex); 1497 return (m); 1498 } 1499 1500 /* 1501 * Returns the given page's successor (by pindex) within the object if it is 1502 * resident; if none is found, NULL is returned. 1503 * 1504 * The object must be locked. 
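 *
 * For instance (an illustrative sketch, not a caller in this file), a thread
 * holding the object lock can walk a run of consecutively indexed resident
 * pages with:
 *
 *	for (m = vm_page_lookup(object, pindex); m != NULL;
 *	    m = vm_page_next(m))
 *		...;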
 */
vm_page_t
vm_page_next(vm_page_t m)
{
	vm_page_t next;

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
		MPASS(next->object == m->object);
		if (next->pindex != m->pindex + 1)
			next = NULL;
	}
	return (next);
}

/*
 * Returns the given page's predecessor (by pindex) within the object if it is
 * resident; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_prev(vm_page_t m)
{
	vm_page_t prev;

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
		MPASS(prev->object == m->object);
		if (prev->pindex != m->pindex - 1)
			prev = NULL;
	}
	return (prev);
}

/*
 * Uses the page mnew as a replacement for an existing page at index
 * pindex which must be already present in the object.
 *
 * The existing page must not be on a paging queue.
 */
vm_page_t
vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t mold;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(mnew->object == NULL,
	    ("vm_page_replace: page %p already in object", mnew));
	KASSERT(mnew->queue == PQ_NONE,
	    ("vm_page_replace: new page %p is on a paging queue", mnew));

	/*
	 * This function mostly follows vm_page_insert() and
	 * vm_page_remove() without the radix, object count and vnode
	 * dance.  Double check such functions for more comments.
	 */

	mnew->object = object;
	mnew->pindex = pindex;
	mold = vm_radix_replace(&object->rtree, mnew);
	KASSERT(mold->queue == PQ_NONE,
	    ("vm_page_replace: old page %p is on a paging queue", mold));

	/* Keep the resident page list in sorted order. */
	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
	TAILQ_REMOVE(&object->memq, mold, listq);

	mold->object = NULL;
	vm_page_xunbusy_maybelocked(mold);

	/*
	 * The object's resident_page_count does not change because we have
	 * swapped one page for another, but OBJ_MIGHTBEDIRTY may still need
	 * to be set for the new page.
	 */
	if (pmap_page_is_write_mapped(mnew))
		vm_object_set_writeable_dirty(object);
	return (mold);
}

/*
 * vm_page_rename:
 *
 *	Move the given memory entry from its
 *	current object to the specified target object/offset.
 *
 *	Note: swap associated with the page must be invalidated by the move.  We
 *	      have to do this for several reasons:  (1) we aren't freeing the
 *	      page, (2) we are dirtying the page, (3) the VM system is probably
 *	      moving the page from object A to B, and will then later move
 *	      the backing store from A to B and we can't have a conflict.
 *
 *	Note: we *always* dirty the page.  It is necessary both for the
 *	      fact that we moved it, and because we may be invalidating
 *	      swap.
 *
 *	The objects must be locked.
 */
int
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
	vm_page_t mpred;
	vm_pindex_t opidx;

	VM_OBJECT_ASSERT_WLOCKED(new_object);

	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
	    ("vm_page_rename: pindex already renamed"));

	/*
	 * Create a custom version of vm_page_insert() which does not depend
	 * on mpred and can cheat on the implementation aspects of the
	 * function.
	 */
	opidx = m->pindex;
	m->pindex = new_pindex;
	if (vm_radix_insert(&new_object->rtree, m)) {
		m->pindex = opidx;
		return (1);
	}

	/*
	 * The operation cannot fail anymore.  The removal must happen before
	 * the listq iterator is tainted.
	 */
	m->pindex = opidx;
	vm_page_lock(m);
	vm_page_remove(m);

	/* Return to the new pindex to complete vm_page_insert(). */
	m->pindex = new_pindex;
	m->object = new_object;
	vm_page_unlock(m);
	vm_page_insert_radixdone(m, new_object, mpred);
	vm_page_dirty(m);
	return (0);
}

/*
 * vm_page_alloc:
 *
 *	Allocate and return a page that is associated with the specified
 *	object and offset pair.  By default, this page is exclusive busied.
 *
 *	The caller must always specify an allocation class.
 *
 *	allocation classes:
 *	VM_ALLOC_NORMAL		normal process request
 *	VM_ALLOC_SYSTEM		system *really* needs a page
 *	VM_ALLOC_INTERRUPT	interrupt time request
 *
 *	optional allocation flags:
 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
 *				intends to allocate
 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
 *	VM_ALLOC_NOOBJ		page is not associated with an object and
 *				should not be exclusive busy
 *	VM_ALLOC_SBUSY		shared busy the allocated page
 *	VM_ALLOC_WIRED		wire the allocated page
 *	VM_ALLOC_ZERO		prefer a zeroed page
 */
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{

	return (vm_page_alloc_after(object, pindex, req, object != NULL ?
	    vm_radix_lookup_le(&object->rtree, pindex) : NULL));
}

vm_page_t
vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain,
    int req)
{

	return (vm_page_alloc_domain_after(object, pindex, domain, req,
	    object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) :
	    NULL));
}

/*
 * Allocate a page in the specified object with the given page index.  To
 * optimize insertion of the page into the object, the caller must also specify
 * the resident page in the object with the largest index smaller than the
 * given page index, or NULL if no such page exists.
 */
vm_page_t
vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
    int req, vm_page_t mpred)
{
	struct vm_domainset_iter di;
	vm_page_t m;
	int domain;

	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
	do {
		m = vm_page_alloc_domain_after(object, pindex, domain, req,
		    mpred);
		if (m != NULL)
			break;
	} while (vm_domainset_iter_page(&di, &domain, &req) == 0);

	return (m);
}

/*
 * Returns true if the number of free pages exceeds the minimum
 * for the request class and false otherwise.
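 *
 * A successful reservation is paired with a physical allocation, and the
 * count is returned on failure; a condensed sketch of that pattern, taken
 * from the allocators below ("pool" abbreviates the free pool argument used
 * there):
 *
 *	if (vm_domain_allocate(vmd, req, 1)) {
 *		vm_domain_free_lock(vmd);
 *		m = vm_phys_alloc_pages(domain, pool, 0);
 *		vm_domain_free_unlock(vmd);
 *		if (m == NULL)
 *			vm_domain_freecnt_inc(vmd, 1);
 *	}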
1714 */ 1715 int 1716 vm_domain_allocate(struct vm_domain *vmd, int req, int npages) 1717 { 1718 u_int limit, old, new; 1719 1720 req = req & VM_ALLOC_CLASS_MASK; 1721 1722 /* 1723 * The page daemon is allowed to dig deeper into the free page list. 1724 */ 1725 if (curproc == pageproc && req != VM_ALLOC_INTERRUPT) 1726 req = VM_ALLOC_SYSTEM; 1727 if (req == VM_ALLOC_INTERRUPT) 1728 limit = 0; 1729 else if (req == VM_ALLOC_SYSTEM) 1730 limit = vmd->vmd_interrupt_free_min; 1731 else 1732 limit = vmd->vmd_free_reserved; 1733 1734 /* 1735 * Attempt to reserve the pages. Fail if we're below the limit. 1736 */ 1737 limit += npages; 1738 old = vmd->vmd_free_count; 1739 do { 1740 if (old < limit) 1741 return (0); 1742 new = old - npages; 1743 } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0); 1744 1745 /* Wake the page daemon if we've crossed the threshold. */ 1746 if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old)) 1747 pagedaemon_wakeup(vmd->vmd_domain); 1748 1749 /* Only update bitsets on transitions. */ 1750 if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || 1751 (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) 1752 vm_domain_set(vmd); 1753 1754 return (1); 1755 } 1756 1757 vm_page_t 1758 vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, 1759 int req, vm_page_t mpred) 1760 { 1761 struct vm_domain *vmd; 1762 vm_page_t m; 1763 int flags; 1764 1765 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && 1766 (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && 1767 ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != 1768 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), 1769 ("inconsistent object(%p)/req(%x)", object, req)); 1770 KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, 1771 ("Can't sleep and retry object insertion.")); 1772 KASSERT(mpred == NULL || mpred->pindex < pindex, 1773 ("mpred %p doesn't precede pindex 0x%jx", mpred, 1774 (uintmax_t)pindex)); 1775 if (object != NULL) 1776 VM_OBJECT_ASSERT_WLOCKED(object); 1777 1778 again: 1779 m = NULL; 1780 #if VM_NRESERVLEVEL > 0 1781 /* 1782 * Can we allocate the page from a reservation? 1783 */ 1784 if (vm_object_reserv(object) && 1785 ((m = vm_reserv_extend(req, object, pindex, domain, mpred)) != NULL || 1786 (m = vm_reserv_alloc_page(req, object, pindex, domain, mpred)) != NULL)) { 1787 domain = vm_phys_domain(m); 1788 vmd = VM_DOMAIN(domain); 1789 goto found; 1790 } 1791 #endif 1792 vmd = VM_DOMAIN(domain); 1793 if (object != NULL && vmd->vmd_pgcache != NULL) { 1794 m = uma_zalloc(vmd->vmd_pgcache, M_NOWAIT); 1795 if (m != NULL) 1796 goto found; 1797 } 1798 if (vm_domain_allocate(vmd, req, 1)) { 1799 /* 1800 * If not, allocate it from the free page queues. 1801 */ 1802 vm_domain_free_lock(vmd); 1803 m = vm_phys_alloc_pages(domain, object != NULL ? 1804 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); 1805 vm_domain_free_unlock(vmd); 1806 if (m == NULL) { 1807 vm_domain_freecnt_inc(vmd, 1); 1808 #if VM_NRESERVLEVEL > 0 1809 if (vm_reserv_reclaim_inactive(domain)) 1810 goto again; 1811 #endif 1812 } 1813 } 1814 if (m == NULL) { 1815 /* 1816 * Not allocatable, give up. 1817 */ 1818 if (vm_domain_alloc_fail(vmd, object, req)) 1819 goto again; 1820 return (NULL); 1821 } 1822 1823 /* 1824 * At this point we had better have found a good page. 1825 */ 1826 KASSERT(m != NULL, ("missing page")); 1827 1828 found: 1829 vm_page_alloc_check(m); 1830 1831 /* 1832 * Initialize the page. Only the PG_ZERO flag is inherited. 
1833 */ 1834 flags = 0; 1835 if ((req & VM_ALLOC_ZERO) != 0) 1836 flags = PG_ZERO; 1837 flags &= m->flags; 1838 if ((req & VM_ALLOC_NODUMP) != 0) 1839 flags |= PG_NODUMP; 1840 m->flags = flags; 1841 m->aflags = 0; 1842 m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? 1843 VPO_UNMANAGED : 0; 1844 m->busy_lock = VPB_UNBUSIED; 1845 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) 1846 m->busy_lock = VPB_SINGLE_EXCLUSIVER; 1847 if ((req & VM_ALLOC_SBUSY) != 0) 1848 m->busy_lock = VPB_SHARERS_WORD(1); 1849 if (req & VM_ALLOC_WIRED) { 1850 /* 1851 * The page lock is not required for wiring a page until that 1852 * page is inserted into the object. 1853 */ 1854 vm_wire_add(1); 1855 m->wire_count = 1; 1856 } 1857 m->act_count = 0; 1858 1859 if (object != NULL) { 1860 if (vm_page_insert_after(m, object, pindex, mpred)) { 1861 if (req & VM_ALLOC_WIRED) { 1862 vm_wire_sub(1); 1863 m->wire_count = 0; 1864 } 1865 KASSERT(m->object == NULL, ("page %p has object", m)); 1866 m->oflags = VPO_UNMANAGED; 1867 m->busy_lock = VPB_UNBUSIED; 1868 /* Don't change PG_ZERO. */ 1869 vm_page_free_toq(m); 1870 if (req & VM_ALLOC_WAITFAIL) { 1871 VM_OBJECT_WUNLOCK(object); 1872 vm_radix_wait(); 1873 VM_OBJECT_WLOCK(object); 1874 } 1875 return (NULL); 1876 } 1877 1878 /* Ignore device objects; the pager sets "memattr" for them. */ 1879 if (object->memattr != VM_MEMATTR_DEFAULT && 1880 (object->flags & OBJ_FICTITIOUS) == 0) 1881 pmap_page_set_memattr(m, object->memattr); 1882 } else 1883 m->pindex = pindex; 1884 1885 return (m); 1886 } 1887 1888 /* 1889 * vm_page_alloc_contig: 1890 * 1891 * Allocate a contiguous set of physical pages of the given size "npages" 1892 * from the free lists. All of the physical pages must be at or above 1893 * the given physical address "low" and below the given physical address 1894 * "high". The given value "alignment" determines the alignment of the 1895 * first physical page in the set. If the given value "boundary" is 1896 * non-zero, then the set of physical pages cannot cross any physical 1897 * address boundary that is a multiple of that value. Both "alignment" 1898 * and "boundary" must be a power of two. 1899 * 1900 * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, 1901 * then the memory attribute setting for the physical pages is configured 1902 * to the object's memory attribute setting. Otherwise, the memory 1903 * attribute setting for the physical pages is configured to "memattr", 1904 * overriding the object's memory attribute setting. However, if the 1905 * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the 1906 * memory attribute setting for the physical pages cannot be configured 1907 * to VM_MEMATTR_DEFAULT. 1908 * 1909 * The specified object may not contain fictitious pages. 1910 * 1911 * The caller must always specify an allocation class. 
1912 * 1913 * allocation classes: 1914 * VM_ALLOC_NORMAL normal process request 1915 * VM_ALLOC_SYSTEM system *really* needs a page 1916 * VM_ALLOC_INTERRUPT interrupt time request 1917 * 1918 * optional allocation flags: 1919 * VM_ALLOC_NOBUSY do not exclusive busy the page 1920 * VM_ALLOC_NODUMP do not include the page in a kernel core dump 1921 * VM_ALLOC_NOOBJ page is not associated with an object and 1922 * should not be exclusive busy 1923 * VM_ALLOC_SBUSY shared busy the allocated page 1924 * VM_ALLOC_WIRED wire the allocated page 1925 * VM_ALLOC_ZERO prefer a zeroed page 1926 */ 1927 vm_page_t 1928 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, 1929 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 1930 vm_paddr_t boundary, vm_memattr_t memattr) 1931 { 1932 struct vm_domainset_iter di; 1933 vm_page_t m; 1934 int domain; 1935 1936 vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); 1937 do { 1938 m = vm_page_alloc_contig_domain(object, pindex, domain, req, 1939 npages, low, high, alignment, boundary, memattr); 1940 if (m != NULL) 1941 break; 1942 } while (vm_domainset_iter_page(&di, &domain, &req) == 0); 1943 1944 return (m); 1945 } 1946 1947 vm_page_t 1948 vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, 1949 int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 1950 vm_paddr_t boundary, vm_memattr_t memattr) 1951 { 1952 struct vm_domain *vmd; 1953 vm_page_t m, m_ret, mpred; 1954 u_int busy_lock, flags, oflags; 1955 1956 mpred = NULL; /* XXX: pacify gcc */ 1957 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && 1958 (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && 1959 ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != 1960 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), 1961 ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, 1962 req)); 1963 KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, 1964 ("Can't sleep and retry object insertion.")); 1965 if (object != NULL) { 1966 VM_OBJECT_ASSERT_WLOCKED(object); 1967 KASSERT((object->flags & OBJ_FICTITIOUS) == 0, 1968 ("vm_page_alloc_contig: object %p has fictitious pages", 1969 object)); 1970 } 1971 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); 1972 1973 if (object != NULL) { 1974 mpred = vm_radix_lookup_le(&object->rtree, pindex); 1975 KASSERT(mpred == NULL || mpred->pindex != pindex, 1976 ("vm_page_alloc_contig: pindex already allocated")); 1977 } 1978 1979 /* 1980 * Can we allocate the pages without the number of free pages falling 1981 * below the lower bound for the allocation class? 1982 */ 1983 again: 1984 #if VM_NRESERVLEVEL > 0 1985 /* 1986 * Can we allocate the pages from a reservation? 1987 */ 1988 if (vm_object_reserv(object) && 1989 ((m_ret = vm_reserv_extend_contig(req, object, pindex, domain, 1990 npages, low, high, alignment, boundary, mpred)) != NULL || 1991 (m_ret = vm_reserv_alloc_contig(req, object, pindex, domain, 1992 npages, low, high, alignment, boundary, mpred)) != NULL)) { 1993 domain = vm_phys_domain(m_ret); 1994 vmd = VM_DOMAIN(domain); 1995 goto found; 1996 } 1997 #endif 1998 m_ret = NULL; 1999 vmd = VM_DOMAIN(domain); 2000 if (vm_domain_allocate(vmd, req, npages)) { 2001 /* 2002 * allocate them from the free page queues. 
2003 */ 2004 vm_domain_free_lock(vmd); 2005 m_ret = vm_phys_alloc_contig(domain, npages, low, high, 2006 alignment, boundary); 2007 vm_domain_free_unlock(vmd); 2008 if (m_ret == NULL) { 2009 vm_domain_freecnt_inc(vmd, npages); 2010 #if VM_NRESERVLEVEL > 0 2011 if (vm_reserv_reclaim_contig(domain, npages, low, 2012 high, alignment, boundary)) 2013 goto again; 2014 #endif 2015 } 2016 } 2017 if (m_ret == NULL) { 2018 if (vm_domain_alloc_fail(vmd, object, req)) 2019 goto again; 2020 return (NULL); 2021 } 2022 #if VM_NRESERVLEVEL > 0 2023 found: 2024 #endif 2025 for (m = m_ret; m < &m_ret[npages]; m++) 2026 vm_page_alloc_check(m); 2027 2028 /* 2029 * Initialize the pages. Only the PG_ZERO flag is inherited. 2030 */ 2031 flags = 0; 2032 if ((req & VM_ALLOC_ZERO) != 0) 2033 flags = PG_ZERO; 2034 if ((req & VM_ALLOC_NODUMP) != 0) 2035 flags |= PG_NODUMP; 2036 oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? 2037 VPO_UNMANAGED : 0; 2038 busy_lock = VPB_UNBUSIED; 2039 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) 2040 busy_lock = VPB_SINGLE_EXCLUSIVER; 2041 if ((req & VM_ALLOC_SBUSY) != 0) 2042 busy_lock = VPB_SHARERS_WORD(1); 2043 if ((req & VM_ALLOC_WIRED) != 0) 2044 vm_wire_add(npages); 2045 if (object != NULL) { 2046 if (object->memattr != VM_MEMATTR_DEFAULT && 2047 memattr == VM_MEMATTR_DEFAULT) 2048 memattr = object->memattr; 2049 } 2050 for (m = m_ret; m < &m_ret[npages]; m++) { 2051 m->aflags = 0; 2052 m->flags = (m->flags | PG_NODUMP) & flags; 2053 m->busy_lock = busy_lock; 2054 if ((req & VM_ALLOC_WIRED) != 0) 2055 m->wire_count = 1; 2056 m->act_count = 0; 2057 m->oflags = oflags; 2058 if (object != NULL) { 2059 if (vm_page_insert_after(m, object, pindex, mpred)) { 2060 if ((req & VM_ALLOC_WIRED) != 0) 2061 vm_wire_sub(npages); 2062 KASSERT(m->object == NULL, 2063 ("page %p has object", m)); 2064 mpred = m; 2065 for (m = m_ret; m < &m_ret[npages]; m++) { 2066 if (m <= mpred && 2067 (req & VM_ALLOC_WIRED) != 0) 2068 m->wire_count = 0; 2069 m->oflags = VPO_UNMANAGED; 2070 m->busy_lock = VPB_UNBUSIED; 2071 /* Don't change PG_ZERO. */ 2072 vm_page_free_toq(m); 2073 } 2074 if (req & VM_ALLOC_WAITFAIL) { 2075 VM_OBJECT_WUNLOCK(object); 2076 vm_radix_wait(); 2077 VM_OBJECT_WLOCK(object); 2078 } 2079 return (NULL); 2080 } 2081 mpred = m; 2082 } else 2083 m->pindex = pindex; 2084 if (memattr != VM_MEMATTR_DEFAULT) 2085 pmap_page_set_memattr(m, memattr); 2086 pindex++; 2087 } 2088 return (m_ret); 2089 } 2090 2091 /* 2092 * Check a page that has been freshly dequeued from a freelist. 2093 */ 2094 static void 2095 vm_page_alloc_check(vm_page_t m) 2096 { 2097 2098 KASSERT(m->object == NULL, ("page %p has object", m)); 2099 KASSERT(m->queue == PQ_NONE, 2100 ("page %p has unexpected queue %d", m, m->queue)); 2101 KASSERT(!vm_page_held(m), ("page %p is held", m)); 2102 KASSERT(!vm_page_busied(m), ("page %p is busy", m)); 2103 KASSERT(m->dirty == 0, ("page %p is dirty", m)); 2104 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 2105 ("page %p has unexpected memattr %d", 2106 m, pmap_page_get_memattr(m))); 2107 KASSERT(m->valid == 0, ("free page %p is valid", m)); 2108 } 2109 2110 /* 2111 * vm_page_alloc_freelist: 2112 * 2113 * Allocate a physical page from the specified free page list. 2114 * 2115 * The caller must always specify an allocation class. 
2116 *
2117 * allocation classes:
2118 * VM_ALLOC_NORMAL normal process request
2119 * VM_ALLOC_SYSTEM system *really* needs a page
2120 * VM_ALLOC_INTERRUPT interrupt time request
2121 *
2122 * optional allocation flags:
2123 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
2124 * intends to allocate
2125 * VM_ALLOC_WIRED wire the allocated page
2126 * VM_ALLOC_ZERO prefer a zeroed page
2127 */
2128 vm_page_t
2129 vm_page_alloc_freelist(int freelist, int req)
2130 {
2131 struct vm_domainset_iter di;
2132 vm_page_t m;
2133 int domain;
2134
2135 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
2136 do {
2137 m = vm_page_alloc_freelist_domain(domain, freelist, req);
2138 if (m != NULL)
2139 break;
2140 } while (vm_domainset_iter_page(&di, &domain, &req) == 0);
2141
2142 return (m);
2143 }
2144
2145 vm_page_t
2146 vm_page_alloc_freelist_domain(int domain, int freelist, int req)
2147 {
2148 struct vm_domain *vmd;
2149 vm_page_t m;
2150 u_int flags;
2151
2152 /*
2153 * Do not allocate reserved pages unless the req has asked for it.
2154 */
m = NULL; /* In case vm_domain_allocate() fails below. */
2155 vmd = VM_DOMAIN(domain);
2156 again:
2157 if (vm_domain_allocate(vmd, req, 1)) {
2158 vm_domain_free_lock(vmd);
2159 m = vm_phys_alloc_freelist_pages(domain, freelist,
2160 VM_FREEPOOL_DIRECT, 0);
2161 vm_domain_free_unlock(vmd);
2162 if (m == NULL)
2163 vm_domain_freecnt_inc(vmd, 1);
2164 }
2165 if (m == NULL) {
2166 if (vm_domain_alloc_fail(vmd, NULL, req))
2167 goto again;
2168 return (NULL);
2169 }
2170 vm_page_alloc_check(m);
2171
2172 /*
2173 * Initialize the page. Only the PG_ZERO flag is inherited.
2174 */
2175 m->aflags = 0;
2176 flags = 0;
2177 if ((req & VM_ALLOC_ZERO) != 0)
2178 flags = PG_ZERO;
2179 m->flags &= flags;
2180 if ((req & VM_ALLOC_WIRED) != 0) {
2181 /*
2182 * The page lock is not required for wiring a page that does
2183 * not belong to an object.
2184 */
2185 vm_wire_add(1);
2186 m->wire_count = 1;
2187 }
2188 /* Unmanaged pages don't use "act_count". */
2189 m->oflags = VPO_UNMANAGED;
2190 return (m);
2191 }
2192
2193 static int
2194 vm_page_import(void *arg, void **store, int cnt, int domain, int flags)
2195 {
2196 struct vm_domain *vmd;
2197 vm_page_t m;
2198 int i, j, n;
2199
2200 vmd = arg;
2201 /* Only import if we can bring in a full bucket. */
2202 if (cnt == 1 || !vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt))
2203 return (0);
2204 domain = vmd->vmd_domain;
2205 n = 64; /* Starting stride, arbitrary. */
2206 vm_domain_free_lock(vmd);
2207 for (i = 0; i < cnt; i+=n) {
2208 n = vm_phys_alloc_npages(domain, VM_FREELIST_DEFAULT, &m,
2209 MIN(n, cnt-i));
2210 if (n == 0)
2211 break;
2212 for (j = 0; j < n; j++)
2213 store[i+j] = m++;
2214 }
2215 vm_domain_free_unlock(vmd);
2216 if (cnt != i)
2217 vm_domain_freecnt_inc(vmd, cnt - i);
2218
2219 return (i);
2220 }
2221
2222 static void
2223 vm_page_release(void *arg, void **store, int cnt)
2224 {
2225 struct vm_domain *vmd;
2226 vm_page_t m;
2227 int i;
2228
2229 vmd = arg;
2230 vm_domain_free_lock(vmd);
2231 for (i = 0; i < cnt; i++) {
2232 m = (vm_page_t)store[i];
2233 vm_phys_free_pages(m, 0);
2234 }
2235 vm_domain_free_unlock(vmd);
2236 vm_domain_freecnt_inc(vmd, cnt);
2237 }
2238
2239 #define VPSC_ANY 0 /* No restrictions. */
2240 #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */
2241 #define VPSC_NOSUPER 2 /* Skip superpages.
*/ 2242 2243 /* 2244 * vm_page_scan_contig: 2245 * 2246 * Scan vm_page_array[] between the specified entries "m_start" and 2247 * "m_end" for a run of contiguous physical pages that satisfy the 2248 * specified conditions, and return the lowest page in the run. The 2249 * specified "alignment" determines the alignment of the lowest physical 2250 * page in the run. If the specified "boundary" is non-zero, then the 2251 * run of physical pages cannot span a physical address that is a 2252 * multiple of "boundary". 2253 * 2254 * "m_end" is never dereferenced, so it need not point to a vm_page 2255 * structure within vm_page_array[]. 2256 * 2257 * "npages" must be greater than zero. "m_start" and "m_end" must not 2258 * span a hole (or discontiguity) in the physical address space. Both 2259 * "alignment" and "boundary" must be a power of two. 2260 */ 2261 vm_page_t 2262 vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, 2263 u_long alignment, vm_paddr_t boundary, int options) 2264 { 2265 struct mtx *m_mtx; 2266 vm_object_t object; 2267 vm_paddr_t pa; 2268 vm_page_t m, m_run; 2269 #if VM_NRESERVLEVEL > 0 2270 int level; 2271 #endif 2272 int m_inc, order, run_ext, run_len; 2273 2274 KASSERT(npages > 0, ("npages is 0")); 2275 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 2276 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 2277 m_run = NULL; 2278 run_len = 0; 2279 m_mtx = NULL; 2280 for (m = m_start; m < m_end && run_len < npages; m += m_inc) { 2281 KASSERT((m->flags & PG_MARKER) == 0, 2282 ("page %p is PG_MARKER", m)); 2283 KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1, 2284 ("fictitious page %p has invalid wire count", m)); 2285 2286 /* 2287 * If the current page would be the start of a run, check its 2288 * physical address against the end, alignment, and boundary 2289 * conditions. If it doesn't satisfy these conditions, either 2290 * terminate the scan or advance to the next page that 2291 * satisfies the failed condition. 2292 */ 2293 if (run_len == 0) { 2294 KASSERT(m_run == NULL, ("m_run != NULL")); 2295 if (m + npages > m_end) 2296 break; 2297 pa = VM_PAGE_TO_PHYS(m); 2298 if ((pa & (alignment - 1)) != 0) { 2299 m_inc = atop(roundup2(pa, alignment) - pa); 2300 continue; 2301 } 2302 if (rounddown2(pa ^ (pa + ptoa(npages) - 1), 2303 boundary) != 0) { 2304 m_inc = atop(roundup2(pa, boundary) - pa); 2305 continue; 2306 } 2307 } else 2308 KASSERT(m_run != NULL, ("m_run == NULL")); 2309 2310 vm_page_change_lock(m, &m_mtx); 2311 m_inc = 1; 2312 retry: 2313 if (vm_page_held(m)) 2314 run_ext = 0; 2315 #if VM_NRESERVLEVEL > 0 2316 else if ((level = vm_reserv_level(m)) >= 0 && 2317 (options & VPSC_NORESERV) != 0) { 2318 run_ext = 0; 2319 /* Advance to the end of the reservation. */ 2320 pa = VM_PAGE_TO_PHYS(m); 2321 m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - 2322 pa); 2323 } 2324 #endif 2325 else if ((object = m->object) != NULL) { 2326 /* 2327 * The page is considered eligible for relocation if 2328 * and only if it could be laundered or reclaimed by 2329 * the page daemon. 2330 */ 2331 if (!VM_OBJECT_TRYRLOCK(object)) { 2332 mtx_unlock(m_mtx); 2333 VM_OBJECT_RLOCK(object); 2334 mtx_lock(m_mtx); 2335 if (m->object != object) { 2336 /* 2337 * The page may have been freed. 
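 * It may also have been reused by another object while the page
 * lock was dropped to acquire the object lock; in either case,
 * drop the now-stale object lock and re-examine the page.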
2338 */ 2339 VM_OBJECT_RUNLOCK(object); 2340 goto retry; 2341 } else if (vm_page_held(m)) { 2342 run_ext = 0; 2343 goto unlock; 2344 } 2345 } 2346 KASSERT((m->flags & PG_UNHOLDFREE) == 0, 2347 ("page %p is PG_UNHOLDFREE", m)); 2348 /* Don't care: PG_NODUMP, PG_ZERO. */ 2349 if (object->type != OBJT_DEFAULT && 2350 object->type != OBJT_SWAP && 2351 object->type != OBJT_VNODE) { 2352 run_ext = 0; 2353 #if VM_NRESERVLEVEL > 0 2354 } else if ((options & VPSC_NOSUPER) != 0 && 2355 (level = vm_reserv_level_iffullpop(m)) >= 0) { 2356 run_ext = 0; 2357 /* Advance to the end of the superpage. */ 2358 pa = VM_PAGE_TO_PHYS(m); 2359 m_inc = atop(roundup2(pa + 1, 2360 vm_reserv_size(level)) - pa); 2361 #endif 2362 } else if (object->memattr == VM_MEMATTR_DEFAULT && 2363 m->queue != PQ_NONE && !vm_page_busied(m)) { 2364 /* 2365 * The page is allocated but eligible for 2366 * relocation. Extend the current run by one 2367 * page. 2368 */ 2369 KASSERT(pmap_page_get_memattr(m) == 2370 VM_MEMATTR_DEFAULT, 2371 ("page %p has an unexpected memattr", m)); 2372 KASSERT((m->oflags & (VPO_SWAPINPROG | 2373 VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, 2374 ("page %p has unexpected oflags", m)); 2375 /* Don't care: VPO_NOSYNC. */ 2376 run_ext = 1; 2377 } else 2378 run_ext = 0; 2379 unlock: 2380 VM_OBJECT_RUNLOCK(object); 2381 #if VM_NRESERVLEVEL > 0 2382 } else if (level >= 0) { 2383 /* 2384 * The page is reserved but not yet allocated. In 2385 * other words, it is still free. Extend the current 2386 * run by one page. 2387 */ 2388 run_ext = 1; 2389 #endif 2390 } else if ((order = m->order) < VM_NFREEORDER) { 2391 /* 2392 * The page is enqueued in the physical memory 2393 * allocator's free page queues. Moreover, it is the 2394 * first page in a power-of-two-sized run of 2395 * contiguous free pages. Add these pages to the end 2396 * of the current run, and jump ahead. 2397 */ 2398 run_ext = 1 << order; 2399 m_inc = 1 << order; 2400 } else { 2401 /* 2402 * Skip the page for one of the following reasons: (1) 2403 * It is enqueued in the physical memory allocator's 2404 * free page queues. However, it is not the first 2405 * page in a run of contiguous free pages. (This case 2406 * rarely occurs because the scan is performed in 2407 * ascending order.) (2) It is not reserved, and it is 2408 * transitioning from free to allocated. (Conversely, 2409 * the transition from allocated to free for managed 2410 * pages is blocked by the page lock.) (3) It is 2411 * allocated but not contained by an object and not 2412 * wired, e.g., allocated by Xen's balloon driver. 2413 */ 2414 run_ext = 0; 2415 } 2416 2417 /* 2418 * Extend or reset the current run of pages. 2419 */ 2420 if (run_ext > 0) { 2421 if (run_len == 0) 2422 m_run = m; 2423 run_len += run_ext; 2424 } else { 2425 if (run_len > 0) { 2426 m_run = NULL; 2427 run_len = 0; 2428 } 2429 } 2430 } 2431 if (m_mtx != NULL) 2432 mtx_unlock(m_mtx); 2433 if (run_len >= npages) 2434 return (m_run); 2435 return (NULL); 2436 } 2437 2438 /* 2439 * vm_page_reclaim_run: 2440 * 2441 * Try to relocate each of the allocated virtual pages within the 2442 * specified run of physical pages to a new physical address. Free the 2443 * physical pages underlying the relocated virtual pages. A virtual page 2444 * is relocatable if and only if it could be laundered or reclaimed by 2445 * the page daemon. Whenever possible, a virtual page is relocated to a 2446 * physical address above "high". 
2447 * 2448 * Returns 0 if every physical page within the run was already free or 2449 * just freed by a successful relocation. Otherwise, returns a non-zero 2450 * value indicating why the last attempt to relocate a virtual page was 2451 * unsuccessful. 2452 * 2453 * "req_class" must be an allocation class. 2454 */ 2455 static int 2456 vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, 2457 vm_paddr_t high) 2458 { 2459 struct vm_domain *vmd; 2460 struct mtx *m_mtx; 2461 struct spglist free; 2462 vm_object_t object; 2463 vm_paddr_t pa; 2464 vm_page_t m, m_end, m_new; 2465 int error, order, req; 2466 2467 KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, 2468 ("req_class is not an allocation class")); 2469 SLIST_INIT(&free); 2470 error = 0; 2471 m = m_run; 2472 m_end = m_run + npages; 2473 m_mtx = NULL; 2474 for (; error == 0 && m < m_end; m++) { 2475 KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, 2476 ("page %p is PG_FICTITIOUS or PG_MARKER", m)); 2477 2478 /* 2479 * Avoid releasing and reacquiring the same page lock. 2480 */ 2481 vm_page_change_lock(m, &m_mtx); 2482 retry: 2483 if (vm_page_held(m)) 2484 error = EBUSY; 2485 else if ((object = m->object) != NULL) { 2486 /* 2487 * The page is relocated if and only if it could be 2488 * laundered or reclaimed by the page daemon. 2489 */ 2490 if (!VM_OBJECT_TRYWLOCK(object)) { 2491 mtx_unlock(m_mtx); 2492 VM_OBJECT_WLOCK(object); 2493 mtx_lock(m_mtx); 2494 if (m->object != object) { 2495 /* 2496 * The page may have been freed. 2497 */ 2498 VM_OBJECT_WUNLOCK(object); 2499 goto retry; 2500 } else if (vm_page_held(m)) { 2501 error = EBUSY; 2502 goto unlock; 2503 } 2504 } 2505 KASSERT((m->flags & PG_UNHOLDFREE) == 0, 2506 ("page %p is PG_UNHOLDFREE", m)); 2507 /* Don't care: PG_NODUMP, PG_ZERO. */ 2508 if (object->type != OBJT_DEFAULT && 2509 object->type != OBJT_SWAP && 2510 object->type != OBJT_VNODE) 2511 error = EINVAL; 2512 else if (object->memattr != VM_MEMATTR_DEFAULT) 2513 error = EINVAL; 2514 else if (m->queue != PQ_NONE && !vm_page_busied(m)) { 2515 KASSERT(pmap_page_get_memattr(m) == 2516 VM_MEMATTR_DEFAULT, 2517 ("page %p has an unexpected memattr", m)); 2518 KASSERT((m->oflags & (VPO_SWAPINPROG | 2519 VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, 2520 ("page %p has unexpected oflags", m)); 2521 /* Don't care: VPO_NOSYNC. */ 2522 if (m->valid != 0) { 2523 /* 2524 * First, try to allocate a new page 2525 * that is above "high". Failing 2526 * that, try to allocate a new page 2527 * that is below "m_run". Allocate 2528 * the new page between the end of 2529 * "m_run" and "high" only as a last 2530 * resort. 2531 */ 2532 req = req_class | VM_ALLOC_NOOBJ; 2533 if ((m->flags & PG_NODUMP) != 0) 2534 req |= VM_ALLOC_NODUMP; 2535 if (trunc_page(high) != 2536 ~(vm_paddr_t)PAGE_MASK) { 2537 m_new = vm_page_alloc_contig( 2538 NULL, 0, req, 1, 2539 round_page(high), 2540 ~(vm_paddr_t)0, 2541 PAGE_SIZE, 0, 2542 VM_MEMATTR_DEFAULT); 2543 } else 2544 m_new = NULL; 2545 if (m_new == NULL) { 2546 pa = VM_PAGE_TO_PHYS(m_run); 2547 m_new = vm_page_alloc_contig( 2548 NULL, 0, req, 1, 2549 0, pa - 1, PAGE_SIZE, 0, 2550 VM_MEMATTR_DEFAULT); 2551 } 2552 if (m_new == NULL) { 2553 pa += ptoa(npages); 2554 m_new = vm_page_alloc_contig( 2555 NULL, 0, req, 1, 2556 pa, high, PAGE_SIZE, 0, 2557 VM_MEMATTR_DEFAULT); 2558 } 2559 if (m_new == NULL) { 2560 error = ENOMEM; 2561 goto unlock; 2562 } 2563 KASSERT(m_new->wire_count == 0, 2564 ("page %p is wired", m_new)); 2565 2566 /* 2567 * Replace "m" with the new page. 
For 2568 * vm_page_replace(), "m" must be busy 2569 * and dequeued. Finally, change "m" 2570 * as if vm_page_free() was called. 2571 */ 2572 if (object->ref_count != 0) 2573 pmap_remove_all(m); 2574 m_new->aflags = m->aflags; 2575 KASSERT(m_new->oflags == VPO_UNMANAGED, 2576 ("page %p is managed", m_new)); 2577 m_new->oflags = m->oflags & VPO_NOSYNC; 2578 pmap_copy_page(m, m_new); 2579 m_new->valid = m->valid; 2580 m_new->dirty = m->dirty; 2581 m->flags &= ~PG_ZERO; 2582 vm_page_xbusy(m); 2583 vm_page_remque(m); 2584 vm_page_replace_checked(m_new, object, 2585 m->pindex, m); 2586 if (vm_page_free_prep(m, false)) 2587 SLIST_INSERT_HEAD(&free, m, 2588 plinks.s.ss); 2589 2590 /* 2591 * The new page must be deactivated 2592 * before the object is unlocked. 2593 */ 2594 vm_page_change_lock(m_new, &m_mtx); 2595 vm_page_deactivate(m_new); 2596 } else { 2597 m->flags &= ~PG_ZERO; 2598 vm_page_remque(m); 2599 vm_page_remove(m); 2600 if (vm_page_free_prep(m, false)) 2601 SLIST_INSERT_HEAD(&free, m, 2602 plinks.s.ss); 2603 KASSERT(m->dirty == 0, 2604 ("page %p is dirty", m)); 2605 } 2606 } else 2607 error = EBUSY; 2608 unlock: 2609 VM_OBJECT_WUNLOCK(object); 2610 } else { 2611 MPASS(vm_phys_domain(m) == domain); 2612 vmd = VM_DOMAIN(domain); 2613 vm_domain_free_lock(vmd); 2614 order = m->order; 2615 if (order < VM_NFREEORDER) { 2616 /* 2617 * The page is enqueued in the physical memory 2618 * allocator's free page queues. Moreover, it 2619 * is the first page in a power-of-two-sized 2620 * run of contiguous free pages. Jump ahead 2621 * to the last page within that run, and 2622 * continue from there. 2623 */ 2624 m += (1 << order) - 1; 2625 } 2626 #if VM_NRESERVLEVEL > 0 2627 else if (vm_reserv_is_page_free(m)) 2628 order = 0; 2629 #endif 2630 vm_domain_free_unlock(vmd); 2631 if (order == VM_NFREEORDER) 2632 error = EINVAL; 2633 } 2634 } 2635 if (m_mtx != NULL) 2636 mtx_unlock(m_mtx); 2637 if ((m = SLIST_FIRST(&free)) != NULL) { 2638 int cnt; 2639 2640 vmd = VM_DOMAIN(domain); 2641 cnt = 0; 2642 vm_domain_free_lock(vmd); 2643 do { 2644 MPASS(vm_phys_domain(m) == domain); 2645 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2646 vm_phys_free_pages(m, 0); 2647 cnt++; 2648 } while ((m = SLIST_FIRST(&free)) != NULL); 2649 vm_domain_free_unlock(vmd); 2650 vm_domain_freecnt_inc(vmd, cnt); 2651 } 2652 return (error); 2653 } 2654 2655 #define NRUNS 16 2656 2657 CTASSERT(powerof2(NRUNS)); 2658 2659 #define RUN_INDEX(count) ((count) & (NRUNS - 1)) 2660 2661 #define MIN_RECLAIM 8 2662 2663 /* 2664 * vm_page_reclaim_contig: 2665 * 2666 * Reclaim allocated, contiguous physical memory satisfying the specified 2667 * conditions by relocating the virtual pages using that physical memory. 2668 * Returns true if reclamation is successful and false otherwise. Since 2669 * relocation requires the allocation of physical pages, reclamation may 2670 * fail due to a shortage of free pages. When reclamation fails, callers 2671 * are expected to perform vm_wait() before retrying a failed allocation 2672 * operation, e.g., vm_page_alloc_contig(). 2673 * 2674 * The caller must always specify an allocation class through "req". 2675 * 2676 * allocation classes: 2677 * VM_ALLOC_NORMAL normal process request 2678 * VM_ALLOC_SYSTEM system *really* needs a page 2679 * VM_ALLOC_INTERRUPT interrupt time request 2680 * 2681 * The optional allocation flags are ignored. 2682 * 2683 * "npages" must be greater than zero. Both "alignment" and "boundary" 2684 * must be a power of two. 
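 *
 *	A sketch of that retry protocol, for illustration only ("obj",
 *	"pidx", and the constraint variables are placeholders, and object
 *	locking is elided):
 *
 *		while ((m = vm_page_alloc_contig(obj, pidx, req, npages,
 *		    low, high, align, bound, memattr)) == NULL) {
 *			if (!vm_page_reclaim_contig(req, npages, low, high,
 *			    align, bound))
 *				vm_wait(obj);
 *		}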
2685 */ 2686 bool 2687 vm_page_reclaim_contig_domain(int domain, int req, u_long npages, 2688 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 2689 { 2690 struct vm_domain *vmd; 2691 vm_paddr_t curr_low; 2692 vm_page_t m_run, m_runs[NRUNS]; 2693 u_long count, reclaimed; 2694 int error, i, options, req_class; 2695 2696 KASSERT(npages > 0, ("npages is 0")); 2697 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 2698 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 2699 req_class = req & VM_ALLOC_CLASS_MASK; 2700 2701 /* 2702 * The page daemon is allowed to dig deeper into the free page list. 2703 */ 2704 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 2705 req_class = VM_ALLOC_SYSTEM; 2706 2707 /* 2708 * Return if the number of free pages cannot satisfy the requested 2709 * allocation. 2710 */ 2711 vmd = VM_DOMAIN(domain); 2712 count = vmd->vmd_free_count; 2713 if (count < npages + vmd->vmd_free_reserved || (count < npages + 2714 vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || 2715 (count < npages && req_class == VM_ALLOC_INTERRUPT)) 2716 return (false); 2717 2718 /* 2719 * Scan up to three times, relaxing the restrictions ("options") on 2720 * the reclamation of reservations and superpages each time. 2721 */ 2722 for (options = VPSC_NORESERV;;) { 2723 /* 2724 * Find the highest runs that satisfy the given constraints 2725 * and restrictions, and record them in "m_runs". 2726 */ 2727 curr_low = low; 2728 count = 0; 2729 for (;;) { 2730 m_run = vm_phys_scan_contig(domain, npages, curr_low, 2731 high, alignment, boundary, options); 2732 if (m_run == NULL) 2733 break; 2734 curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); 2735 m_runs[RUN_INDEX(count)] = m_run; 2736 count++; 2737 } 2738 2739 /* 2740 * Reclaim the highest runs in LIFO (descending) order until 2741 * the number of reclaimed pages, "reclaimed", is at least 2742 * MIN_RECLAIM. Reset "reclaimed" each time because each 2743 * reclamation is idempotent, and runs will (likely) recur 2744 * from one scan to the next as restrictions are relaxed. 2745 */ 2746 reclaimed = 0; 2747 for (i = 0; count > 0 && i < NRUNS; i++) { 2748 count--; 2749 m_run = m_runs[RUN_INDEX(count)]; 2750 error = vm_page_reclaim_run(req_class, domain, npages, 2751 m_run, high); 2752 if (error == 0) { 2753 reclaimed += npages; 2754 if (reclaimed >= MIN_RECLAIM) 2755 return (true); 2756 } 2757 } 2758 2759 /* 2760 * Either relax the restrictions on the next scan or return if 2761 * the last scan had no restrictions. 2762 */ 2763 if (options == VPSC_NORESERV) 2764 options = VPSC_NOSUPER; 2765 else if (options == VPSC_NOSUPER) 2766 options = VPSC_ANY; 2767 else if (options == VPSC_ANY) 2768 return (reclaimed != 0); 2769 } 2770 } 2771 2772 bool 2773 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, 2774 u_long alignment, vm_paddr_t boundary) 2775 { 2776 struct vm_domainset_iter di; 2777 int domain; 2778 bool ret; 2779 2780 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); 2781 do { 2782 ret = vm_page_reclaim_contig_domain(domain, req, npages, low, 2783 high, alignment, boundary); 2784 if (ret) 2785 break; 2786 } while (vm_domainset_iter_page(&di, &domain, &req) == 0); 2787 2788 return (ret); 2789 } 2790 2791 /* 2792 * Set the domain in the appropriate page level domainset. 
2793 */ 2794 void 2795 vm_domain_set(struct vm_domain *vmd) 2796 { 2797 2798 mtx_lock(&vm_domainset_lock); 2799 if (!vmd->vmd_minset && vm_paging_min(vmd)) { 2800 vmd->vmd_minset = 1; 2801 DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); 2802 } 2803 if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { 2804 vmd->vmd_severeset = 1; 2805 DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); 2806 } 2807 mtx_unlock(&vm_domainset_lock); 2808 } 2809 2810 /* 2811 * Clear the domain from the appropriate page level domainset. 2812 */ 2813 void 2814 vm_domain_clear(struct vm_domain *vmd) 2815 { 2816 2817 mtx_lock(&vm_domainset_lock); 2818 if (vmd->vmd_minset && !vm_paging_min(vmd)) { 2819 vmd->vmd_minset = 0; 2820 DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); 2821 if (vm_min_waiters != 0) { 2822 vm_min_waiters = 0; 2823 wakeup(&vm_min_domains); 2824 } 2825 } 2826 if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { 2827 vmd->vmd_severeset = 0; 2828 DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); 2829 if (vm_severe_waiters != 0) { 2830 vm_severe_waiters = 0; 2831 wakeup(&vm_severe_domains); 2832 } 2833 } 2834 2835 /* 2836 * If pageout daemon needs pages, then tell it that there are 2837 * some free. 2838 */ 2839 if (vmd->vmd_pageout_pages_needed && 2840 vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { 2841 wakeup(&vmd->vmd_pageout_pages_needed); 2842 vmd->vmd_pageout_pages_needed = 0; 2843 } 2844 2845 /* See comments in vm_wait_doms(). */ 2846 if (vm_pageproc_waiters) { 2847 vm_pageproc_waiters = 0; 2848 wakeup(&vm_pageproc_waiters); 2849 } 2850 mtx_unlock(&vm_domainset_lock); 2851 } 2852 2853 /* 2854 * Wait for free pages to exceed the min threshold globally. 2855 */ 2856 void 2857 vm_wait_min(void) 2858 { 2859 2860 mtx_lock(&vm_domainset_lock); 2861 while (vm_page_count_min()) { 2862 vm_min_waiters++; 2863 msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); 2864 } 2865 mtx_unlock(&vm_domainset_lock); 2866 } 2867 2868 /* 2869 * Wait for free pages to exceed the severe threshold globally. 2870 */ 2871 void 2872 vm_wait_severe(void) 2873 { 2874 2875 mtx_lock(&vm_domainset_lock); 2876 while (vm_page_count_severe()) { 2877 vm_severe_waiters++; 2878 msleep(&vm_severe_domains, &vm_domainset_lock, PVM, 2879 "vmwait", 0); 2880 } 2881 mtx_unlock(&vm_domainset_lock); 2882 } 2883 2884 u_int 2885 vm_wait_count(void) 2886 { 2887 2888 return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); 2889 } 2890 2891 static void 2892 vm_wait_doms(const domainset_t *wdoms) 2893 { 2894 2895 /* 2896 * We use racey wakeup synchronization to avoid expensive global 2897 * locking for the pageproc when sleeping with a non-specific vm_wait. 2898 * To handle this, we only sleep for one tick in this instance. It 2899 * is expected that most allocations for the pageproc will come from 2900 * kmem or vm_page_grab* which will use the more specific and 2901 * race-free vm_wait_domain(). 2902 */ 2903 if (curproc == pageproc) { 2904 mtx_lock(&vm_domainset_lock); 2905 vm_pageproc_waiters++; 2906 msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP, 2907 "pageprocwait", 1); 2908 } else { 2909 /* 2910 * XXX Ideally we would wait only until the allocation could 2911 * be satisfied. This condition can cause new allocators to 2912 * consume all freed pages while old allocators wait. 
2913 */ 2914 mtx_lock(&vm_domainset_lock); 2915 if (DOMAINSET_SUBSET(&vm_min_domains, wdoms)) { 2916 vm_min_waiters++; 2917 msleep(&vm_min_domains, &vm_domainset_lock, PVM, 2918 "vmwait", 0); 2919 } 2920 mtx_unlock(&vm_domainset_lock); 2921 } 2922 } 2923 2924 /* 2925 * vm_wait_domain: 2926 * 2927 * Sleep until free pages are available for allocation. 2928 * - Called in various places after failed memory allocations. 2929 */ 2930 void 2931 vm_wait_domain(int domain) 2932 { 2933 struct vm_domain *vmd; 2934 domainset_t wdom; 2935 2936 vmd = VM_DOMAIN(domain); 2937 vm_domain_free_assert_unlocked(vmd); 2938 2939 if (curproc == pageproc) { 2940 mtx_lock(&vm_domainset_lock); 2941 if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { 2942 vmd->vmd_pageout_pages_needed = 1; 2943 msleep(&vmd->vmd_pageout_pages_needed, 2944 &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); 2945 } else 2946 mtx_unlock(&vm_domainset_lock); 2947 } else { 2948 if (pageproc == NULL) 2949 panic("vm_wait in early boot"); 2950 DOMAINSET_ZERO(&wdom); 2951 DOMAINSET_SET(vmd->vmd_domain, &wdom); 2952 vm_wait_doms(&wdom); 2953 } 2954 } 2955 2956 /* 2957 * vm_wait: 2958 * 2959 * Sleep until free pages are available for allocation in the 2960 * affinity domains of the obj. If obj is NULL, the domain set 2961 * for the calling thread is used. 2962 * Called in various places after failed memory allocations. 2963 */ 2964 void 2965 vm_wait(vm_object_t obj) 2966 { 2967 struct domainset *d; 2968 2969 d = NULL; 2970 2971 /* 2972 * Carefully fetch pointers only once: the struct domainset 2973 * itself is ummutable but the pointer might change. 2974 */ 2975 if (obj != NULL) 2976 d = obj->domain.dr_policy; 2977 if (d == NULL) 2978 d = curthread->td_domain.dr_policy; 2979 2980 vm_wait_doms(&d->ds_mask); 2981 } 2982 2983 /* 2984 * vm_domain_alloc_fail: 2985 * 2986 * Called when a page allocation function fails. Informs the 2987 * pagedaemon and performs the requested wait. Requires the 2988 * domain_free and object lock on entry. Returns with the 2989 * object lock held and free lock released. Returns an error when 2990 * retry is necessary. 2991 * 2992 */ 2993 static int 2994 vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) 2995 { 2996 2997 vm_domain_free_assert_unlocked(vmd); 2998 2999 atomic_add_int(&vmd->vmd_pageout_deficit, 3000 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); 3001 if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { 3002 if (object != NULL) 3003 VM_OBJECT_WUNLOCK(object); 3004 vm_wait_domain(vmd->vmd_domain); 3005 if (object != NULL) 3006 VM_OBJECT_WLOCK(object); 3007 if (req & VM_ALLOC_WAITOK) 3008 return (EAGAIN); 3009 } 3010 3011 return (0); 3012 } 3013 3014 /* 3015 * vm_waitpfault: 3016 * 3017 * Sleep until free pages are available for allocation. 3018 * - Called only in vm_fault so that processes page faulting 3019 * can be easily tracked. 3020 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing 3021 * processes will be able to grab memory first. Do not change 3022 * this balance without careful testing first. 
3023 */ 3024 void 3025 vm_waitpfault(void) 3026 { 3027 3028 mtx_lock(&vm_domainset_lock); 3029 if (vm_page_count_min()) { 3030 vm_min_waiters++; 3031 msleep(&vm_min_domains, &vm_domainset_lock, PUSER, "pfault", 0); 3032 } 3033 mtx_unlock(&vm_domainset_lock); 3034 } 3035 3036 struct vm_pagequeue * 3037 vm_page_pagequeue(vm_page_t m) 3038 { 3039 3040 return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]); 3041 } 3042 3043 /* 3044 * vm_page_dequeue: 3045 * 3046 * Remove the given page from its current page queue. 3047 * 3048 * The page must be locked. 3049 */ 3050 void 3051 vm_page_dequeue(vm_page_t m) 3052 { 3053 struct vm_pagequeue *pq; 3054 3055 vm_page_assert_locked(m); 3056 KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", 3057 m)); 3058 pq = vm_page_pagequeue(m); 3059 vm_pagequeue_lock(pq); 3060 m->queue = PQ_NONE; 3061 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3062 vm_pagequeue_cnt_dec(pq); 3063 vm_pagequeue_unlock(pq); 3064 } 3065 3066 /* 3067 * vm_page_dequeue_locked: 3068 * 3069 * Remove the given page from its current page queue. 3070 * 3071 * The page and page queue must be locked. 3072 */ 3073 void 3074 vm_page_dequeue_locked(vm_page_t m) 3075 { 3076 struct vm_pagequeue *pq; 3077 3078 vm_page_lock_assert(m, MA_OWNED); 3079 pq = vm_page_pagequeue(m); 3080 vm_pagequeue_assert_locked(pq); 3081 m->queue = PQ_NONE; 3082 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3083 vm_pagequeue_cnt_dec(pq); 3084 } 3085 3086 /* 3087 * vm_page_enqueue: 3088 * 3089 * Add the given page to the specified page queue. 3090 * 3091 * The page must be locked. 3092 */ 3093 static void 3094 vm_page_enqueue(uint8_t queue, vm_page_t m) 3095 { 3096 struct vm_pagequeue *pq; 3097 3098 vm_page_lock_assert(m, MA_OWNED); 3099 KASSERT(queue < PQ_COUNT, 3100 ("vm_page_enqueue: invalid queue %u request for page %p", 3101 queue, m)); 3102 pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue]; 3103 vm_pagequeue_lock(pq); 3104 m->queue = queue; 3105 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3106 vm_pagequeue_cnt_inc(pq); 3107 vm_pagequeue_unlock(pq); 3108 } 3109 3110 /* 3111 * vm_page_requeue: 3112 * 3113 * Move the given page to the tail of its current page queue. 3114 * 3115 * The page must be locked. 3116 */ 3117 void 3118 vm_page_requeue(vm_page_t m) 3119 { 3120 struct vm_pagequeue *pq; 3121 3122 vm_page_lock_assert(m, MA_OWNED); 3123 KASSERT(m->queue != PQ_NONE, 3124 ("vm_page_requeue: page %p is not queued", m)); 3125 pq = vm_page_pagequeue(m); 3126 vm_pagequeue_lock(pq); 3127 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3128 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3129 vm_pagequeue_unlock(pq); 3130 } 3131 3132 /* 3133 * vm_page_requeue_locked: 3134 * 3135 * Move the given page to the tail of its current page queue. 3136 * 3137 * The page queue must be locked. 3138 */ 3139 void 3140 vm_page_requeue_locked(vm_page_t m) 3141 { 3142 struct vm_pagequeue *pq; 3143 3144 KASSERT(m->queue != PQ_NONE, 3145 ("vm_page_requeue_locked: page %p is not queued", m)); 3146 pq = vm_page_pagequeue(m); 3147 vm_pagequeue_assert_locked(pq); 3148 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3149 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3150 } 3151 3152 /* 3153 * vm_page_activate: 3154 * 3155 * Put the specified page on the active list (if appropriate). 3156 * Ensure that act_count is at least ACT_INIT but do not otherwise 3157 * mess with it. 3158 * 3159 * The page must be locked. 
3160 */ 3161 void 3162 vm_page_activate(vm_page_t m) 3163 { 3164 int queue; 3165 3166 vm_page_lock_assert(m, MA_OWNED); 3167 if ((queue = m->queue) != PQ_ACTIVE) { 3168 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 3169 if (m->act_count < ACT_INIT) 3170 m->act_count = ACT_INIT; 3171 if (queue != PQ_NONE) 3172 vm_page_dequeue(m); 3173 vm_page_enqueue(PQ_ACTIVE, m); 3174 } 3175 } else { 3176 if (m->act_count < ACT_INIT) 3177 m->act_count = ACT_INIT; 3178 } 3179 } 3180 3181 /* 3182 * vm_page_free_prep: 3183 * 3184 * Prepares the given page to be put on the free list, 3185 * disassociating it from any VM object. The caller may return 3186 * the page to the free list only if this function returns true. 3187 * 3188 * The object must be locked. The page must be locked if it is 3189 * managed. For a queued managed page, the pagequeue_locked 3190 * argument specifies whether the page queue is already locked. 3191 */ 3192 bool 3193 vm_page_free_prep(vm_page_t m, bool pagequeue_locked) 3194 { 3195 3196 #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) 3197 if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { 3198 uint64_t *p; 3199 int i; 3200 p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3201 for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) 3202 KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", 3203 m, i, (uintmax_t)*p)); 3204 } 3205 #endif 3206 if ((m->oflags & VPO_UNMANAGED) == 0) { 3207 vm_page_lock_assert(m, MA_OWNED); 3208 KASSERT(!pmap_page_is_mapped(m), 3209 ("vm_page_free_toq: freeing mapped page %p", m)); 3210 } else 3211 KASSERT(m->queue == PQ_NONE, 3212 ("vm_page_free_toq: unmanaged page %p is queued", m)); 3213 VM_CNT_INC(v_tfree); 3214 3215 if (vm_page_sbusied(m)) 3216 panic("vm_page_free: freeing busy page %p", m); 3217 3218 vm_page_remove(m); 3219 3220 /* 3221 * If fictitious remove object association and 3222 * return. 3223 */ 3224 if ((m->flags & PG_FICTITIOUS) != 0) { 3225 KASSERT(m->wire_count == 1, 3226 ("fictitious page %p is not wired", m)); 3227 KASSERT(m->queue == PQ_NONE, 3228 ("fictitious page %p is queued", m)); 3229 return (false); 3230 } 3231 3232 if (m->queue != PQ_NONE) { 3233 if (pagequeue_locked) 3234 vm_page_dequeue_locked(m); 3235 else 3236 vm_page_dequeue(m); 3237 } 3238 m->valid = 0; 3239 vm_page_undirty(m); 3240 3241 if (m->wire_count != 0) 3242 panic("vm_page_free: freeing wired page %p", m); 3243 if (m->hold_count != 0) { 3244 m->flags &= ~PG_ZERO; 3245 KASSERT((m->flags & PG_UNHOLDFREE) == 0, 3246 ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); 3247 m->flags |= PG_UNHOLDFREE; 3248 return (false); 3249 } 3250 3251 /* 3252 * Restore the default memory attribute to the page. 
3253 */ 3254 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 3255 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 3256 3257 #if VM_NRESERVLEVEL > 0 3258 if (vm_reserv_free_page(m)) 3259 return (false); 3260 #endif 3261 3262 return (true); 3263 } 3264 3265 void 3266 vm_page_free_phys_pglist(struct pglist *tq) 3267 { 3268 struct vm_domain *vmd; 3269 vm_page_t m; 3270 int cnt; 3271 3272 if (TAILQ_EMPTY(tq)) 3273 return; 3274 vmd = NULL; 3275 cnt = 0; 3276 TAILQ_FOREACH(m, tq, listq) { 3277 if (vmd != vm_pagequeue_domain(m)) { 3278 if (vmd != NULL) { 3279 vm_domain_free_unlock(vmd); 3280 vm_domain_freecnt_inc(vmd, cnt); 3281 cnt = 0; 3282 } 3283 vmd = vm_pagequeue_domain(m); 3284 vm_domain_free_lock(vmd); 3285 } 3286 vm_phys_free_pages(m, 0); 3287 cnt++; 3288 } 3289 if (vmd != NULL) { 3290 vm_domain_free_unlock(vmd); 3291 vm_domain_freecnt_inc(vmd, cnt); 3292 } 3293 } 3294 3295 /* 3296 * vm_page_free_toq: 3297 * 3298 * Returns the given page to the free list, disassociating it 3299 * from any VM object. 3300 * 3301 * The object must be locked. The page must be locked if it is 3302 * managed. 3303 */ 3304 void 3305 vm_page_free_toq(vm_page_t m) 3306 { 3307 struct vm_domain *vmd; 3308 3309 if (!vm_page_free_prep(m, false)) 3310 return; 3311 3312 vmd = vm_pagequeue_domain(m); 3313 if (m->pool == VM_FREEPOOL_DEFAULT && vmd->vmd_pgcache != NULL) { 3314 uma_zfree(vmd->vmd_pgcache, m); 3315 return; 3316 } 3317 vm_domain_free_lock(vmd); 3318 vm_phys_free_pages(m, 0); 3319 vm_domain_free_unlock(vmd); 3320 vm_domain_freecnt_inc(vmd, 1); 3321 } 3322 3323 /* 3324 * vm_page_free_pages_toq: 3325 * 3326 * Returns a list of pages to the free list, disassociating it 3327 * from any VM object. In other words, this is equivalent to 3328 * calling vm_page_free_toq() for each page of a list of VM objects. 3329 * 3330 * The objects must be locked. The pages must be locked if it is 3331 * managed. 3332 */ 3333 void 3334 vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) 3335 { 3336 vm_page_t m; 3337 int count; 3338 3339 if (SLIST_EMPTY(free)) 3340 return; 3341 3342 count = 0; 3343 while ((m = SLIST_FIRST(free)) != NULL) { 3344 count++; 3345 SLIST_REMOVE_HEAD(free, plinks.s.ss); 3346 vm_page_free_toq(m); 3347 } 3348 3349 if (update_wire_count) 3350 vm_wire_sub(count); 3351 } 3352 3353 /* 3354 * vm_page_wire: 3355 * 3356 * Mark this page as wired down. If the page is fictitious, then 3357 * its wire count must remain one. 3358 * 3359 * The page must be locked. 3360 */ 3361 void 3362 vm_page_wire(vm_page_t m) 3363 { 3364 3365 vm_page_assert_locked(m); 3366 if ((m->flags & PG_FICTITIOUS) != 0) { 3367 KASSERT(m->wire_count == 1, 3368 ("vm_page_wire: fictitious page %p's wire count isn't one", 3369 m)); 3370 return; 3371 } 3372 if (m->wire_count == 0) { 3373 KASSERT((m->oflags & VPO_UNMANAGED) == 0 || 3374 m->queue == PQ_NONE, 3375 ("vm_page_wire: unmanaged page %p is queued", m)); 3376 vm_wire_add(1); 3377 } 3378 m->wire_count++; 3379 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); 3380 } 3381 3382 /* 3383 * vm_page_unwire: 3384 * 3385 * Release one wiring of the specified page, potentially allowing it to be 3386 * paged out. Returns TRUE if the number of wirings transitions to zero and 3387 * FALSE otherwise. 3388 * 3389 * Only managed pages belonging to an object can be paged out. 
If the number 3390 * of wirings transitions to zero and the page is eligible for page out, then 3391 * the page is added to the specified paging queue (unless PQ_NONE is 3392 * specified, in which case the page is dequeued if it belongs to a paging 3393 * queue). 3394 * 3395 * If a page is fictitious, then its wire count must always be one. 3396 * 3397 * A managed page must be locked. 3398 */ 3399 bool 3400 vm_page_unwire(vm_page_t m, uint8_t queue) 3401 { 3402 bool unwired; 3403 3404 KASSERT(queue < PQ_COUNT || queue == PQ_NONE, 3405 ("vm_page_unwire: invalid queue %u request for page %p", 3406 queue, m)); 3407 3408 unwired = vm_page_unwire_noq(m); 3409 if (unwired && (m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL) { 3410 if (m->queue == queue) { 3411 if (queue == PQ_ACTIVE) 3412 vm_page_reference(m); 3413 else if (queue != PQ_NONE) 3414 vm_page_requeue(m); 3415 } else { 3416 vm_page_remque(m); 3417 if (queue != PQ_NONE) { 3418 vm_page_enqueue(queue, m); 3419 if (queue == PQ_ACTIVE) 3420 /* Initialize act_count. */ 3421 vm_page_activate(m); 3422 } 3423 } 3424 } 3425 return (unwired); 3426 } 3427 3428 /* 3429 * 3430 * vm_page_unwire_noq: 3431 * 3432 * Unwire a page without (re-)inserting it into a page queue. It is up 3433 * to the caller to enqueue, requeue, or free the page as appropriate. 3434 * In most cases, vm_page_unwire() should be used instead. 3435 */ 3436 bool 3437 vm_page_unwire_noq(vm_page_t m) 3438 { 3439 3440 if ((m->oflags & VPO_UNMANAGED) == 0) 3441 vm_page_assert_locked(m); 3442 if ((m->flags & PG_FICTITIOUS) != 0) { 3443 KASSERT(m->wire_count == 1, 3444 ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); 3445 return (false); 3446 } 3447 if (m->wire_count == 0) 3448 panic("vm_page_unwire: page %p's wire count is zero", m); 3449 m->wire_count--; 3450 if (m->wire_count == 0) { 3451 vm_wire_sub(1); 3452 return (true); 3453 } else 3454 return (false); 3455 } 3456 3457 /* 3458 * Move the specified page to the inactive queue, or requeue the page if it is 3459 * already in the inactive queue. 3460 * 3461 * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive 3462 * queue. However, setting "noreuse" to TRUE will accelerate the specified 3463 * page's reclamation, but it will not unmap the page from any address space. 3464 * This is implemented by inserting the page near the head of the inactive 3465 * queue, using a marker page to guide FIFO insertion ordering. 3466 * 3467 * The page must be locked. 3468 */ 3469 static inline void 3470 _vm_page_deactivate(vm_page_t m, boolean_t noreuse) 3471 { 3472 struct vm_pagequeue *pq; 3473 int queue; 3474 3475 vm_page_assert_locked(m); 3476 3477 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 3478 pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE]; 3479 /* Avoid multiple acquisitions of the inactive queue lock. */ 3480 queue = m->queue; 3481 if (queue == PQ_INACTIVE) { 3482 vm_pagequeue_lock(pq); 3483 vm_page_dequeue_locked(m); 3484 } else { 3485 if (queue != PQ_NONE) 3486 vm_page_dequeue(m); 3487 vm_pagequeue_lock(pq); 3488 } 3489 m->queue = PQ_INACTIVE; 3490 if (noreuse) 3491 TAILQ_INSERT_BEFORE( 3492 &vm_pagequeue_domain(m)->vmd_inacthead, m, 3493 plinks.q); 3494 else 3495 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3496 vm_pagequeue_cnt_inc(pq); 3497 vm_pagequeue_unlock(pq); 3498 } 3499 } 3500 3501 /* 3502 * Move the specified page to the inactive queue, or requeue the page if it is 3503 * already in the inactive queue. 3504 * 3505 * The page must be locked. 
3506 */ 3507 void 3508 vm_page_deactivate(vm_page_t m) 3509 { 3510 3511 _vm_page_deactivate(m, FALSE); 3512 } 3513 3514 /* 3515 * Move the specified page to the inactive queue with the expectation 3516 * that it is unlikely to be reused. 3517 * 3518 * The page must be locked. 3519 */ 3520 void 3521 vm_page_deactivate_noreuse(vm_page_t m) 3522 { 3523 3524 _vm_page_deactivate(m, TRUE); 3525 } 3526 3527 /* 3528 * vm_page_launder 3529 * 3530 * Put a page in the laundry, or requeue it if it is already there. 3531 */ 3532 void 3533 vm_page_launder(vm_page_t m) 3534 { 3535 3536 vm_page_assert_locked(m); 3537 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 3538 if (m->queue == PQ_LAUNDRY) 3539 vm_page_requeue(m); 3540 else { 3541 vm_page_remque(m); 3542 vm_page_enqueue(PQ_LAUNDRY, m); 3543 } 3544 } 3545 } 3546 3547 /* 3548 * vm_page_unswappable 3549 * 3550 * Put a page in the PQ_UNSWAPPABLE holding queue. 3551 */ 3552 void 3553 vm_page_unswappable(vm_page_t m) 3554 { 3555 3556 vm_page_assert_locked(m); 3557 KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0, 3558 ("page %p already unswappable", m)); 3559 if (m->queue != PQ_NONE) 3560 vm_page_dequeue(m); 3561 vm_page_enqueue(PQ_UNSWAPPABLE, m); 3562 } 3563 3564 /* 3565 * Attempt to free the page. If it cannot be freed, do nothing. Returns true 3566 * if the page is freed and false otherwise. 3567 * 3568 * The page must be managed. The page and its containing object must be 3569 * locked. 3570 */ 3571 bool 3572 vm_page_try_to_free(vm_page_t m) 3573 { 3574 3575 vm_page_assert_locked(m); 3576 VM_OBJECT_ASSERT_WLOCKED(m->object); 3577 KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m)); 3578 if (m->dirty != 0 || vm_page_held(m) || vm_page_busied(m)) 3579 return (false); 3580 if (m->object->ref_count != 0) { 3581 pmap_remove_all(m); 3582 if (m->dirty != 0) 3583 return (false); 3584 } 3585 vm_page_free(m); 3586 return (true); 3587 } 3588 3589 /* 3590 * vm_page_advise 3591 * 3592 * Apply the specified advice to the given page. 3593 * 3594 * The object and page must be locked. 3595 */ 3596 void 3597 vm_page_advise(vm_page_t m, int advice) 3598 { 3599 3600 vm_page_assert_locked(m); 3601 VM_OBJECT_ASSERT_WLOCKED(m->object); 3602 if (advice == MADV_FREE) 3603 /* 3604 * Mark the page clean. This will allow the page to be freed 3605 * without first paging it out. MADV_FREE pages are often 3606 * quickly reused by malloc(3), so we do not do anything that 3607 * would result in a page fault on a later access. 3608 */ 3609 vm_page_undirty(m); 3610 else if (advice != MADV_DONTNEED) { 3611 if (advice == MADV_WILLNEED) 3612 vm_page_activate(m); 3613 return; 3614 } 3615 3616 /* 3617 * Clear any references to the page. Otherwise, the page daemon will 3618 * immediately reactivate the page. 3619 */ 3620 vm_page_aflag_clear(m, PGA_REFERENCED); 3621 3622 if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) 3623 vm_page_dirty(m); 3624 3625 /* 3626 * Place clean pages near the head of the inactive queue rather than 3627 * the tail, thus defeating the queue's LRU operation and ensuring that 3628 * the page will be reused quickly. Dirty pages not already in the 3629 * laundry are moved there. 3630 */ 3631 if (m->dirty == 0) 3632 vm_page_deactivate_noreuse(m); 3633 else if (!vm_page_in_laundry(m)) 3634 vm_page_launder(m); 3635 } 3636 3637 /* 3638 * Grab a page, waiting until we are waken up due to the page 3639 * changing state. We keep on waiting, if the page continues 3640 * to be in the object. 
If the page doesn't exist, first allocate it 3641 * and then conditionally zero it. 3642 * 3643 * This routine may sleep. 3644 * 3645 * The object must be locked on entry. The lock will, however, be released 3646 * and reacquired if the routine sleeps. 3647 */ 3648 vm_page_t 3649 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) 3650 { 3651 vm_page_t m; 3652 int sleep; 3653 int pflags; 3654 3655 VM_OBJECT_ASSERT_WLOCKED(object); 3656 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 3657 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 3658 ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); 3659 pflags = allocflags & 3660 ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); 3661 if ((allocflags & VM_ALLOC_NOWAIT) == 0) 3662 pflags |= VM_ALLOC_WAITFAIL; 3663 retrylookup: 3664 if ((m = vm_page_lookup(object, pindex)) != NULL) { 3665 sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? 3666 vm_page_xbusied(m) : vm_page_busied(m); 3667 if (sleep) { 3668 if ((allocflags & VM_ALLOC_NOWAIT) != 0) 3669 return (NULL); 3670 /* 3671 * Reference the page before unlocking and 3672 * sleeping so that the page daemon is less 3673 * likely to reclaim it. 3674 */ 3675 vm_page_aflag_set(m, PGA_REFERENCED); 3676 vm_page_lock(m); 3677 VM_OBJECT_WUNLOCK(object); 3678 vm_page_busy_sleep(m, "pgrbwt", (allocflags & 3679 VM_ALLOC_IGN_SBUSY) != 0); 3680 VM_OBJECT_WLOCK(object); 3681 goto retrylookup; 3682 } else { 3683 if ((allocflags & VM_ALLOC_WIRED) != 0) { 3684 vm_page_lock(m); 3685 vm_page_wire(m); 3686 vm_page_unlock(m); 3687 } 3688 if ((allocflags & 3689 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) 3690 vm_page_xbusy(m); 3691 if ((allocflags & VM_ALLOC_SBUSY) != 0) 3692 vm_page_sbusy(m); 3693 return (m); 3694 } 3695 } 3696 m = vm_page_alloc(object, pindex, pflags); 3697 if (m == NULL) { 3698 if ((allocflags & VM_ALLOC_NOWAIT) != 0) 3699 return (NULL); 3700 goto retrylookup; 3701 } 3702 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) 3703 pmap_zero_page(m); 3704 return (m); 3705 } 3706 3707 /* 3708 * Return the specified range of pages from the given object. For each 3709 * page offset within the range, if a page already exists within the object 3710 * at that offset and it is busy, then wait for it to change state. If, 3711 * instead, the page doesn't exist, then allocate it. 3712 * 3713 * The caller must always specify an allocation class. 3714 * 3715 * allocation classes: 3716 * VM_ALLOC_NORMAL normal process request 3717 * VM_ALLOC_SYSTEM system *really* needs the pages 3718 * 3719 * The caller must always specify that the pages are to be busied and/or 3720 * wired. 3721 * 3722 * optional allocation flags: 3723 * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages 3724 * VM_ALLOC_NOBUSY do not exclusive busy the page 3725 * VM_ALLOC_NOWAIT do not sleep 3726 * VM_ALLOC_SBUSY set page to sbusy state 3727 * VM_ALLOC_WIRED wire the pages 3728 * VM_ALLOC_ZERO zero and validate any invalid pages 3729 * 3730 * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it 3731 * may return a partial prefix of the requested range. 
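 *
 *	A hedged usage sketch, not copied from any in-tree caller ("obj",
 *	"start", "ma", and "n" are placeholders):
 *
 *		VM_OBJECT_WLOCK(obj);
 *		got = vm_page_grab_pages(obj, start,
 *		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOWAIT,
 *		    ma, n);
 *		VM_OBJECT_WUNLOCK(obj);
 *
 *	Because VM_ALLOC_NOWAIT is passed, "got" may be smaller than "n"
 *	and the caller must be prepared to handle the shortfall.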
3732 */ 3733 int 3734 vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, 3735 vm_page_t *ma, int count) 3736 { 3737 vm_page_t m, mpred; 3738 int pflags; 3739 int i; 3740 bool sleep; 3741 3742 VM_OBJECT_ASSERT_WLOCKED(object); 3743 KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, 3744 ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); 3745 KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || 3746 (allocflags & VM_ALLOC_WIRED) != 0, 3747 ("vm_page_grab_pages: the pages must be busied or wired")); 3748 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 3749 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 3750 ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch")); 3751 if (count == 0) 3752 return (0); 3753 pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | 3754 VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY); 3755 if ((allocflags & VM_ALLOC_NOWAIT) == 0) 3756 pflags |= VM_ALLOC_WAITFAIL; 3757 i = 0; 3758 retrylookup: 3759 m = vm_radix_lookup_le(&object->rtree, pindex + i); 3760 if (m == NULL || m->pindex != pindex + i) { 3761 mpred = m; 3762 m = NULL; 3763 } else 3764 mpred = TAILQ_PREV(m, pglist, listq); 3765 for (; i < count; i++) { 3766 if (m != NULL) { 3767 sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? 3768 vm_page_xbusied(m) : vm_page_busied(m); 3769 if (sleep) { 3770 if ((allocflags & VM_ALLOC_NOWAIT) != 0) 3771 break; 3772 /* 3773 * Reference the page before unlocking and 3774 * sleeping so that the page daemon is less 3775 * likely to reclaim it. 3776 */ 3777 vm_page_aflag_set(m, PGA_REFERENCED); 3778 vm_page_lock(m); 3779 VM_OBJECT_WUNLOCK(object); 3780 vm_page_busy_sleep(m, "grbmaw", (allocflags & 3781 VM_ALLOC_IGN_SBUSY) != 0); 3782 VM_OBJECT_WLOCK(object); 3783 goto retrylookup; 3784 } 3785 if ((allocflags & VM_ALLOC_WIRED) != 0) { 3786 vm_page_lock(m); 3787 vm_page_wire(m); 3788 vm_page_unlock(m); 3789 } 3790 if ((allocflags & (VM_ALLOC_NOBUSY | 3791 VM_ALLOC_SBUSY)) == 0) 3792 vm_page_xbusy(m); 3793 if ((allocflags & VM_ALLOC_SBUSY) != 0) 3794 vm_page_sbusy(m); 3795 } else { 3796 m = vm_page_alloc_after(object, pindex + i, 3797 pflags | VM_ALLOC_COUNT(count - i), mpred); 3798 if (m == NULL) { 3799 if ((allocflags & VM_ALLOC_NOWAIT) != 0) 3800 break; 3801 goto retrylookup; 3802 } 3803 } 3804 if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) { 3805 if ((m->flags & PG_ZERO) == 0) 3806 pmap_zero_page(m); 3807 m->valid = VM_PAGE_BITS_ALL; 3808 } 3809 ma[i] = mpred = m; 3810 m = vm_page_next(m); 3811 } 3812 return (i); 3813 } 3814 3815 /* 3816 * Mapping function for valid or dirty bits in a page. 3817 * 3818 * Inputs are required to range within a page. 3819 */ 3820 vm_page_bits_t 3821 vm_page_bits(int base, int size) 3822 { 3823 int first_bit; 3824 int last_bit; 3825 3826 KASSERT( 3827 base + size <= PAGE_SIZE, 3828 ("vm_page_bits: illegal base/size %d/%d", base, size) 3829 ); 3830 3831 if (size == 0) /* handle degenerate case */ 3832 return (0); 3833 3834 first_bit = base >> DEV_BSHIFT; 3835 last_bit = (base + size - 1) >> DEV_BSHIFT; 3836 3837 return (((vm_page_bits_t)2 << last_bit) - 3838 ((vm_page_bits_t)1 << first_bit)); 3839 } 3840 3841 /* 3842 * vm_page_set_valid_range: 3843 * 3844 * Sets portions of a page valid. The arguments are expected 3845 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 3846 * of any partial chunks touched by the range. The invalid portion of 3847 * such chunks will be zeroed. 3848 * 3849 * (base + size) must be less then or equal to PAGE_SIZE. 
3850 */ 3851 void 3852 vm_page_set_valid_range(vm_page_t m, int base, int size) 3853 { 3854 int endoff, frag; 3855 3856 VM_OBJECT_ASSERT_WLOCKED(m->object); 3857 if (size == 0) /* handle degenerate case */ 3858 return; 3859 3860 /* 3861 * If the base is not DEV_BSIZE aligned and the valid 3862 * bit is clear, we have to zero out a portion of the 3863 * first block. 3864 */ 3865 if ((frag = rounddown2(base, DEV_BSIZE)) != base && 3866 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) 3867 pmap_zero_page_area(m, frag, base - frag); 3868 3869 /* 3870 * If the ending offset is not DEV_BSIZE aligned and the 3871 * valid bit is clear, we have to zero out a portion of 3872 * the last block. 3873 */ 3874 endoff = base + size; 3875 if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && 3876 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) 3877 pmap_zero_page_area(m, endoff, 3878 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 3879 3880 /* 3881 * Assert that no previously invalid block that is now being validated 3882 * is already dirty. 3883 */ 3884 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, 3885 ("vm_page_set_valid_range: page %p is dirty", m)); 3886 3887 /* 3888 * Set valid bits inclusive of any overlap. 3889 */ 3890 m->valid |= vm_page_bits(base, size); 3891 } 3892 3893 /* 3894 * Clear the given bits from the specified page's dirty field. 3895 */ 3896 static __inline void 3897 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) 3898 { 3899 uintptr_t addr; 3900 #if PAGE_SIZE < 16384 3901 int shift; 3902 #endif 3903 3904 /* 3905 * If the object is locked and the page is neither exclusive busy nor 3906 * write mapped, then the page's dirty field cannot possibly be 3907 * set by a concurrent pmap operation. 3908 */ 3909 VM_OBJECT_ASSERT_WLOCKED(m->object); 3910 if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) 3911 m->dirty &= ~pagebits; 3912 else { 3913 /* 3914 * The pmap layer can call vm_page_dirty() without 3915 * holding a distinguished lock. The combination of 3916 * the object's lock and an atomic operation suffice 3917 * to guarantee consistency of the page dirty field. 3918 * 3919 * For PAGE_SIZE == 32768 case, compiler already 3920 * properly aligns the dirty field, so no forcible 3921 * alignment is needed. Only require existence of 3922 * atomic_clear_64 when page size is 32768. 3923 */ 3924 addr = (uintptr_t)&m->dirty; 3925 #if PAGE_SIZE == 32768 3926 atomic_clear_64((uint64_t *)addr, pagebits); 3927 #elif PAGE_SIZE == 16384 3928 atomic_clear_32((uint32_t *)addr, pagebits); 3929 #else /* PAGE_SIZE <= 8192 */ 3930 /* 3931 * Use a trick to perform a 32-bit atomic on the 3932 * containing aligned word, to not depend on the existence 3933 * of atomic_clear_{8, 16}. 3934 */ 3935 shift = addr & (sizeof(uint32_t) - 1); 3936 #if BYTE_ORDER == BIG_ENDIAN 3937 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; 3938 #else 3939 shift *= NBBY; 3940 #endif 3941 addr &= ~(sizeof(uint32_t) - 1); 3942 atomic_clear_32((uint32_t *)addr, pagebits << shift); 3943 #endif /* PAGE_SIZE */ 3944 } 3945 } 3946 3947 /* 3948 * vm_page_set_validclean: 3949 * 3950 * Sets portions of a page valid and clean. The arguments are expected 3951 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 3952 * of any partial chunks touched by the range. The invalid portion of 3953 * such chunks will be zero'd. 3954 * 3955 * (base + size) must be less then or equal to PAGE_SIZE. 
3956 */ 3957 void 3958 vm_page_set_validclean(vm_page_t m, int base, int size) 3959 { 3960 vm_page_bits_t oldvalid, pagebits; 3961 int endoff, frag; 3962 3963 VM_OBJECT_ASSERT_WLOCKED(m->object); 3964 if (size == 0) /* handle degenerate case */ 3965 return; 3966 3967 /* 3968 * If the base is not DEV_BSIZE aligned and the valid 3969 * bit is clear, we have to zero out a portion of the 3970 * first block. 3971 */ 3972 if ((frag = rounddown2(base, DEV_BSIZE)) != base && 3973 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) 3974 pmap_zero_page_area(m, frag, base - frag); 3975 3976 /* 3977 * If the ending offset is not DEV_BSIZE aligned and the 3978 * valid bit is clear, we have to zero out a portion of 3979 * the last block. 3980 */ 3981 endoff = base + size; 3982 if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && 3983 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) 3984 pmap_zero_page_area(m, endoff, 3985 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 3986 3987 /* 3988 * Set valid, clear dirty bits. If validating the entire 3989 * page we can safely clear the pmap modify bit. We also 3990 * use this opportunity to clear the VPO_NOSYNC flag. If a process 3991 * takes a write fault on a MAP_NOSYNC memory area the flag will 3992 * be set again. 3993 * 3994 * We set valid bits inclusive of any overlap, but we can only 3995 * clear dirty bits for DEV_BSIZE chunks that are fully within 3996 * the range. 3997 */ 3998 oldvalid = m->valid; 3999 pagebits = vm_page_bits(base, size); 4000 m->valid |= pagebits; 4001 #if 0 /* NOT YET */ 4002 if ((frag = base & (DEV_BSIZE - 1)) != 0) { 4003 frag = DEV_BSIZE - frag; 4004 base += frag; 4005 size -= frag; 4006 if (size < 0) 4007 size = 0; 4008 } 4009 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); 4010 #endif 4011 if (base == 0 && size == PAGE_SIZE) { 4012 /* 4013 * The page can only be modified within the pmap if it is 4014 * mapped, and it can only be mapped if it was previously 4015 * fully valid. 4016 */ 4017 if (oldvalid == VM_PAGE_BITS_ALL) 4018 /* 4019 * Perform the pmap_clear_modify() first. Otherwise, 4020 * a concurrent pmap operation, such as 4021 * pmap_protect(), could clear a modification in the 4022 * pmap and set the dirty field on the page before 4023 * pmap_clear_modify() had begun and after the dirty 4024 * field was cleared here. 4025 */ 4026 pmap_clear_modify(m); 4027 m->dirty = 0; 4028 m->oflags &= ~VPO_NOSYNC; 4029 } else if (oldvalid != VM_PAGE_BITS_ALL) 4030 m->dirty &= ~pagebits; 4031 else 4032 vm_page_clear_dirty_mask(m, pagebits); 4033 } 4034 4035 void 4036 vm_page_clear_dirty(vm_page_t m, int base, int size) 4037 { 4038 4039 vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); 4040 } 4041 4042 /* 4043 * vm_page_set_invalid: 4044 * 4045 * Invalidates DEV_BSIZE'd chunks within a page. Both the 4046 * valid and dirty bits for the effected areas are cleared. 
/*
 * vm_page_set_invalid:
 *
 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
 *	valid and dirty bits for the affected areas are cleared.
 */
void
vm_page_set_invalid(vm_page_t m, int base, int size)
{
	vm_page_bits_t bits;
	vm_object_t object;

	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
	    size >= object->un_pager.vnp.vnp_size)
		bits = VM_PAGE_BITS_ALL;
	else
		bits = vm_page_bits(base, size);
	if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
	    bits != 0)
		pmap_remove_all(m);
	KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
	    !pmap_page_is_mapped(m),
	    ("vm_page_set_invalid: page %p is mapped", m));
	m->valid &= ~bits;
	m->dirty &= ~bits;
}

/*
 * vm_page_zero_invalid()
 *
 *	The kernel assumes that the invalid portions of a page contain
 *	garbage, but such pages can be mapped into memory by user code.
 *	When this occurs, we must zero out the non-valid portions of the
 *	page so user code sees what it expects.
 *
 *	Pages are most often semi-valid when the end of a file is mapped
 *	into memory and the file's size is not page aligned.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	VM_OBJECT_ASSERT_WLOCKED(m->object);
	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
	 * valid bit may be set) have already been zeroed by
	 * vm_page_set_validclean().
	 */
	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & ((vm_page_bits_t)1 << i))) {
			if (i > b) {
				pmap_zero_page_area(m,
				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zero'd areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
	 */
	if (setvalid)
		m->valid = VM_PAGE_BITS_ALL;
}
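
/*
 * Illustrative sketch (not compiled): how the scan in vm_page_zero_invalid()
 * groups invalid blocks into runs, assuming DEV_BSIZE == 512 and
 * PAGE_SIZE == 4096 (eight blocks per page).  The value of m->valid below
 * is hypothetical.
 */
#if 0
static void
vm_page_zero_invalid_example(vm_page_t m)
{

	/*
	 * With m->valid == 0x0f (blocks 0-3 valid, blocks 4-7 invalid),
	 * the loop finds the single invalid run [4, 8) and issues one
	 * call, pmap_zero_page_area(m, 4 << DEV_BSHIFT, 4 << DEV_BSHIFT),
	 * zeroing bytes [2048, 4096); because setvalid is TRUE, the whole
	 * page is then marked valid.
	 */
	vm_page_zero_invalid(m, TRUE);
}
#endif
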
/*
 * vm_page_is_valid:
 *
 *	Is (partial) page valid?  Note that when size == 0 this returns
 *	FALSE in the degenerate case where the page is entirely invalid,
 *	and TRUE otherwise.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
	vm_page_bits_t bits;

	VM_OBJECT_ASSERT_LOCKED(m->object);
	bits = vm_page_bits(base, size);
	return (m->valid != 0 && (m->valid & bits) == bits);
}

/*
 * Returns true if all of the specified predicates are true for the entire
 * (super)page and false otherwise.
 */
bool
vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
{
	vm_object_t object;
	int i, npages;

	object = m->object;
	if (skip_m != NULL && skip_m->object != object)
		return (false);
	VM_OBJECT_ASSERT_LOCKED(object);
	npages = atop(pagesizes[m->psind]);

	/*
	 * The physically contiguous pages that make up a superpage, i.e., a
	 * page with a page size index ("psind") greater than zero, will
	 * occupy adjacent entries in vm_page_array[].
	 */
	for (i = 0; i < npages; i++) {
		/* Always test object consistency, including "skip_m". */
		if (m[i].object != object)
			return (false);
		if (&m[i] == skip_m)
			continue;
		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
			return (false);
		if ((flags & PS_ALL_DIRTY) != 0) {
			/*
			 * Calling vm_page_test_dirty() or pmap_is_modified()
			 * might stop this case from spuriously returning
			 * "false".  However, that would require a write lock
			 * on the object containing "m[i]".
			 */
			if (m[i].dirty != VM_PAGE_BITS_ALL)
				return (false);
		}
		if ((flags & PS_ALL_VALID) != 0 &&
		    m[i].valid != VM_PAGE_BITS_ALL)
			return (false);
	}
	return (true);
}

/*
 * Set the page's dirty bits if the page is modified.
 */
void
vm_page_test_dirty(vm_page_t m)
{

	VM_OBJECT_ASSERT_WLOCKED(m->object);
	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
		vm_page_dirty(m);
}

void
vm_page_lock_KBI(vm_page_t m, const char *file, int line)
{

	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
}

void
vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
{

	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
}

int
vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
{

	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
}

#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
void
vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
{

	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
}

void
vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
{

	mtx_assert_(vm_page_lockptr(m), a, file, line);
}
#endif

#ifdef INVARIANTS
void
vm_page_object_lock_assert(vm_page_t m)
{

	/*
	 * Certain of the page's fields may only be modified by the
	 * holder of the containing object's lock or the exclusive busy
	 * holder.  Unfortunately, the holder of the exclusive busy is
	 * not recorded, and thus cannot be checked here.
	 */
	if (m->object != NULL && !vm_page_xbusied(m))
		VM_OBJECT_ASSERT_WLOCKED(m->object);
}
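
/*
 * Illustrative sketch (not compiled): the locking pattern that
 * vm_page_object_lock_assert() enforces.  A thread that does not hold the
 * page's exclusive busy must hold the object write lock while it updates
 * protected page fields such as valid or dirty.  The function name is
 * hypothetical.
 */
#if 0
static void
vm_page_field_update_example(vm_object_t object, vm_page_t m)
{

	VM_OBJECT_WLOCK(object);
	/* Safe: the object write lock is held, satisfying the assertion. */
	vm_page_set_valid_range(m, 0, PAGE_SIZE);
	VM_OBJECT_WUNLOCK(object);
}
#endif
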
void
vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
{

	if ((bits & PGA_WRITEABLE) == 0)
		return;

	/*
	 * The PGA_WRITEABLE flag can only be set if the page is
	 * managed, is exclusively busied or the object is locked.
	 * Currently, this flag is only set by pmap_enter().
	 */
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("PGA_WRITEABLE on unmanaged page"));
	if (!vm_page_xbusied(m))
		VM_OBJECT_ASSERT_LOCKED(m->object);
}
#endif

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

DB_SHOW_COMMAND(page, vm_page_print_page_info)
{

	db_printf("vm_cnt.v_free_count: %d\n", vm_free_count());
	db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count());
	db_printf("vm_cnt.v_active_count: %d\n", vm_active_count());
	db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count());
	db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count());
	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
}

DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
	int dom;

	db_printf("pq_free %d\n", vm_free_count());
	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf(
    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
		    dom,
		    vm_dom[dom].vmd_page_count,
		    vm_dom[dom].vmd_free_count,
		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt,
		    vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt);
	}
}

DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
{
	vm_page_t m;
	boolean_t phys;

	if (!have_addr) {
		db_printf("show pginfo addr\n");
		return;
	}

	phys = strchr(modif, 'p') != NULL;
	if (phys)
		m = PHYS_TO_VM_PAGE(addr);
	else
		m = (vm_page_t)addr;
	db_printf(
    "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
    "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
	    m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
}
#endif /* DDB */
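
/*
 * Usage note (assuming the standard ddb(4) command syntax): the commands
 * defined above are invoked from the in-kernel debugger prompt, for example
 *
 *	db> show page
 *	db> show pageq
 *	db> show pginfo <vm_page address>
 *	db> show pginfo/p <physical address>
 *
 * where the "p" modifier makes "show pginfo" treat its argument as a
 * physical address and translate it with PHYS_TO_VM_PAGE().
 */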