1 /*- 2 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * The Mach Operating System project at Carnegie-Mellon University. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 /*- 37 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 38 * All rights reserved. 39 * 40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 41 * 42 * Permission to use, copy, modify and distribute this software and 43 * its documentation is hereby granted, provided that both the copyright 44 * notice and this permission notice appear in all copies of the 45 * software, derivative works or modified versions, and any portions 46 * thereof, and that both notices appear in supporting documentation. 47 * 48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 51 * 52 * Carnegie Mellon requests users of this software to return to 53 * 54 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 55 * School of Computer Science 56 * Carnegie Mellon University 57 * Pittsburgh PA 15213-3890 58 * 59 * any improvements or extensions that they make and grant Carnegie the 60 * rights to redistribute these changes. 61 */ 62 63 /* 64 * Resident memory management module. 
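 *
 * This file implements the machine-independent vm_page layer: boot-time
 * initialization of page structures, the page busy lock, wiring, insertion
 * into and removal from VM objects, page queue handling, and page
 * allocation.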
65 */ 66 67 #include <sys/cdefs.h> 68 #include "opt_vm.h" 69 70 #include <sys/param.h> 71 #include <sys/systm.h> 72 #include <sys/counter.h> 73 #include <sys/domainset.h> 74 #include <sys/kernel.h> 75 #include <sys/limits.h> 76 #include <sys/linker.h> 77 #include <sys/lock.h> 78 #include <sys/malloc.h> 79 #include <sys/mman.h> 80 #include <sys/msgbuf.h> 81 #include <sys/mutex.h> 82 #include <sys/proc.h> 83 #include <sys/rwlock.h> 84 #include <sys/sleepqueue.h> 85 #include <sys/sbuf.h> 86 #include <sys/sched.h> 87 #include <sys/smp.h> 88 #include <sys/sysctl.h> 89 #include <sys/vmmeter.h> 90 #include <sys/vnode.h> 91 92 #include <vm/vm.h> 93 #include <vm/pmap.h> 94 #include <vm/vm_param.h> 95 #include <vm/vm_domainset.h> 96 #include <vm/vm_kern.h> 97 #include <vm/vm_map.h> 98 #include <vm/vm_object.h> 99 #include <vm/vm_page.h> 100 #include <vm/vm_pageout.h> 101 #include <vm/vm_phys.h> 102 #include <vm/vm_pagequeue.h> 103 #include <vm/vm_pager.h> 104 #include <vm/vm_radix.h> 105 #include <vm/vm_reserv.h> 106 #include <vm/vm_extern.h> 107 #include <vm/vm_dumpset.h> 108 #include <vm/uma.h> 109 #include <vm/uma_int.h> 110 111 #include <machine/md_var.h> 112 113 struct vm_domain vm_dom[MAXMEMDOM]; 114 115 DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); 116 117 struct mtx_padalign __exclusive_cache_line vm_domainset_lock; 118 /* The following fields are protected by the domainset lock. */ 119 domainset_t __exclusive_cache_line vm_min_domains; 120 domainset_t __exclusive_cache_line vm_severe_domains; 121 static int vm_min_waiters; 122 static int vm_severe_waiters; 123 static int vm_pageproc_waiters; 124 125 static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 126 "VM page statistics"); 127 128 static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries); 129 SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries, 130 CTLFLAG_RD, &pqstate_commit_retries, 131 "Number of failed per-page atomic queue state updates"); 132 133 static COUNTER_U64_DEFINE_EARLY(queue_ops); 134 SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, 135 CTLFLAG_RD, &queue_ops, 136 "Number of batched queue operations"); 137 138 static COUNTER_U64_DEFINE_EARLY(queue_nops); 139 SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops, 140 CTLFLAG_RD, &queue_nops, 141 "Number of batched queue operations with no effects"); 142 143 static unsigned long nofreeq_size; 144 SYSCTL_ULONG(_vm_stats_page, OID_AUTO, nofreeq_size, CTLFLAG_RD, 145 &nofreeq_size, 0, 146 "Size of the nofree queue"); 147 148 /* 149 * bogus page -- for I/O to/from partially complete buffers, 150 * or for paging into sparsely invalid regions. 
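 * (bogus_page is substituted for already-valid pages within a paging run
 * so that the I/O cannot overwrite their contents; vm_page_relookup() is
 * later used to swap the real pages back in.)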
151 */ 152 vm_page_t bogus_page; 153 154 vm_page_t vm_page_array; 155 long vm_page_array_size; 156 long first_page; 157 158 struct bitset *vm_page_dump; 159 long vm_page_dump_pages; 160 161 static TAILQ_HEAD(, vm_page) blacklist_head; 162 static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); 163 SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | 164 CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); 165 166 static uma_zone_t fakepg_zone; 167 168 static void vm_page_alloc_check(vm_page_t m); 169 static vm_page_t vm_page_alloc_nofree_domain(int domain, int req); 170 static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, 171 vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked); 172 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); 173 static void vm_page_enqueue(vm_page_t m, uint8_t queue); 174 static bool vm_page_free_prep(vm_page_t m); 175 static void vm_page_free_toq(vm_page_t m); 176 static void vm_page_init(void *dummy); 177 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object); 178 static void vm_page_mvqueue(vm_page_t m, const uint8_t queue, 179 const uint16_t nflag); 180 static int vm_page_reclaim_run(int req_class, int domain, u_long npages, 181 vm_page_t m_run, vm_paddr_t high); 182 static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse); 183 static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, 184 int req); 185 static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, 186 int flags); 187 static void vm_page_zone_release(void *arg, void **store, int cnt); 188 189 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); 190 191 static void 192 vm_page_init(void *dummy) 193 { 194 195 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, 196 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 197 bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_NOFREE); 198 } 199 200 static int pgcache_zone_max_pcpu; 201 SYSCTL_INT(_vm, OID_AUTO, pgcache_zone_max_pcpu, 202 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pgcache_zone_max_pcpu, 0, 203 "Per-CPU page cache size"); 204 205 /* 206 * The cache page zone is initialized later since we need to be able to allocate 207 * pages before UMA is fully initialized. 208 */ 209 static void 210 vm_page_init_cache_zones(void *dummy __unused) 211 { 212 struct vm_domain *vmd; 213 struct vm_pgcache *pgcache; 214 int cache, domain, maxcache, pool; 215 216 TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &pgcache_zone_max_pcpu); 217 maxcache = pgcache_zone_max_pcpu * mp_ncpus; 218 for (domain = 0; domain < vm_ndomains; domain++) { 219 vmd = VM_DOMAIN(domain); 220 for (pool = 0; pool < VM_NFREEPOOL; pool++) { 221 #ifdef VM_FREEPOOL_LAZYINIT 222 if (pool == VM_FREEPOOL_LAZYINIT) 223 continue; 224 #endif 225 pgcache = &vmd->vmd_pgcache[pool]; 226 pgcache->domain = domain; 227 pgcache->pool = pool; 228 pgcache->zone = uma_zcache_create("vm pgcache", 229 PAGE_SIZE, NULL, NULL, NULL, NULL, 230 vm_page_zone_import, vm_page_zone_release, pgcache, 231 UMA_ZONE_VM); 232 233 /* 234 * Limit each pool's zone to 0.1% of the pages in the 235 * domain. 236 */ 237 cache = maxcache != 0 ? maxcache : 238 vmd->vmd_page_count / 1000; 239 uma_zone_set_maxcache(pgcache->zone, cache); 240 } 241 } 242 } 243 SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); 244 245 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. 
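 * (A 32K page carries 32768 / DEV_BSIZE == 64 valid/dirty bits, so the
 * page bit masks must fit in a 64-bit word.)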
*/ 246 #if PAGE_SIZE == 32768 247 #ifdef CTASSERT 248 CTASSERT(sizeof(u_long) >= 8); 249 #endif 250 #endif 251 252 /* 253 * vm_set_page_size: 254 * 255 * Sets the page size, perhaps based upon the memory 256 * size. Must be called before any use of page-size 257 * dependent functions. 258 */ 259 void 260 vm_set_page_size(void) 261 { 262 if (vm_cnt.v_page_size == 0) 263 vm_cnt.v_page_size = PAGE_SIZE; 264 if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) 265 panic("vm_set_page_size: page size not a power of two"); 266 } 267 268 /* 269 * vm_page_blacklist_next: 270 * 271 * Find the next entry in the provided string of blacklist 272 * addresses. Entries are separated by space, comma, or newline. 273 * If an invalid integer is encountered then the rest of the 274 * string is skipped. Updates the list pointer to the next 275 * character, or NULL if the string is exhausted or invalid. 276 */ 277 static vm_paddr_t 278 vm_page_blacklist_next(char **list, char *end) 279 { 280 vm_paddr_t bad; 281 char *cp, *pos; 282 283 if (list == NULL || *list == NULL) 284 return (0); 285 if (**list =='\0') { 286 *list = NULL; 287 return (0); 288 } 289 290 /* 291 * If there's no end pointer then the buffer is coming from 292 * the kenv and we know it's null-terminated. 293 */ 294 if (end == NULL) 295 end = *list + strlen(*list); 296 297 /* Ensure that strtoq() won't walk off the end */ 298 if (*end != '\0') { 299 if (*end == '\n' || *end == ' ' || *end == ',') 300 *end = '\0'; 301 else { 302 printf("Blacklist not terminated, skipping\n"); 303 *list = NULL; 304 return (0); 305 } 306 } 307 308 for (pos = *list; *pos != '\0'; pos = cp) { 309 bad = strtoq(pos, &cp, 0); 310 if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { 311 if (bad == 0) { 312 if (++cp < end) 313 continue; 314 else 315 break; 316 } 317 } else 318 break; 319 if (*cp == '\0' || ++cp >= end) 320 *list = NULL; 321 else 322 *list = cp; 323 return (trunc_page(bad)); 324 } 325 printf("Garbage in RAM blacklist, skipping\n"); 326 *list = NULL; 327 return (0); 328 } 329 330 bool 331 vm_page_blacklist_add(vm_paddr_t pa, bool verbose) 332 { 333 struct vm_domain *vmd; 334 vm_page_t m; 335 bool found; 336 337 m = vm_phys_paddr_to_vm_page(pa); 338 if (m == NULL) 339 return (true); /* page does not exist, no failure */ 340 341 vmd = VM_DOMAIN(vm_phys_domain(pa)); 342 vm_domain_free_lock(vmd); 343 found = vm_phys_unfree_page(pa); 344 vm_domain_free_unlock(vmd); 345 if (found) { 346 vm_domain_freecnt_inc(vmd, -1); 347 TAILQ_INSERT_TAIL(&blacklist_head, m, plinks.q); 348 if (verbose) 349 printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); 350 } 351 return (found); 352 } 353 354 /* 355 * vm_page_blacklist_check: 356 * 357 * Iterate through the provided string of blacklist addresses, pulling 358 * each entry out of the physical allocator free list and putting it 359 * onto a list for reporting via the vm.page_blacklist sysctl. 360 */ 361 static void 362 vm_page_blacklist_check(char *list, char *end) 363 { 364 vm_paddr_t pa; 365 char *next; 366 367 next = list; 368 while (next != NULL) { 369 if ((pa = vm_page_blacklist_next(&next, end)) == 0) 370 continue; 371 vm_page_blacklist_add(pa, bootverbose); 372 } 373 } 374 375 /* 376 * vm_page_blacklist_load: 377 * 378 * Search for a special module named "ram_blacklist". It'll be a 379 * plain text file provided by the user via the loader directive 380 * of the same name. 
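 * The file is a list of physical addresses, one for each bad page,
 * separated by spaces, commas, or newlines, for example
 * "0x18f45000,0x7a301000" (addresses shown here are illustrative only).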
381 */ 382 static void 383 vm_page_blacklist_load(char **list, char **end) 384 { 385 void *mod; 386 u_char *ptr; 387 u_int len; 388 389 mod = NULL; 390 ptr = NULL; 391 392 mod = preload_search_by_type("ram_blacklist"); 393 if (mod != NULL) { 394 ptr = preload_fetch_addr(mod); 395 len = preload_fetch_size(mod); 396 } 397 *list = ptr; 398 if (ptr != NULL) 399 *end = ptr + len; 400 else 401 *end = NULL; 402 return; 403 } 404 405 static int 406 sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) 407 { 408 vm_page_t m; 409 struct sbuf sbuf; 410 int error, first; 411 412 first = 1; 413 error = sysctl_wire_old_buffer(req, 0); 414 if (error != 0) 415 return (error); 416 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 417 TAILQ_FOREACH(m, &blacklist_head, plinks.q) { 418 sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", 419 (uintmax_t)m->phys_addr); 420 first = 0; 421 } 422 error = sbuf_finish(&sbuf); 423 sbuf_delete(&sbuf); 424 return (error); 425 } 426 427 /* 428 * Initialize a dummy page for use in scans of the specified paging queue. 429 * In principle, this function only needs to set the flag PG_MARKER. 430 * Nonetheless, it write busies the page as a safety precaution. 431 */ 432 void 433 vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags) 434 { 435 436 bzero(marker, sizeof(*marker)); 437 marker->flags = PG_MARKER; 438 marker->a.flags = aflags; 439 marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE; 440 marker->a.queue = queue; 441 } 442 443 static void 444 vm_page_domain_init(int domain) 445 { 446 struct vm_domain *vmd; 447 struct vm_pagequeue *pq; 448 int i; 449 450 vmd = VM_DOMAIN(domain); 451 bzero(vmd, sizeof(*vmd)); 452 *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = 453 "vm inactive pagequeue"; 454 *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = 455 "vm active pagequeue"; 456 *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = 457 "vm laundry pagequeue"; 458 *__DECONST(const char **, 459 &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = 460 "vm unswappable pagequeue"; 461 vmd->vmd_domain = domain; 462 vmd->vmd_page_count = 0; 463 vmd->vmd_free_count = 0; 464 vmd->vmd_segs = 0; 465 vmd->vmd_oom = false; 466 vmd->vmd_helper_threads_enabled = true; 467 for (i = 0; i < PQ_COUNT; i++) { 468 pq = &vmd->vmd_pagequeues[i]; 469 TAILQ_INIT(&pq->pq_pl); 470 mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", 471 MTX_DEF | MTX_DUPOK); 472 pq->pq_pdpages = 0; 473 vm_page_init_marker(&vmd->vmd_markers[i], i, 0); 474 } 475 mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); 476 mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF); 477 snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain); 478 479 /* 480 * inacthead is used to provide FIFO ordering for LRU-bypassing 481 * insertions. 482 */ 483 vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED); 484 TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, 485 &vmd->vmd_inacthead, plinks.q); 486 487 /* 488 * The clock pages are used to implement active queue scanning without 489 * requeues. Scans start at clock[0], which is advanced after the scan 490 * ends. When the two clock hands meet, they are reset and scanning 491 * resumes from the head of the queue. 
492 */ 493 vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED); 494 vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED); 495 TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, 496 &vmd->vmd_clock[0], plinks.q); 497 TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, 498 &vmd->vmd_clock[1], plinks.q); 499 } 500 501 /* 502 * Initialize a physical page in preparation for adding it to the free 503 * lists. 504 */ 505 void 506 vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind, int pool) 507 { 508 m->object = NULL; 509 m->ref_count = 0; 510 m->busy_lock = VPB_FREED; 511 m->flags = m->a.flags = 0; 512 m->phys_addr = pa; 513 m->a.queue = PQ_NONE; 514 m->psind = 0; 515 m->segind = segind; 516 m->order = VM_NFREEORDER; 517 m->pool = pool; 518 m->valid = m->dirty = 0; 519 pmap_page_init(m); 520 } 521 522 #ifndef PMAP_HAS_PAGE_ARRAY 523 static vm_paddr_t 524 vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range) 525 { 526 vm_paddr_t new_end; 527 528 /* 529 * Reserve an unmapped guard page to trap access to vm_page_array[-1]. 530 * However, because this page is allocated from KVM, out-of-bounds 531 * accesses using the direct map will not be trapped. 532 */ 533 *vaddr += PAGE_SIZE; 534 535 /* 536 * Allocate physical memory for the page structures, and map it. 537 */ 538 new_end = trunc_page(end - page_range * sizeof(struct vm_page)); 539 vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end, 540 VM_PROT_READ | VM_PROT_WRITE); 541 vm_page_array_size = page_range; 542 543 return (new_end); 544 } 545 #endif 546 547 /* 548 * vm_page_startup: 549 * 550 * Initializes the resident memory module. Allocates physical memory for 551 * bootstrapping UMA and some data structures that are used to manage 552 * physical pages. Initializes these structures, and populates the free 553 * page queues. 554 */ 555 vm_offset_t 556 vm_page_startup(vm_offset_t vaddr) 557 { 558 struct vm_phys_seg *seg; 559 struct vm_domain *vmd; 560 vm_page_t m; 561 char *list, *listend; 562 vm_paddr_t end, high_avail, low_avail, new_end, size; 563 vm_paddr_t page_range __unused; 564 vm_paddr_t last_pa, pa, startp, endp; 565 u_long pagecount; 566 #if MINIDUMP_PAGE_TRACKING 567 u_long vm_page_dump_size; 568 #endif 569 int biggestone, i, segind; 570 #ifdef WITNESS 571 vm_offset_t mapped; 572 int witness_size; 573 #endif 574 #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) 575 long ii; 576 #endif 577 int pool; 578 #ifdef VM_FREEPOOL_LAZYINIT 579 int lazyinit; 580 #endif 581 582 vaddr = round_page(vaddr); 583 584 vm_phys_early_startup(); 585 biggestone = vm_phys_avail_largest(); 586 end = phys_avail[biggestone+1]; 587 588 /* 589 * Initialize the page and queue locks. 590 */ 591 mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); 592 for (i = 0; i < vm_ndomains; i++) 593 vm_page_domain_init(i); 594 595 new_end = end; 596 #ifdef WITNESS 597 witness_size = round_page(witness_startup_count()); 598 new_end -= witness_size; 599 mapped = pmap_map(&vaddr, new_end, new_end + witness_size, 600 VM_PROT_READ | VM_PROT_WRITE); 601 bzero((void *)mapped, witness_size); 602 witness_startup((void *)mapped); 603 #endif 604 605 #if MINIDUMP_PAGE_TRACKING 606 /* 607 * Allocate a bitmap to indicate that a random physical page 608 * needs to be included in a minidump. 609 * 610 * The amd64 port needs this to indicate which direct map pages 611 * need to be dumped, via calls to dump_add_page()/dump_drop_page(). 
612 * 613 * However, i386 still needs this workspace internally within the 614 * minidump code. In theory, they are not needed on i386, but are 615 * included should the sf_buf code decide to use them. 616 */ 617 last_pa = 0; 618 vm_page_dump_pages = 0; 619 for (i = 0; dump_avail[i + 1] != 0; i += 2) { 620 vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) - 621 dump_avail[i] / PAGE_SIZE; 622 if (dump_avail[i + 1] > last_pa) 623 last_pa = dump_avail[i + 1]; 624 } 625 vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages)); 626 new_end -= vm_page_dump_size; 627 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, 628 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); 629 bzero((void *)vm_page_dump, vm_page_dump_size); 630 #if MINIDUMP_STARTUP_PAGE_TRACKING 631 /* 632 * Include the UMA bootstrap pages, witness pages and vm_page_dump 633 * in a crash dump. When pmap_map() uses the direct map, they are 634 * not automatically included. 635 */ 636 for (pa = new_end; pa < end; pa += PAGE_SIZE) 637 dump_add_page(pa); 638 #endif 639 #else 640 (void)last_pa; 641 #endif 642 phys_avail[biggestone + 1] = new_end; 643 #ifdef __amd64__ 644 /* 645 * Request that the physical pages underlying the message buffer be 646 * included in a crash dump. Since the message buffer is accessed 647 * through the direct map, they are not automatically included. 648 */ 649 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); 650 last_pa = pa + round_page(msgbufsize); 651 while (pa < last_pa) { 652 dump_add_page(pa); 653 pa += PAGE_SIZE; 654 } 655 #else 656 (void)pa; 657 #endif 658 659 /* 660 * Determine the lowest and highest physical addresses and, in the case 661 * of VM_PHYSSEG_SPARSE, the exact size of the available physical 662 * memory. vm_phys_early_startup() already checked that phys_avail[] 663 * has at least one element. 664 */ 665 #ifdef VM_PHYSSEG_SPARSE 666 size = phys_avail[1] - phys_avail[0]; 667 #endif 668 low_avail = phys_avail[0]; 669 high_avail = phys_avail[1]; 670 for (i = 2; phys_avail[i + 1] != 0; i += 2) { 671 #ifdef VM_PHYSSEG_SPARSE 672 size += phys_avail[i + 1] - phys_avail[i]; 673 #endif 674 if (phys_avail[i] < low_avail) 675 low_avail = phys_avail[i]; 676 if (phys_avail[i + 1] > high_avail) 677 high_avail = phys_avail[i + 1]; 678 } 679 for (i = 0; i < vm_phys_nsegs; i++) { 680 #ifdef VM_PHYSSEG_SPARSE 681 size += vm_phys_segs[i].end - vm_phys_segs[i].start; 682 #endif 683 if (vm_phys_segs[i].start < low_avail) 684 low_avail = vm_phys_segs[i].start; 685 if (vm_phys_segs[i].end > high_avail) 686 high_avail = vm_phys_segs[i].end; 687 } 688 first_page = low_avail / PAGE_SIZE; 689 #ifdef VM_PHYSSEG_DENSE 690 size = high_avail - low_avail; 691 #endif 692 693 #ifdef PMAP_HAS_PAGE_ARRAY 694 pmap_page_array_startup(size / PAGE_SIZE); 695 biggestone = vm_phys_avail_largest(); 696 end = new_end = phys_avail[biggestone + 1]; 697 #else 698 #ifdef VM_PHYSSEG_DENSE 699 /* 700 * In the VM_PHYSSEG_DENSE case, the number of pages can account for 701 * the overhead of a page structure per page only if vm_page_array is 702 * allocated from the last physical memory chunk. Otherwise, we must 703 * allocate page structures representing the physical memory 704 * underlying vm_page_array, even though they will not be used. 
705 */ 706 if (new_end != high_avail) 707 page_range = size / PAGE_SIZE; 708 else 709 #endif 710 { 711 page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); 712 713 /* 714 * If the partial bytes remaining are large enough for 715 * a page (PAGE_SIZE) without a corresponding 716 * 'struct vm_page', then new_end will contain an 717 * extra page after subtracting the length of the VM 718 * page array. Compensate by subtracting an extra 719 * page from new_end. 720 */ 721 if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { 722 if (new_end == high_avail) 723 high_avail -= PAGE_SIZE; 724 new_end -= PAGE_SIZE; 725 } 726 } 727 end = new_end; 728 new_end = vm_page_array_alloc(&vaddr, end, page_range); 729 #endif 730 731 #if VM_NRESERVLEVEL > 0 732 /* 733 * Allocate physical memory for the reservation management system's 734 * data structures, and map it. 735 */ 736 new_end = vm_reserv_startup(&vaddr, new_end); 737 #endif 738 #if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING 739 /* 740 * Include vm_page_array and vm_reserv_array in a crash dump. 741 */ 742 for (pa = new_end; pa < end; pa += PAGE_SIZE) 743 dump_add_page(pa); 744 #endif 745 phys_avail[biggestone + 1] = new_end; 746 747 /* 748 * Add physical memory segments corresponding to the available 749 * physical pages. 750 */ 751 for (i = 0; phys_avail[i + 1] != 0; i += 2) 752 vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); 753 754 /* 755 * Initialize the physical memory allocator. 756 */ 757 vm_phys_init(); 758 759 pool = VM_FREEPOOL_DEFAULT; 760 #ifdef VM_FREEPOOL_LAZYINIT 761 lazyinit = 1; 762 TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit); 763 if (lazyinit) 764 pool = VM_FREEPOOL_LAZYINIT; 765 #endif 766 767 /* 768 * Initialize the page structures and add every available page to the 769 * physical memory allocator's free lists. 770 */ 771 #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) 772 for (ii = 0; ii < vm_page_array_size; ii++) { 773 m = &vm_page_array[ii]; 774 vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0, 775 VM_FREEPOOL_DEFAULT); 776 m->flags = PG_FICTITIOUS; 777 } 778 #endif 779 vm_cnt.v_page_count = 0; 780 for (segind = 0; segind < vm_phys_nsegs; segind++) { 781 seg = &vm_phys_segs[segind]; 782 783 /* 784 * Initialize pages not covered by phys_avail[], since they 785 * might be freed to the allocator at some future point, e.g., 786 * by kmem_bootstrap_free(). 787 */ 788 startp = seg->start; 789 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 790 if (startp >= seg->end) 791 break; 792 if (phys_avail[i + 1] < startp) 793 continue; 794 if (phys_avail[i] <= startp) { 795 startp = phys_avail[i + 1]; 796 continue; 797 } 798 m = vm_phys_seg_paddr_to_vm_page(seg, startp); 799 for (endp = MIN(phys_avail[i], seg->end); 800 startp < endp; startp += PAGE_SIZE, m++) { 801 vm_page_init_page(m, startp, segind, 802 VM_FREEPOOL_DEFAULT); 803 } 804 } 805 806 /* 807 * Add the segment's pages that are covered by one of 808 * phys_avail's ranges to the free lists. 809 */ 810 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 811 if (seg->end <= phys_avail[i] || 812 seg->start >= phys_avail[i + 1]) 813 continue; 814 815 startp = MAX(seg->start, phys_avail[i]); 816 endp = MIN(seg->end, phys_avail[i + 1]); 817 pagecount = (u_long)atop(endp - startp); 818 if (pagecount == 0) 819 continue; 820 821 /* 822 * If lazy vm_page initialization is not enabled, simply 823 * initialize all of the pages in the segment covered by 824 * phys_avail. 
Otherwise, initialize only the first 825 * page of each run of free pages handed to the vm_phys 826 * allocator, which in turn defers initialization of 827 * pages until they are needed. 828 * 829 * This avoids blocking the boot process for long 830 * periods, which may be relevant for VMs (which ought 831 * to boot as quickly as possible) and/or systems with 832 * large amounts of physical memory. 833 */ 834 m = vm_phys_seg_paddr_to_vm_page(seg, startp); 835 vm_page_init_page(m, startp, segind, pool); 836 if (pool == VM_FREEPOOL_DEFAULT) { 837 for (u_long j = 1; j < pagecount; j++) { 838 vm_page_init_page(&m[j], 839 startp + ptoa((vm_paddr_t)j), 840 segind, pool); 841 } 842 } 843 vmd = VM_DOMAIN(seg->domain); 844 vm_domain_free_lock(vmd); 845 vm_phys_enqueue_contig(m, pool, pagecount); 846 vm_domain_free_unlock(vmd); 847 vm_domain_freecnt_inc(vmd, pagecount); 848 vm_cnt.v_page_count += (u_int)pagecount; 849 vmd->vmd_page_count += (u_int)pagecount; 850 vmd->vmd_segs |= 1UL << segind; 851 } 852 } 853 854 /* 855 * Remove blacklisted pages from the physical memory allocator. 856 */ 857 TAILQ_INIT(&blacklist_head); 858 vm_page_blacklist_load(&list, &listend); 859 vm_page_blacklist_check(list, listend); 860 861 list = kern_getenv("vm.blacklist"); 862 vm_page_blacklist_check(list, NULL); 863 864 freeenv(list); 865 #if VM_NRESERVLEVEL > 0 866 /* 867 * Initialize the reservation management system. 868 */ 869 vm_reserv_init(); 870 #endif 871 872 return (vaddr); 873 } 874 875 void 876 vm_page_reference(vm_page_t m) 877 { 878 879 vm_page_aflag_set(m, PGA_REFERENCED); 880 } 881 882 /* 883 * vm_page_trybusy 884 * 885 * Helper routine for grab functions to trylock busy. 886 * 887 * Returns true on success and false on failure. 888 */ 889 static bool 890 vm_page_trybusy(vm_page_t m, int allocflags) 891 { 892 893 if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0) 894 return (vm_page_trysbusy(m)); 895 else 896 return (vm_page_tryxbusy(m)); 897 } 898 899 /* 900 * vm_page_tryacquire 901 * 902 * Helper routine for grab functions to trylock busy and wire. 903 * 904 * Returns true on success and false on failure. 905 */ 906 static inline bool 907 vm_page_tryacquire(vm_page_t m, int allocflags) 908 { 909 bool locked; 910 911 locked = vm_page_trybusy(m, allocflags); 912 if (locked && (allocflags & VM_ALLOC_WIRED) != 0) 913 vm_page_wire(m); 914 return (locked); 915 } 916 917 /* 918 * vm_page_busy_acquire: 919 * 920 * Acquire the busy lock as described by VM_ALLOC_* flags. Will loop 921 * and drop the object lock if necessary. 922 */ 923 bool 924 vm_page_busy_acquire(vm_page_t m, int allocflags) 925 { 926 vm_object_t obj; 927 bool locked; 928 929 /* 930 * The page-specific object must be cached because page 931 * identity can change during the sleep, causing the 932 * re-lock of a different object. 933 * It is assumed that a reference to the object is already 934 * held by the callers. 
935 */ 936 obj = atomic_load_ptr(&m->object); 937 for (;;) { 938 if (vm_page_tryacquire(m, allocflags)) 939 return (true); 940 if ((allocflags & VM_ALLOC_NOWAIT) != 0) 941 return (false); 942 if (obj != NULL) 943 locked = VM_OBJECT_WOWNED(obj); 944 else 945 locked = false; 946 MPASS(locked || vm_page_wired(m)); 947 if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags, 948 locked) && locked) 949 VM_OBJECT_WLOCK(obj); 950 if ((allocflags & VM_ALLOC_WAITFAIL) != 0) 951 return (false); 952 KASSERT(m->object == obj || m->object == NULL, 953 ("vm_page_busy_acquire: page %p does not belong to %p", 954 m, obj)); 955 } 956 } 957 958 /* 959 * vm_page_busy_downgrade: 960 * 961 * Downgrade an exclusive busy page into a single shared busy page. 962 */ 963 void 964 vm_page_busy_downgrade(vm_page_t m) 965 { 966 u_int x; 967 968 vm_page_assert_xbusied(m); 969 970 x = vm_page_busy_fetch(m); 971 for (;;) { 972 if (atomic_fcmpset_rel_int(&m->busy_lock, 973 &x, VPB_SHARERS_WORD(1))) 974 break; 975 } 976 if ((x & VPB_BIT_WAITERS) != 0) 977 wakeup(m); 978 } 979 980 /* 981 * 982 * vm_page_busy_tryupgrade: 983 * 984 * Attempt to upgrade a single shared busy into an exclusive busy. 985 */ 986 int 987 vm_page_busy_tryupgrade(vm_page_t m) 988 { 989 u_int ce, x; 990 991 vm_page_assert_sbusied(m); 992 993 x = vm_page_busy_fetch(m); 994 ce = VPB_CURTHREAD_EXCLUSIVE; 995 for (;;) { 996 if (VPB_SHARERS(x) > 1) 997 return (0); 998 KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), 999 ("vm_page_busy_tryupgrade: invalid lock state")); 1000 if (!atomic_fcmpset_acq_int(&m->busy_lock, &x, 1001 ce | (x & VPB_BIT_WAITERS))) 1002 continue; 1003 return (1); 1004 } 1005 } 1006 1007 /* 1008 * vm_page_sbusied: 1009 * 1010 * Return a positive value if the page is shared busied, 0 otherwise. 1011 */ 1012 int 1013 vm_page_sbusied(vm_page_t m) 1014 { 1015 u_int x; 1016 1017 x = vm_page_busy_fetch(m); 1018 return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); 1019 } 1020 1021 /* 1022 * vm_page_sunbusy: 1023 * 1024 * Shared unbusy a page. 1025 */ 1026 void 1027 vm_page_sunbusy(vm_page_t m) 1028 { 1029 u_int x; 1030 1031 vm_page_assert_sbusied(m); 1032 1033 x = vm_page_busy_fetch(m); 1034 for (;;) { 1035 KASSERT(x != VPB_FREED, 1036 ("vm_page_sunbusy: Unlocking freed page.")); 1037 if (VPB_SHARERS(x) > 1) { 1038 if (atomic_fcmpset_int(&m->busy_lock, &x, 1039 x - VPB_ONE_SHARER)) 1040 break; 1041 continue; 1042 } 1043 KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), 1044 ("vm_page_sunbusy: invalid lock state")); 1045 if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) 1046 continue; 1047 if ((x & VPB_BIT_WAITERS) == 0) 1048 break; 1049 wakeup(m); 1050 break; 1051 } 1052 } 1053 1054 /* 1055 * vm_page_busy_sleep: 1056 * 1057 * Sleep if the page is busy, using the page pointer as wchan. 1058 * This is used to implement the hard-path of the busying mechanism. 1059 * 1060 * If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function 1061 * will not sleep if the page is shared-busy. 1062 * 1063 * The object lock must be held on entry. 1064 * 1065 * Returns true if it slept and dropped the object lock, or false 1066 * if there was no sleep and the lock is still held. 
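 *
 * A typical caller pattern looks roughly like the sketch below (the
 * wmesg string and loop structure are illustrative, not copied from a
 * particular caller):
 *
 *	VM_OBJECT_WLOCK(object);
 *	while ((m = vm_page_lookup(object, pindex)) != NULL &&
 *	    !vm_page_tryxbusy(m)) {
 *		if (vm_page_busy_sleep(m, "pgwait", 0))
 *			VM_OBJECT_WLOCK(object);
 *	}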
1067 */ 1068 bool 1069 vm_page_busy_sleep(vm_page_t m, const char *wmesg, int allocflags) 1070 { 1071 vm_object_t obj; 1072 1073 obj = m->object; 1074 VM_OBJECT_ASSERT_LOCKED(obj); 1075 1076 return (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags, 1077 true)); 1078 } 1079 1080 /* 1081 * vm_page_busy_sleep_unlocked: 1082 * 1083 * Sleep if the page is busy, using the page pointer as wchan. 1084 * This is used to implement the hard-path of busying mechanism. 1085 * 1086 * If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function 1087 * will not sleep if the page is shared-busy. 1088 * 1089 * The object lock must not be held on entry. The operation will 1090 * return if the page changes identity. 1091 */ 1092 void 1093 vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, 1094 const char *wmesg, int allocflags) 1095 { 1096 VM_OBJECT_ASSERT_UNLOCKED(obj); 1097 1098 (void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false); 1099 } 1100 1101 /* 1102 * _vm_page_busy_sleep: 1103 * 1104 * Internal busy sleep function. Verifies the page identity and 1105 * lockstate against parameters. Returns true if it sleeps and 1106 * false otherwise. 1107 * 1108 * allocflags uses VM_ALLOC_* flags to specify the lock required. 1109 * 1110 * If locked is true the lock will be dropped for any true returns 1111 * and held for any false returns. 1112 */ 1113 static bool 1114 _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, 1115 const char *wmesg, int allocflags, bool locked) 1116 { 1117 bool xsleep; 1118 u_int x; 1119 1120 /* 1121 * If the object is busy we must wait for that to drain to zero 1122 * before trying the page again. 1123 */ 1124 if (obj != NULL && vm_object_busied(obj)) { 1125 if (locked) 1126 VM_OBJECT_DROP(obj); 1127 vm_object_busy_wait(obj, wmesg); 1128 return (true); 1129 } 1130 1131 if (!vm_page_busied(m)) 1132 return (false); 1133 1134 xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0; 1135 sleepq_lock(m); 1136 x = vm_page_busy_fetch(m); 1137 do { 1138 /* 1139 * If the page changes objects or becomes unlocked we can 1140 * simply return. 1141 */ 1142 if (x == VPB_UNBUSIED || 1143 (xsleep && (x & VPB_BIT_SHARED) != 0) || 1144 m->object != obj || m->pindex != pindex) { 1145 sleepq_release(m); 1146 return (false); 1147 } 1148 if ((x & VPB_BIT_WAITERS) != 0) 1149 break; 1150 } while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS)); 1151 if (locked) 1152 VM_OBJECT_DROP(obj); 1153 DROP_GIANT(); 1154 sleepq_add(m, NULL, wmesg, 0, 0); 1155 sleepq_wait(m, PVM); 1156 PICKUP_GIANT(); 1157 return (true); 1158 } 1159 1160 /* 1161 * vm_page_trysbusy: 1162 * 1163 * Try to shared busy a page. 1164 * If the operation succeeds 1 is returned otherwise 0. 1165 * The operation never sleeps. 1166 */ 1167 int 1168 vm_page_trysbusy(vm_page_t m) 1169 { 1170 vm_object_t obj; 1171 u_int x; 1172 1173 obj = m->object; 1174 x = vm_page_busy_fetch(m); 1175 for (;;) { 1176 if ((x & VPB_BIT_SHARED) == 0) 1177 return (0); 1178 /* 1179 * Reduce the window for transient busies that will trigger 1180 * false negatives in vm_page_ps_test(). 1181 */ 1182 if (obj != NULL && vm_object_busied(obj)) 1183 return (0); 1184 if (atomic_fcmpset_acq_int(&m->busy_lock, &x, 1185 x + VPB_ONE_SHARER)) 1186 break; 1187 } 1188 1189 /* Refetch the object now that we're guaranteed that it is stable. 
 */
	obj = m->object;
	if (obj != NULL && vm_object_busied(obj)) {
		vm_page_sunbusy(m);
		return (0);
	}
	return (1);
}

/*
 * vm_page_tryxbusy:
 *
 * Try to exclusive busy a page.
 * If the operation succeeds 1 is returned otherwise 0.
 * The operation never sleeps.
 */
int
vm_page_tryxbusy(vm_page_t m)
{
	vm_object_t obj;

	if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,
	    VPB_CURTHREAD_EXCLUSIVE) == 0)
		return (0);

	obj = m->object;
	if (obj != NULL && vm_object_busied(obj)) {
		vm_page_xunbusy(m);
		return (0);
	}
	return (1);
}

static void
vm_page_xunbusy_hard_tail(vm_page_t m)
{
	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
	/* Wake the waiter. */
	wakeup(m);
}

/*
 * vm_page_xunbusy_hard:
 *
 * Called when unbusy has failed because there is a waiter.
 */
void
vm_page_xunbusy_hard(vm_page_t m)
{
	vm_page_assert_xbusied(m);
	vm_page_xunbusy_hard_tail(m);
}

void
vm_page_xunbusy_hard_unchecked(vm_page_t m)
{
	vm_page_assert_xbusied_unchecked(m);
	vm_page_xunbusy_hard_tail(m);
}

static void
vm_page_busy_free(vm_page_t m)
{
	u_int x;

	atomic_thread_fence_rel();
	x = atomic_swap_int(&m->busy_lock, VPB_FREED);
	if ((x & VPB_BIT_WAITERS) != 0)
		wakeup(m);
}

/*
 * vm_page_unhold_pages:
 *
 * Unhold each of the pages that is referenced by the given array.
 */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{

	for (; count != 0; count--) {
		vm_page_unwire(*ma, PQ_ACTIVE);
		ma++;
	}
}

vm_page_t
PHYS_TO_VM_PAGE(vm_paddr_t pa)
{
	vm_page_t m;

#ifdef VM_PHYSSEG_SPARSE
	m = vm_phys_paddr_to_vm_page(pa);
	if (m == NULL)
		m = vm_phys_fictitious_to_vm_page(pa);
	return (m);
#elif defined(VM_PHYSSEG_DENSE)
	long pi;

	pi = atop(pa);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		m = &vm_page_array[pi - first_page];
		return (m);
	}
	return (vm_phys_fictitious_to_vm_page(pa));
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
}

/*
 * vm_page_getfake:
 *
 * Create a fictitious page with the specified physical address and
 * memory attribute. The memory attribute is the only machine-
 * dependent aspect of a fictitious page that must be initialized.
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
	vm_page_t m;

	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
	vm_page_initfake(m, paddr, memattr);
	return (m);
}

void
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	if ((m->flags & PG_FICTITIOUS) != 0) {
		/*
		 * The page's memattr might have changed since the
		 * previous initialization. Update the pmap to the
		 * new memattr.
		 */
		goto memattr;
	}
	m->phys_addr = paddr;
	m->a.queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	m->flags = PG_FICTITIOUS;
	/* Fictitious pages don't use "order" or "pool". */
	m->oflags = VPO_UNMANAGED;
	m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
	/* Fictitious pages are unevictable. */
	m->ref_count = 1;
	pmap_page_init(m);
memattr:
	pmap_page_set_memattr(m, memattr);
}

/*
 * vm_page_putfake:
 *
 * Release a fictitious page.
 */
void
vm_page_putfake(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_putfake: bad page %p", m));
	vm_page_assert_xbusied(m);
	vm_page_busy_free(m);
	uma_zfree(fakepg_zone, m);
}

/*
 * vm_page_updatefake:
 *
 * Update the given fictitious page to the specified physical address and
 * memory attribute.
 */
void
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_updatefake: bad page %p", m));
	m->phys_addr = paddr;
	pmap_page_set_memattr(m, memattr);
}

/*
 * vm_page_free:
 *
 * Free a page.
 */
void
vm_page_free(vm_page_t m)
{

	m->flags &= ~PG_ZERO;
	vm_page_free_toq(m);
}

/*
 * vm_page_free_zero:
 *
 * Free a page to the zeroed-pages queue.
 */
void
vm_page_free_zero(vm_page_t m)
{

	m->flags |= PG_ZERO;
	vm_page_free_toq(m);
}

/*
 * Unbusy and handle the page queueing for a page from a getpages request that
 * was optionally read ahead or behind.
 */
void
vm_page_readahead_finish(vm_page_t m)
{

	/* We shouldn't put invalid pages on queues. */
	KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m));

	/*
	 * Since the page is not the one actually needed, whether it should
	 * be activated or deactivated is not obvious. Empirical results
	 * have shown that deactivating the page is usually the best choice,
	 * unless the page is wanted by another thread.
	 */
	if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0)
		vm_page_activate(m);
	else
		vm_page_deactivate(m);
	vm_page_xunbusy_unchecked(m);
}

/*
 * Destroy the identity of an invalid page and free it if possible.
 * This is intended to be used when reading a page from backing store fails.
 */
void
vm_page_free_invalid(vm_page_t m)
{

	KASSERT(vm_page_none_valid(m), ("page %p is valid", m));
	KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m));
	KASSERT(m->object != NULL, ("page %p has no object", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);

	/*
	 * We may be attempting to free the page as part of the handling for an
	 * I/O error, in which case the page was xbusied by a different thread.
	 */
	vm_page_xbusy_claim(m);

	/*
	 * If someone has wired this page while the object lock
	 * was not held, then the thread that unwires is responsible
	 * for freeing the page. Otherwise just free the page now.
	 * The wire count of this unmapped page cannot change while
	 * we have the page xbusy and the page's object wlocked.
	 */
	if (vm_page_remove(m))
		vm_page_free(m);
}

/*
 * vm_page_dirty_KBI: [ internal use only ]
 *
 * Set all bits in the page's dirty field.
 *
 * The object containing the specified page must be locked if the
 * call is made from the machine-independent layer.
 *
 * See vm_page_clear_dirty_mask().
 *
 * This function should only be called by vm_page_dirty().
1466 */ 1467 void 1468 vm_page_dirty_KBI(vm_page_t m) 1469 { 1470 1471 /* Refer to this operation by its public name. */ 1472 KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!")); 1473 m->dirty = VM_PAGE_BITS_ALL; 1474 } 1475 1476 /* 1477 * Insert the given page into the given object at the given pindex. 1478 * 1479 * The procedure is marked __always_inline to suggest to the compiler to 1480 * eliminate the iter parameter and the associated alternate branch. 1481 */ 1482 static __always_inline int 1483 vm_page_insert_lookup(vm_page_t m, vm_object_t object, vm_pindex_t pindex, 1484 bool iter, struct pctrie_iter *pages) 1485 { 1486 int error; 1487 1488 VM_OBJECT_ASSERT_WLOCKED(object); 1489 KASSERT(m->object == NULL, 1490 ("vm_page_insert: page %p already inserted", m)); 1491 1492 /* 1493 * Record the object/offset pair in this page. 1494 */ 1495 m->object = object; 1496 m->pindex = pindex; 1497 m->ref_count |= VPRC_OBJREF; 1498 1499 /* 1500 * Add this page to the object's radix tree. 1501 */ 1502 if (iter) 1503 error = vm_radix_iter_insert(pages, m); 1504 else 1505 error = vm_radix_insert(&object->rtree, m); 1506 if (__predict_false(error != 0)) { 1507 m->object = NULL; 1508 m->pindex = 0; 1509 m->ref_count &= ~VPRC_OBJREF; 1510 return (1); 1511 } 1512 1513 vm_page_insert_radixdone(m, object); 1514 vm_pager_page_inserted(object, m); 1515 return (0); 1516 } 1517 1518 /* 1519 * vm_page_insert: [ internal use only ] 1520 * 1521 * Inserts the given mem entry into the object and object list. 1522 * 1523 * The object must be locked. 1524 */ 1525 int 1526 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) 1527 { 1528 return (vm_page_insert_lookup(m, object, pindex, false, NULL)); 1529 } 1530 1531 /* 1532 * vm_page_iter_insert: 1533 * 1534 * Tries to insert the page "m" into the specified object at offset 1535 * "pindex" using the iterator "pages". Returns 0 if the insertion was 1536 * successful. 1537 * 1538 * The object must be locked. 1539 */ 1540 int 1541 vm_page_iter_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex, 1542 struct pctrie_iter *pages) 1543 { 1544 return (vm_page_insert_lookup(m, object, pindex, true, pages)); 1545 } 1546 1547 /* 1548 * vm_page_insert_radixdone: 1549 * 1550 * Complete page "m" insertion into the specified object after the 1551 * radix trie hooking. 1552 * 1553 * The object must be locked. 1554 */ 1555 static void 1556 vm_page_insert_radixdone(vm_page_t m, vm_object_t object) 1557 { 1558 1559 VM_OBJECT_ASSERT_WLOCKED(object); 1560 KASSERT(object != NULL && m->object == object, 1561 ("vm_page_insert_radixdone: page %p has inconsistent object", m)); 1562 KASSERT((m->ref_count & VPRC_OBJREF) != 0, 1563 ("vm_page_insert_radixdone: page %p is missing object ref", m)); 1564 1565 /* 1566 * Show that the object has one more resident page. 1567 */ 1568 object->resident_page_count++; 1569 1570 /* 1571 * Hold the vnode until the last page is released. 1572 */ 1573 if (object->resident_page_count == 1 && object->type == OBJT_VNODE) 1574 vhold(object->handle); 1575 1576 /* 1577 * Since we are inserting a new and possibly dirty page, 1578 * update the object's generation count. 1579 */ 1580 if (pmap_page_is_write_mapped(m)) 1581 vm_object_set_writeable_dirty(object); 1582 } 1583 1584 /* 1585 * vm_page_remove_radixdone 1586 * 1587 * Complete page "m" removal from the specified object after the radix trie 1588 * unhooking. 1589 * 1590 * The caller is responsible for updating the page's fields to reflect this 1591 * removal. 
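 * (Callers typically follow this with vm_page_drop(m, VPRC_OBJREF), or an
 * equivalent adjustment of ref_count, to release the object's reference.)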
1592 */ 1593 static void 1594 vm_page_remove_radixdone(vm_page_t m) 1595 { 1596 vm_object_t object; 1597 1598 vm_page_assert_xbusied(m); 1599 object = m->object; 1600 VM_OBJECT_ASSERT_WLOCKED(object); 1601 KASSERT((m->ref_count & VPRC_OBJREF) != 0, 1602 ("page %p is missing its object ref", m)); 1603 1604 /* Deferred free of swap space. */ 1605 if ((m->a.flags & PGA_SWAP_FREE) != 0) 1606 vm_pager_page_unswapped(m); 1607 1608 vm_pager_page_removed(object, m); 1609 m->object = NULL; 1610 1611 /* 1612 * And show that the object has one fewer resident page. 1613 */ 1614 object->resident_page_count--; 1615 1616 /* 1617 * The vnode may now be recycled. 1618 */ 1619 if (object->resident_page_count == 0 && object->type == OBJT_VNODE) 1620 vdrop(object->handle); 1621 } 1622 1623 /* 1624 * vm_page_free_object_prep: 1625 * 1626 * Disassociates the given page from its VM object. 1627 * 1628 * The object must be locked, and the page must be xbusy. 1629 */ 1630 static void 1631 vm_page_free_object_prep(vm_page_t m) 1632 { 1633 KASSERT(((m->oflags & VPO_UNMANAGED) != 0) == 1634 ((m->object->flags & OBJ_UNMANAGED) != 0), 1635 ("%s: managed flag mismatch for page %p", 1636 __func__, m)); 1637 vm_page_assert_xbusied(m); 1638 1639 /* 1640 * The object reference can be released without an atomic 1641 * operation. 1642 */ 1643 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 1644 m->ref_count == VPRC_OBJREF, 1645 ("%s: page %p has unexpected ref_count %u", 1646 __func__, m, m->ref_count)); 1647 vm_page_remove_radixdone(m); 1648 m->ref_count -= VPRC_OBJREF; 1649 } 1650 1651 /* 1652 * vm_page_iter_free: 1653 * 1654 * Free the given page, and use the iterator to remove it from the radix 1655 * tree. 1656 */ 1657 void 1658 vm_page_iter_free(struct pctrie_iter *pages, vm_page_t m) 1659 { 1660 vm_radix_iter_remove(pages); 1661 vm_page_free_object_prep(m); 1662 vm_page_xunbusy(m); 1663 m->flags &= ~PG_ZERO; 1664 vm_page_free_toq(m); 1665 } 1666 1667 /* 1668 * vm_page_remove: 1669 * 1670 * Removes the specified page from its containing object, but does not 1671 * invalidate any backing storage. Returns true if the object's reference 1672 * was the last reference to the page, and false otherwise. 1673 * 1674 * The object must be locked and the page must be exclusively busied. 1675 * The exclusive busy will be released on return. If this is not the 1676 * final ref and the caller does not hold a wire reference it may not 1677 * continue to access the page. 1678 */ 1679 bool 1680 vm_page_remove(vm_page_t m) 1681 { 1682 bool dropped; 1683 1684 dropped = vm_page_remove_xbusy(m); 1685 vm_page_xunbusy(m); 1686 1687 return (dropped); 1688 } 1689 1690 /* 1691 * vm_page_iter_remove: 1692 * 1693 * Remove the current page, and use the iterator to remove it from the 1694 * radix tree. 1695 */ 1696 bool 1697 vm_page_iter_remove(struct pctrie_iter *pages, vm_page_t m) 1698 { 1699 bool dropped; 1700 1701 vm_radix_iter_remove(pages); 1702 vm_page_remove_radixdone(m); 1703 dropped = (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF); 1704 vm_page_xunbusy(m); 1705 1706 return (dropped); 1707 } 1708 1709 /* 1710 * vm_page_radix_remove 1711 * 1712 * Removes the specified page from the radix tree. 1713 */ 1714 static void 1715 vm_page_radix_remove(vm_page_t m) 1716 { 1717 vm_page_t mrem __diagused; 1718 1719 mrem = vm_radix_remove(&m->object->rtree, m->pindex); 1720 KASSERT(mrem == m, 1721 ("removed page %p, expected page %p", mrem, m)); 1722 } 1723 1724 /* 1725 * vm_page_remove_xbusy 1726 * 1727 * Removes the page but leaves the xbusy held. 
Returns true if this
 * removed the final ref and false otherwise.
 */
bool
vm_page_remove_xbusy(vm_page_t m)
{

	vm_page_radix_remove(m);
	vm_page_remove_radixdone(m);
	return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
}

/*
 * vm_page_lookup:
 *
 * Returns the page associated with the object/offset
 * pair specified; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{

	VM_OBJECT_ASSERT_LOCKED(object);
	return (vm_radix_lookup(&object->rtree, pindex));
}

/*
 * vm_page_iter_init:
 *
 * Initialize iterator for vm pages.
 */
void
vm_page_iter_init(struct pctrie_iter *pages, vm_object_t object)
{

	vm_radix_iter_init(pages, &object->rtree);
}

/*
 * vm_page_iter_limit_init:
 *
 * Initialize iterator for vm pages, bounded by the given limit.
 */
void
vm_page_iter_limit_init(struct pctrie_iter *pages, vm_object_t object,
    vm_pindex_t limit)
{

	vm_radix_iter_limit_init(pages, &object->rtree, limit);
}

/*
 * vm_page_lookup_unlocked:
 *
 * Returns the page associated with the object/offset pair specified;
 * if none is found, NULL is returned. The page may no longer be
 * present in the object at the time that this function returns. Only
 * useful for opportunistic checks such as inmem().
 */
vm_page_t
vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex)
{

	return (vm_radix_lookup_unlocked(&object->rtree, pindex));
}

/*
 * vm_page_relookup:
 *
 * Returns a page that must already have been busied by
 * the caller. Used for bogus page replacement.
 */
vm_page_t
vm_page_relookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_lookup_unlocked(object, pindex);
	KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) &&
	    m->object == object && m->pindex == pindex,
	    ("vm_page_relookup: Invalid page %p", m));
	return (m);
}

/*
 * This should only be used by lockless functions for releasing transient
 * incorrect acquires. The page may have been freed after we acquired a
 * busy lock. In this case busy_lock == VPB_FREED and we have nothing
 * further to do.
 */
static void
vm_page_busy_release(vm_page_t m)
{
	u_int x;

	x = vm_page_busy_fetch(m);
	for (;;) {
		if (x == VPB_FREED)
			break;
		if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) {
			if (atomic_fcmpset_int(&m->busy_lock, &x,
			    x - VPB_ONE_SHARER))
				break;
			continue;
		}
		KASSERT((x & VPB_BIT_SHARED) != 0 ||
		    (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE,
		    ("vm_page_busy_release: %p xbusy not owned.", m));
		if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
			continue;
		if ((x & VPB_BIT_WAITERS) != 0)
			wakeup(m);
		break;
	}
}

/*
 * Uses the page mnew as a replacement for an existing page at index
 * pindex which must already be present in the object.
 *
 * Both pages must be exclusively busied on entry. The old page is
 * unbusied on exit.
 *
 * A return value of true means mold is now free. If this is not the
 * final ref and the caller does not hold a wire reference, it may not
 * continue to access the page.
 */
static bool
vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mold)
{
	vm_page_t mret __diagused;
	bool dropped;

	VM_OBJECT_ASSERT_WLOCKED(object);
	vm_page_assert_xbusied(mold);
	KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0,
	    ("vm_page_replace: page %p already in object", mnew));

	/*
	 * This function mostly follows vm_page_insert() and
	 * vm_page_remove() without the radix, object count and vnode
	 * dance. Double check such functions for more comments.
	 */

	mnew->object = object;
	mnew->pindex = pindex;
	atomic_set_int(&mnew->ref_count, VPRC_OBJREF);
	mret = vm_radix_replace(&object->rtree, mnew);
	KASSERT(mret == mold,
	    ("invalid page replacement, mold=%p, mret=%p", mold, mret));
	KASSERT((mold->oflags & VPO_UNMANAGED) ==
	    (mnew->oflags & VPO_UNMANAGED),
	    ("vm_page_replace: mismatched VPO_UNMANAGED"));

	mold->object = NULL;

	/*
	 * The object's resident_page_count does not change because we have
	 * swapped one page for another, but the generation count should
	 * change if the page is dirty.
	 */
	if (pmap_page_is_write_mapped(mnew))
		vm_object_set_writeable_dirty(object);
	dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF;
	vm_page_xunbusy(mold);

	return (dropped);
}

void
vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mold)
{

	vm_page_assert_xbusied(mnew);

	if (vm_page_replace_hold(mnew, object, pindex, mold))
		vm_page_free(mold);
}

/*
 * vm_page_iter_rename:
 *
 * Tries to move the specified page from its current object to a new object
 * and pindex, using the given iterator to remove the page from its current
 * object. Returns true if the move was successful, and false if the move
 * was aborted due to a failed memory allocation.
 *
 * Panics if a page already resides in the new object at the new pindex.
 *
 * This routine dirties the page if it is valid, as callers are expected to
 * transfer backing storage only after moving the page. Dirtying the page
 * ensures that the destination object retains the most recent copy of the
 * page.
 *
 * The objects must be locked.
 */
bool
vm_page_iter_rename(struct pctrie_iter *old_pages, vm_page_t m,
    vm_object_t new_object, vm_pindex_t new_pindex)
{
	vm_pindex_t opidx;

	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
	    ("%s: page %p is missing object ref", __func__, m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	VM_OBJECT_ASSERT_WLOCKED(new_object);

	/*
	 * Create a custom version of vm_page_insert() which does not depend
	 * on m_prev and can cheat on the implementation aspects of the
	 * function.
	 */
	opidx = m->pindex;
	m->pindex = new_pindex;
	if (vm_radix_insert(&new_object->rtree, m) != 0) {
		m->pindex = opidx;
		return (false);
	}

	/*
	 * The operation cannot fail anymore.
	 */
	m->pindex = opidx;
	vm_radix_iter_remove(old_pages);
	vm_page_remove_radixdone(m);

	/* Return to the new pindex to complete vm_page_insert().
*/ 1958 m->pindex = new_pindex; 1959 m->object = new_object; 1960 1961 vm_page_insert_radixdone(m, new_object); 1962 if (vm_page_any_valid(m)) 1963 vm_page_dirty(m); 1964 vm_pager_page_inserted(new_object, m); 1965 return (true); 1966 } 1967 1968 /* 1969 * vm_page_alloc: 1970 * 1971 * Allocate and return a page that is associated with the specified 1972 * object and offset pair. By default, this page is exclusive busied. 1973 * 1974 * The caller must always specify an allocation class. 1975 * 1976 * allocation classes: 1977 * VM_ALLOC_NORMAL normal process request 1978 * VM_ALLOC_SYSTEM system *really* needs a page 1979 * VM_ALLOC_INTERRUPT interrupt time request 1980 * 1981 * optional allocation flags: 1982 * VM_ALLOC_COUNT(number) the number of additional pages that the caller 1983 * intends to allocate 1984 * VM_ALLOC_NOBUSY do not exclusive busy the page 1985 * VM_ALLOC_NODUMP do not include the page in a kernel core dump 1986 * VM_ALLOC_NOFREE page will never be freed 1987 * VM_ALLOC_NOWAIT ignored (default behavior) 1988 * VM_ALLOC_SBUSY shared busy the allocated page 1989 * VM_ALLOC_WAITFAIL in case of failure, sleep before returning 1990 * VM_ALLOC_WIRED wire the allocated page 1991 * VM_ALLOC_ZERO prefer a zeroed page 1992 */ 1993 vm_page_t 1994 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) 1995 { 1996 struct pctrie_iter pages; 1997 1998 vm_page_iter_init(&pages, object); 1999 return (vm_page_alloc_iter(object, pindex, req, &pages)); 2000 } 2001 2002 /* 2003 * Allocate a page in the specified object with the given page index. If the 2004 * object lock is dropped and regained, the pages iter is reset. 2005 */ 2006 vm_page_t 2007 vm_page_alloc_iter(vm_object_t object, vm_pindex_t pindex, int req, 2008 struct pctrie_iter *pages) 2009 { 2010 struct vm_domainset_iter di; 2011 vm_page_t m; 2012 int domain; 2013 2014 vm_domainset_iter_page_init(&di, object, pindex, &domain, &req, 2015 pages); 2016 do { 2017 m = vm_page_alloc_domain_iter(object, pindex, domain, req, 2018 pages); 2019 if (m != NULL) 2020 break; 2021 } while (vm_domainset_iter_page(&di, object, &domain, pages) == 0); 2022 2023 return (m); 2024 } 2025 2026 /* 2027 * Returns true if the number of free pages exceeds the minimum 2028 * for the request class and false otherwise. 2029 */ 2030 static int 2031 _vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages) 2032 { 2033 u_int limit, old, new; 2034 2035 if (req_class == VM_ALLOC_INTERRUPT) 2036 limit = 0; 2037 else if (req_class == VM_ALLOC_SYSTEM) 2038 limit = vmd->vmd_interrupt_free_min; 2039 else 2040 limit = vmd->vmd_free_reserved; 2041 2042 /* 2043 * Attempt to reserve the pages. Fail if we're below the limit. 2044 */ 2045 limit += npages; 2046 old = atomic_load_int(&vmd->vmd_free_count); 2047 do { 2048 if (old < limit) 2049 return (0); 2050 new = old - npages; 2051 } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0); 2052 2053 /* Wake the page daemon if we've crossed the threshold. */ 2054 if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old)) 2055 pagedaemon_wakeup(vmd->vmd_domain); 2056 2057 /* Only update bitsets on transitions. */ 2058 if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || 2059 (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) 2060 vm_domain_set(vmd); 2061 2062 return (1); 2063 } 2064 2065 int 2066 vm_domain_allocate(struct vm_domain *vmd, int req, int npages) 2067 { 2068 int req_class; 2069 2070 /* 2071 * The page daemon is allowed to dig deeper into the free page list. 
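 * (Its requests are treated as VM_ALLOC_SYSTEM unless they are already
 * VM_ALLOC_INTERRUPT, so that reclamation itself is not starved for
 * pages.)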
2072 */ 2073 req_class = req & VM_ALLOC_CLASS_MASK; 2074 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 2075 req_class = VM_ALLOC_SYSTEM; 2076 return (_vm_domain_allocate(vmd, req_class, npages)); 2077 } 2078 2079 vm_page_t 2080 vm_page_alloc_domain_iter(vm_object_t object, vm_pindex_t pindex, int domain, 2081 int req, struct pctrie_iter *pages) 2082 { 2083 struct vm_domain *vmd; 2084 vm_page_t m; 2085 int flags; 2086 2087 #define VM_ALLOC_COMMON (VM_ALLOC_CLASS_MASK | VM_ALLOC_NODUMP | \ 2088 VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | \ 2089 VM_ALLOC_WIRED | VM_ALLOC_ZERO) 2090 #define VPA_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \ 2091 VM_ALLOC_NOBUSY | VM_ALLOC_NOFREE | \ 2092 VM_ALLOC_SBUSY) 2093 KASSERT((req & ~VPA_FLAGS) == 0, 2094 ("invalid request %#x", req)); 2095 KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != 2096 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), 2097 ("invalid request %#x", req)); 2098 VM_OBJECT_ASSERT_WLOCKED(object); 2099 2100 flags = 0; 2101 m = NULL; 2102 if (!vm_pager_can_alloc_page(object, pindex)) 2103 return (NULL); 2104 #if VM_NRESERVLEVEL > 0 2105 again: 2106 #endif 2107 if (__predict_false((req & VM_ALLOC_NOFREE) != 0)) { 2108 m = vm_page_alloc_nofree_domain(domain, req); 2109 if (m != NULL) 2110 goto found; 2111 } 2112 #if VM_NRESERVLEVEL > 0 2113 /* 2114 * Can we allocate the page from a reservation? 2115 */ 2116 if (vm_object_reserv(object) && 2117 (m = vm_reserv_alloc_page(object, pindex, domain, req, pages)) != 2118 NULL) { 2119 goto found; 2120 } 2121 #endif 2122 vmd = VM_DOMAIN(domain); 2123 if (vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone != NULL) { 2124 m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone, 2125 M_NOWAIT | M_NOVM); 2126 if (m != NULL) { 2127 flags |= PG_PCPU_CACHE; 2128 goto found; 2129 } 2130 } 2131 if (vm_domain_allocate(vmd, req, 1)) { 2132 /* 2133 * If not, allocate it from the free page queues. 2134 */ 2135 vm_domain_free_lock(vmd); 2136 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0); 2137 vm_domain_free_unlock(vmd); 2138 if (m == NULL) { 2139 vm_domain_freecnt_inc(vmd, 1); 2140 #if VM_NRESERVLEVEL > 0 2141 if (vm_reserv_reclaim_inactive(domain)) 2142 goto again; 2143 #endif 2144 } 2145 } 2146 if (m == NULL) { 2147 /* 2148 * Not allocatable, give up. 2149 */ 2150 (void)vm_domain_alloc_fail(vmd, object, req); 2151 if ((req & VM_ALLOC_WAITFAIL) != 0) 2152 pctrie_iter_reset(pages); 2153 return (NULL); 2154 } 2155 2156 /* 2157 * At this point we had better have found a good page. 2158 */ 2159 found: 2160 vm_page_dequeue(m); 2161 vm_page_alloc_check(m); 2162 2163 /* 2164 * Initialize the page. Only the PG_ZERO flag is inherited. 2165 */ 2166 flags |= m->flags & PG_ZERO; 2167 if ((req & VM_ALLOC_NODUMP) != 0) 2168 flags |= PG_NODUMP; 2169 if ((req & VM_ALLOC_NOFREE) != 0) 2170 flags |= PG_NOFREE; 2171 m->flags = flags; 2172 m->a.flags = 0; 2173 m->oflags = (object->flags & OBJ_UNMANAGED) != 0 ? 
VPO_UNMANAGED : 0; 2174 m->pool = VM_FREEPOOL_DEFAULT; 2175 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) 2176 m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; 2177 else if ((req & VM_ALLOC_SBUSY) != 0) 2178 m->busy_lock = VPB_SHARERS_WORD(1); 2179 else 2180 m->busy_lock = VPB_UNBUSIED; 2181 if (req & VM_ALLOC_WIRED) { 2182 vm_wire_add(1); 2183 m->ref_count = 1; 2184 } 2185 m->a.act_count = 0; 2186 2187 if (vm_page_iter_insert(m, object, pindex, pages)) { 2188 if (req & VM_ALLOC_WIRED) { 2189 vm_wire_sub(1); 2190 m->ref_count = 0; 2191 } 2192 KASSERT(m->object == NULL, ("page %p has object", m)); 2193 m->oflags = VPO_UNMANAGED; 2194 m->busy_lock = VPB_UNBUSIED; 2195 /* Don't change PG_ZERO. */ 2196 vm_page_free_toq(m); 2197 if (req & VM_ALLOC_WAITFAIL) { 2198 VM_OBJECT_WUNLOCK(object); 2199 vm_radix_wait(); 2200 pctrie_iter_reset(pages); 2201 VM_OBJECT_WLOCK(object); 2202 } 2203 return (NULL); 2204 } 2205 2206 /* Ignore device objects; the pager sets "memattr" for them. */ 2207 if (object->memattr != VM_MEMATTR_DEFAULT && 2208 (object->flags & OBJ_FICTITIOUS) == 0) 2209 pmap_page_set_memattr(m, object->memattr); 2210 2211 return (m); 2212 } 2213 2214 /* 2215 * vm_page_alloc_contig: 2216 * 2217 * Allocate a contiguous set of physical pages of the given size "npages" 2218 * from the free lists. All of the physical pages must be at or above 2219 * the given physical address "low" and below the given physical address 2220 * "high". The given value "alignment" determines the alignment of the 2221 * first physical page in the set. If the given value "boundary" is 2222 * non-zero, then the set of physical pages cannot cross any physical 2223 * address boundary that is a multiple of that value. Both "alignment" 2224 * and "boundary" must be a power of two. 2225 * 2226 * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, 2227 * then the memory attribute setting for the physical pages is configured 2228 * to the object's memory attribute setting. Otherwise, the memory 2229 * attribute setting for the physical pages is configured to "memattr", 2230 * overriding the object's memory attribute setting. However, if the 2231 * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the 2232 * memory attribute setting for the physical pages cannot be configured 2233 * to VM_MEMATTR_DEFAULT. 2234 * 2235 * The specified object may not contain fictitious pages. 2236 * 2237 * The caller must always specify an allocation class. 
2238 * 2239 * allocation classes: 2240 * VM_ALLOC_NORMAL normal process request 2241 * VM_ALLOC_SYSTEM system *really* needs the pages 2242 * VM_ALLOC_INTERRUPT interrupt time request 2243 * 2244 * optional allocation flags: 2245 * VM_ALLOC_NOBUSY do not exclusive busy the pages 2246 * VM_ALLOC_NODUMP do not include the pages in a kernel core dump 2247 * VM_ALLOC_NORECLAIM do not reclaim after initial failure 2248 * VM_ALLOC_NOWAIT ignored (default behavior) 2249 * VM_ALLOC_SBUSY shared busy the allocated pages 2250 * VM_ALLOC_WAITFAIL in case of failure, sleep before returning 2251 * VM_ALLOC_WIRED wire the allocated pages 2252 * VM_ALLOC_ZERO prefer zeroed pages 2253 */ 2254 vm_page_t 2255 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, 2256 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 2257 vm_paddr_t boundary, vm_memattr_t memattr) 2258 { 2259 struct vm_domainset_iter di; 2260 vm_page_t bounds[2]; 2261 vm_page_t m; 2262 int domain; 2263 int start_segind; 2264 2265 start_segind = -1; 2266 2267 vm_domainset_iter_page_init(&di, object, pindex, &domain, &req, NULL); 2268 do { 2269 m = vm_page_alloc_contig_domain(object, pindex, domain, req, 2270 npages, low, high, alignment, boundary, memattr); 2271 if (m != NULL) 2272 break; 2273 if (start_segind == -1) 2274 start_segind = vm_phys_lookup_segind(low); 2275 if (vm_phys_find_range(bounds, start_segind, domain, 2276 npages, low, high) == -1) { 2277 vm_domainset_iter_ignore(&di, domain); 2278 } 2279 } while (vm_domainset_iter_page(&di, object, &domain, NULL) == 0); 2280 2281 return (m); 2282 } 2283 2284 static vm_page_t 2285 vm_page_find_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, 2286 vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 2287 { 2288 struct vm_domain *vmd; 2289 vm_page_t m_ret; 2290 2291 /* 2292 * Can we allocate the pages without the number of free pages falling 2293 * below the lower bound for the allocation class? 2294 */ 2295 vmd = VM_DOMAIN(domain); 2296 if (!vm_domain_allocate(vmd, req, npages)) 2297 return (NULL); 2298 /* 2299 * Try to allocate the pages from the free page queues. 2300 */ 2301 vm_domain_free_lock(vmd); 2302 m_ret = vm_phys_alloc_contig(domain, npages, low, high, 2303 alignment, boundary); 2304 vm_domain_free_unlock(vmd); 2305 if (m_ret != NULL) 2306 return (m_ret); 2307 #if VM_NRESERVLEVEL > 0 2308 /* 2309 * Try to break a reservation to allocate the pages. 
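 * Breaking a partially populated reservation may expose enough
 * contiguous free pages to satisfy the request; callers that set
 * VM_ALLOC_NORECLAIM opt out of this second attempt.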
2310 */ 2311 if ((req & VM_ALLOC_NORECLAIM) == 0) { 2312 m_ret = vm_reserv_reclaim_contig(domain, npages, low, 2313 high, alignment, boundary); 2314 if (m_ret != NULL) 2315 return (m_ret); 2316 } 2317 #endif 2318 vm_domain_freecnt_inc(vmd, npages); 2319 return (NULL); 2320 } 2321 2322 vm_page_t 2323 vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, 2324 int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 2325 vm_paddr_t boundary, vm_memattr_t memattr) 2326 { 2327 struct pctrie_iter pages; 2328 vm_page_t m, m_ret, mpred; 2329 u_int busy_lock, flags, oflags; 2330 2331 #define VPAC_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \ 2332 VM_ALLOC_NOBUSY | VM_ALLOC_NORECLAIM | \ 2333 VM_ALLOC_SBUSY) 2334 KASSERT((req & ~VPAC_FLAGS) == 0, 2335 ("invalid request %#x", req)); 2336 KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != 2337 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), 2338 ("invalid request %#x", req)); 2339 VM_OBJECT_ASSERT_WLOCKED(object); 2340 KASSERT((object->flags & OBJ_FICTITIOUS) == 0, 2341 ("vm_page_alloc_contig: object %p has fictitious pages", 2342 object)); 2343 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); 2344 2345 vm_page_iter_init(&pages, object); 2346 m_ret = NULL; 2347 #if VM_NRESERVLEVEL > 0 2348 /* 2349 * Can we allocate the pages from a reservation? 2350 */ 2351 if (vm_object_reserv(object)) { 2352 m_ret = vm_reserv_alloc_contig(object, pindex, domain, 2353 req, npages, low, high, alignment, boundary, &pages); 2354 } 2355 #endif 2356 if (m_ret == NULL) { 2357 m_ret = vm_page_find_contig_domain(domain, req, npages, 2358 low, high, alignment, boundary); 2359 } 2360 if (m_ret == NULL) { 2361 (void)vm_domain_alloc_fail(VM_DOMAIN(domain), object, req); 2362 return (NULL); 2363 } 2364 2365 /* 2366 * Initialize the pages. Only the PG_ZERO flag is inherited. 2367 */ 2368 flags = PG_ZERO; 2369 if ((req & VM_ALLOC_NODUMP) != 0) 2370 flags |= PG_NODUMP; 2371 oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; 2372 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) 2373 busy_lock = VPB_CURTHREAD_EXCLUSIVE; 2374 else if ((req & VM_ALLOC_SBUSY) != 0) 2375 busy_lock = VPB_SHARERS_WORD(1); 2376 else 2377 busy_lock = VPB_UNBUSIED; 2378 if ((req & VM_ALLOC_WIRED) != 0) 2379 vm_wire_add(npages); 2380 if (object->memattr != VM_MEMATTR_DEFAULT && 2381 memattr == VM_MEMATTR_DEFAULT) 2382 memattr = object->memattr; 2383 for (m = m_ret; m < &m_ret[npages]; m++) { 2384 vm_page_dequeue(m); 2385 vm_page_alloc_check(m); 2386 m->a.flags = 0; 2387 m->flags = (m->flags | PG_NODUMP) & flags; 2388 m->busy_lock = busy_lock; 2389 if ((req & VM_ALLOC_WIRED) != 0) 2390 m->ref_count = 1; 2391 m->a.act_count = 0; 2392 m->oflags = oflags; 2393 m->pool = VM_FREEPOOL_DEFAULT; 2394 if (vm_page_iter_insert(m, object, pindex, &pages)) { 2395 if ((req & VM_ALLOC_WIRED) != 0) 2396 vm_wire_sub(npages); 2397 KASSERT(m->object == NULL, 2398 ("page %p has object", m)); 2399 mpred = m; 2400 for (m = m_ret; m < &m_ret[npages]; m++) { 2401 if (m <= mpred && 2402 (req & VM_ALLOC_WIRED) != 0) 2403 m->ref_count = 0; 2404 m->oflags = VPO_UNMANAGED; 2405 m->busy_lock = VPB_UNBUSIED; 2406 /* Don't change PG_ZERO. 
*/ 2407 vm_page_free_toq(m); 2408 } 2409 if (req & VM_ALLOC_WAITFAIL) { 2410 VM_OBJECT_WUNLOCK(object); 2411 vm_radix_wait(); 2412 VM_OBJECT_WLOCK(object); 2413 } 2414 return (NULL); 2415 } 2416 if (memattr != VM_MEMATTR_DEFAULT) 2417 pmap_page_set_memattr(m, memattr); 2418 pindex++; 2419 } 2420 return (m_ret); 2421 } 2422 2423 /* 2424 * Allocate a physical page that is not intended to be inserted into a VM 2425 * object. 2426 */ 2427 vm_page_t 2428 vm_page_alloc_noobj_domain(int domain, int req) 2429 { 2430 struct vm_domain *vmd; 2431 vm_page_t m; 2432 int flags; 2433 2434 #define VPAN_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \ 2435 VM_ALLOC_NOFREE | VM_ALLOC_WAITOK) 2436 KASSERT((req & ~VPAN_FLAGS) == 0, 2437 ("invalid request %#x", req)); 2438 2439 flags = ((req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0) | 2440 ((req & VM_ALLOC_NOFREE) != 0 ? PG_NOFREE : 0); 2441 vmd = VM_DOMAIN(domain); 2442 again: 2443 if (__predict_false((req & VM_ALLOC_NOFREE) != 0)) { 2444 m = vm_page_alloc_nofree_domain(domain, req); 2445 if (m != NULL) 2446 goto found; 2447 } 2448 2449 if (vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) { 2450 m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone, 2451 M_NOWAIT | M_NOVM); 2452 if (m != NULL) { 2453 flags |= PG_PCPU_CACHE; 2454 goto found; 2455 } 2456 } 2457 2458 if (vm_domain_allocate(vmd, req, 1)) { 2459 vm_domain_free_lock(vmd); 2460 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0); 2461 vm_domain_free_unlock(vmd); 2462 if (m == NULL) { 2463 vm_domain_freecnt_inc(vmd, 1); 2464 #if VM_NRESERVLEVEL > 0 2465 if (vm_reserv_reclaim_inactive(domain)) 2466 goto again; 2467 #endif 2468 } 2469 } 2470 if (m == NULL) { 2471 if (!vm_domain_alloc_fail(vmd, NULL, req)) 2472 return (NULL); 2473 goto again; 2474 } 2475 2476 found: 2477 /* 2478 * If the page comes from the free page cache, then it might still 2479 * have a pending deferred dequeue. Specifically, when the page is 2480 * imported from a different pool by vm_phys_alloc_npages(), the 2481 * second, third, etc. pages in a non-zero order set could have 2482 * pending deferred dequeues. 2483 */ 2484 vm_page_dequeue(m); 2485 vm_page_alloc_check(m); 2486 2487 /* 2488 * Consumers should not rely on a useful default pindex value. 2489 */ 2490 m->pindex = 0xdeadc0dedeadc0de; 2491 m->flags = (m->flags & PG_ZERO) | flags; 2492 m->a.flags = 0; 2493 m->oflags = VPO_UNMANAGED; 2494 m->pool = VM_FREEPOOL_DIRECT; 2495 m->busy_lock = VPB_UNBUSIED; 2496 if ((req & VM_ALLOC_WIRED) != 0) { 2497 vm_wire_add(1); 2498 m->ref_count = 1; 2499 } 2500 2501 if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) 2502 pmap_zero_page(m); 2503 2504 return (m); 2505 } 2506 2507 #if VM_NRESERVLEVEL > 1 2508 #define VM_NOFREE_IMPORT_ORDER (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER) 2509 #elif VM_NRESERVLEVEL > 0 2510 #define VM_NOFREE_IMPORT_ORDER VM_LEVEL_0_ORDER 2511 #else 2512 #define VM_NOFREE_IMPORT_ORDER 8 2513 #endif 2514 2515 /* 2516 * Allocate a single NOFREE page. 2517 * 2518 * This routine hands out NOFREE pages from higher-order 2519 * physical memory blocks in order to reduce memory fragmentation. 2520 * When the NOFREE chunk for a given domain is used up, 2521 * the routine will try to fetch a new one from the freelists 2522 * and discard the old one.
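 *
 * The head page's ref_count temporarily records the number of pages
 * remaining behind it in the chunk; it is cleared before the page is
 * handed out.
 *
 * An illustrative (hypothetical) consumer might request a permanently
 * wired, zero-filled page that is never returned to the free lists:
 *
 *	m = vm_page_alloc_noobj(VM_ALLOC_NOFREE | VM_ALLOC_WIRED |
 *	    VM_ALLOC_ZERO);
 *	if (m == NULL)
 *		return (ENOMEM);	/* sketch only; error handling is
 *					   up to the caller */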
2523 */ 2524 static vm_page_t __noinline 2525 vm_page_alloc_nofree_domain(int domain, int req) 2526 { 2527 vm_page_t m; 2528 struct vm_domain *vmd; 2529 2530 KASSERT((req & VM_ALLOC_NOFREE) != 0, ("invalid request %#x", req)); 2531 2532 vmd = VM_DOMAIN(domain); 2533 vm_domain_free_lock(vmd); 2534 if (TAILQ_EMPTY(&vmd->vmd_nofreeq)) { 2535 int count; 2536 2537 count = 1 << VM_NOFREE_IMPORT_ORDER; 2538 if (!vm_domain_allocate(vmd, req, count)) { 2539 vm_domain_free_unlock(vmd); 2540 return (NULL); 2541 } 2542 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 2543 VM_NOFREE_IMPORT_ORDER); 2544 if (m == NULL) { 2545 vm_domain_freecnt_inc(vmd, count); 2546 vm_domain_free_unlock(vmd); 2547 return (NULL); 2548 } 2549 m->ref_count = count - 1; 2550 TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m, plinks.q); 2551 atomic_add_long(&nofreeq_size, count); 2552 } 2553 m = TAILQ_FIRST(&vmd->vmd_nofreeq); 2554 TAILQ_REMOVE(&vmd->vmd_nofreeq, m, plinks.q); 2555 if (m->ref_count > 0) { 2556 vm_page_t m_next; 2557 2558 m_next = &m[1]; 2559 vm_page_dequeue(m_next); 2560 m_next->ref_count = m->ref_count - 1; 2561 TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m_next, plinks.q); 2562 m->ref_count = 0; 2563 } 2564 vm_domain_free_unlock(vmd); 2565 atomic_add_long(&nofreeq_size, -1); 2566 VM_CNT_INC(v_nofree_count); 2567 2568 return (m); 2569 } 2570 2571 /* 2572 * Though a NOFREE page by definition should not be freed, we support putting 2573 * them aside for future NOFREE allocations. This enables code which allocates 2574 * NOFREE pages for some purpose but then encounters an error and releases 2575 * resources. 2576 */ 2577 static void __noinline 2578 vm_page_free_nofree(struct vm_domain *vmd, vm_page_t m) 2579 { 2580 VM_CNT_ADD(v_nofree_count, -1); 2581 atomic_add_long(&nofreeq_size, 1); 2582 vm_domain_free_lock(vmd); 2583 MPASS(m->ref_count == 0); 2584 TAILQ_INSERT_HEAD(&vmd->vmd_nofreeq, m, plinks.q); 2585 vm_domain_free_unlock(vmd); 2586 } 2587 2588 vm_page_t 2589 vm_page_alloc_noobj(int req) 2590 { 2591 struct vm_domainset_iter di; 2592 vm_page_t m; 2593 int domain; 2594 2595 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL); 2596 do { 2597 m = vm_page_alloc_noobj_domain(domain, req); 2598 if (m != NULL) 2599 break; 2600 } while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0); 2601 2602 return (m); 2603 } 2604 2605 vm_page_t 2606 vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low, 2607 vm_paddr_t high, u_long alignment, vm_paddr_t boundary, 2608 vm_memattr_t memattr) 2609 { 2610 struct vm_domainset_iter di; 2611 vm_page_t m; 2612 int domain; 2613 2614 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL); 2615 do { 2616 m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low, 2617 high, alignment, boundary, memattr); 2618 if (m != NULL) 2619 break; 2620 } while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0); 2621 2622 return (m); 2623 } 2624 2625 vm_page_t 2626 vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages, 2627 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, 2628 vm_memattr_t memattr) 2629 { 2630 vm_page_t m, m_ret; 2631 u_int flags; 2632 2633 #define VPANC_FLAGS (VM_ALLOC_COMMON | VM_ALLOC_COUNT_MASK | \ 2634 VM_ALLOC_NORECLAIM | VM_ALLOC_WAITOK) 2635 KASSERT((req & ~VPANC_FLAGS) == 0, 2636 ("invalid request %#x", req)); 2637 KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != 2638 (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM), 2639 ("invalid request %#x", req)); 2640 KASSERT(npages > 0, ("vm_page_alloc_contig: 
npages is zero")); 2641 2642 while ((m_ret = vm_page_find_contig_domain(domain, req, npages, 2643 low, high, alignment, boundary)) == NULL) { 2644 if (!vm_domain_alloc_fail(VM_DOMAIN(domain), NULL, req)) 2645 return (NULL); 2646 } 2647 2648 /* 2649 * Initialize the pages. Only the PG_ZERO flag is inherited. 2650 */ 2651 flags = PG_ZERO; 2652 if ((req & VM_ALLOC_NODUMP) != 0) 2653 flags |= PG_NODUMP; 2654 if ((req & VM_ALLOC_WIRED) != 0) 2655 vm_wire_add(npages); 2656 for (m = m_ret; m < &m_ret[npages]; m++) { 2657 vm_page_dequeue(m); 2658 vm_page_alloc_check(m); 2659 2660 /* 2661 * Consumers should not rely on a useful default pindex value. 2662 */ 2663 m->pindex = 0xdeadc0dedeadc0de; 2664 m->a.flags = 0; 2665 m->flags = (m->flags | PG_NODUMP) & flags; 2666 m->busy_lock = VPB_UNBUSIED; 2667 if ((req & VM_ALLOC_WIRED) != 0) 2668 m->ref_count = 1; 2669 m->a.act_count = 0; 2670 m->oflags = VPO_UNMANAGED; 2671 m->pool = VM_FREEPOOL_DIRECT; 2672 2673 /* 2674 * Zero the page before updating any mappings since the page is 2675 * not yet shared with any devices which might require the 2676 * non-default memory attribute. pmap_page_set_memattr() 2677 * flushes data caches before returning. 2678 */ 2679 if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) 2680 pmap_zero_page(m); 2681 if (memattr != VM_MEMATTR_DEFAULT) 2682 pmap_page_set_memattr(m, memattr); 2683 } 2684 return (m_ret); 2685 } 2686 2687 /* 2688 * Check a page that has been freshly dequeued from a freelist. 2689 */ 2690 static void 2691 vm_page_alloc_check(vm_page_t m) 2692 { 2693 2694 KASSERT(m->object == NULL, ("page %p has object", m)); 2695 KASSERT(m->a.queue == PQ_NONE && 2696 (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, 2697 ("page %p has unexpected queue %d, flags %#x", 2698 m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK))); 2699 KASSERT(m->ref_count == 0, ("page %p has references", m)); 2700 KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m)); 2701 KASSERT(m->dirty == 0, ("page %p is dirty", m)); 2702 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 2703 ("page %p has unexpected memattr %d", 2704 m, pmap_page_get_memattr(m))); 2705 KASSERT(vm_page_none_valid(m), ("free page %p is valid", m)); 2706 pmap_vm_page_alloc_check(m); 2707 } 2708 2709 static int 2710 vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) 2711 { 2712 struct vm_domain *vmd; 2713 struct vm_pgcache *pgcache; 2714 int i; 2715 2716 pgcache = arg; 2717 vmd = VM_DOMAIN(pgcache->domain); 2718 2719 /* 2720 * The page daemon should avoid creating extra memory pressure since its 2721 * main purpose is to replenish the store of free pages. 
2722 */ 2723 if (vmd->vmd_severeset || curproc == pageproc || 2724 !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) 2725 return (0); 2726 domain = vmd->vmd_domain; 2727 vm_domain_free_lock(vmd); 2728 i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, 2729 (vm_page_t *)store); 2730 vm_domain_free_unlock(vmd); 2731 if (cnt != i) 2732 vm_domain_freecnt_inc(vmd, cnt - i); 2733 2734 return (i); 2735 } 2736 2737 static void 2738 vm_page_zone_release(void *arg, void **store, int cnt) 2739 { 2740 struct vm_domain *vmd; 2741 struct vm_pgcache *pgcache; 2742 vm_page_t m; 2743 int i; 2744 2745 pgcache = arg; 2746 vmd = VM_DOMAIN(pgcache->domain); 2747 vm_domain_free_lock(vmd); 2748 for (i = 0; i < cnt; i++) { 2749 m = (vm_page_t)store[i]; 2750 vm_phys_free_pages(m, pgcache->pool, 0); 2751 } 2752 vm_domain_free_unlock(vmd); 2753 vm_domain_freecnt_inc(vmd, cnt); 2754 } 2755 2756 #define VPSC_ANY 0 /* No restrictions. */ 2757 #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ 2758 #define VPSC_NOSUPER 2 /* Skip superpages. */ 2759 2760 /* 2761 * vm_page_scan_contig: 2762 * 2763 * Scan vm_page_array[] between the specified entries "m_start" and 2764 * "m_end" for a run of contiguous physical pages that satisfy the 2765 * specified conditions, and return the lowest page in the run. The 2766 * specified "alignment" determines the alignment of the lowest physical 2767 * page in the run. If the specified "boundary" is non-zero, then the 2768 * run of physical pages cannot span a physical address that is a 2769 * multiple of "boundary". 2770 * 2771 * "m_end" is never dereferenced, so it need not point to a vm_page 2772 * structure within vm_page_array[]. 2773 * 2774 * "npages" must be greater than zero. "m_start" and "m_end" must not 2775 * span a hole (or discontiguity) in the physical address space. Both 2776 * "alignment" and "boundary" must be a power of two. 2777 */ 2778 static vm_page_t 2779 vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, 2780 u_long alignment, vm_paddr_t boundary, int options) 2781 { 2782 vm_object_t object; 2783 vm_paddr_t pa; 2784 vm_page_t m, m_run; 2785 #if VM_NRESERVLEVEL > 0 2786 int level; 2787 #endif 2788 int m_inc, order, run_ext, run_len; 2789 2790 KASSERT(npages > 0, ("npages is 0")); 2791 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 2792 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 2793 m_run = NULL; 2794 run_len = 0; 2795 for (m = m_start; m < m_end && run_len < npages; m += m_inc) { 2796 KASSERT((m->flags & PG_MARKER) == 0, 2797 ("page %p is PG_MARKER", m)); 2798 KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1, 2799 ("fictitious page %p has invalid ref count", m)); 2800 2801 /* 2802 * If the current page would be the start of a run, check its 2803 * physical address against the end, alignment, and boundary 2804 * conditions. If it doesn't satisfy these conditions, either 2805 * terminate the scan or advance to the next page that 2806 * satisfies the failed condition. 
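 * For example, with 4KB pages, pa == 0x3000 and alignment == 0x10000,
 * roundup2(pa, alignment) is 0x10000 and the scan advances by
 * atop(0xd000) == 13 pages to the next aligned candidate.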
2807 */ 2808 if (run_len == 0) { 2809 KASSERT(m_run == NULL, ("m_run != NULL")); 2810 if (m + npages > m_end) 2811 break; 2812 pa = VM_PAGE_TO_PHYS(m); 2813 if (!vm_addr_align_ok(pa, alignment)) { 2814 m_inc = atop(roundup2(pa, alignment) - pa); 2815 continue; 2816 } 2817 if (!vm_addr_bound_ok(pa, ptoa(npages), boundary)) { 2818 m_inc = atop(roundup2(pa, boundary) - pa); 2819 continue; 2820 } 2821 } else 2822 KASSERT(m_run != NULL, ("m_run == NULL")); 2823 2824 retry: 2825 m_inc = 1; 2826 if (vm_page_wired(m)) 2827 run_ext = 0; 2828 #if VM_NRESERVLEVEL > 0 2829 else if ((level = vm_reserv_level(m)) >= 0 && 2830 (options & VPSC_NORESERV) != 0) { 2831 run_ext = 0; 2832 /* Advance to the end of the reservation. */ 2833 pa = VM_PAGE_TO_PHYS(m); 2834 m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - 2835 pa); 2836 } 2837 #endif 2838 else if ((object = atomic_load_ptr(&m->object)) != NULL) { 2839 /* 2840 * The page is considered eligible for relocation if 2841 * and only if it could be laundered or reclaimed by 2842 * the page daemon. 2843 */ 2844 VM_OBJECT_RLOCK(object); 2845 if (object != m->object) { 2846 VM_OBJECT_RUNLOCK(object); 2847 goto retry; 2848 } 2849 /* Don't care: PG_NODUMP, PG_ZERO. */ 2850 if ((object->flags & OBJ_SWAP) == 0 && 2851 object->type != OBJT_VNODE) { 2852 run_ext = 0; 2853 #if VM_NRESERVLEVEL > 0 2854 } else if ((options & VPSC_NOSUPER) != 0 && 2855 (level = vm_reserv_level_iffullpop(m)) >= 0) { 2856 run_ext = 0; 2857 /* Advance to the end of the superpage. */ 2858 pa = VM_PAGE_TO_PHYS(m); 2859 m_inc = atop(roundup2(pa + 1, 2860 vm_reserv_size(level)) - pa); 2861 #endif 2862 } else if (object->memattr == VM_MEMATTR_DEFAULT && 2863 vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { 2864 /* 2865 * The page is allocated but eligible for 2866 * relocation. Extend the current run by one 2867 * page. 2868 */ 2869 KASSERT(pmap_page_get_memattr(m) == 2870 VM_MEMATTR_DEFAULT, 2871 ("page %p has an unexpected memattr", m)); 2872 KASSERT((m->oflags & (VPO_SWAPINPROG | 2873 VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, 2874 ("page %p has unexpected oflags", m)); 2875 /* Don't care: PGA_NOSYNC. */ 2876 run_ext = 1; 2877 } else 2878 run_ext = 0; 2879 VM_OBJECT_RUNLOCK(object); 2880 #if VM_NRESERVLEVEL > 0 2881 } else if (level >= 0) { 2882 /* 2883 * The page is reserved but not yet allocated. In 2884 * other words, it is still free. Extend the current 2885 * run by one page. 2886 */ 2887 run_ext = 1; 2888 #endif 2889 } else if ((order = m->order) < VM_NFREEORDER) { 2890 /* 2891 * The page is enqueued in the physical memory 2892 * allocator's free page queues. Moreover, it is the 2893 * first page in a power-of-two-sized run of 2894 * contiguous free pages. Add these pages to the end 2895 * of the current run, and jump ahead. 2896 */ 2897 run_ext = 1 << order; 2898 m_inc = 1 << order; 2899 } else { 2900 /* 2901 * Skip the page for one of the following reasons: (1) 2902 * It is enqueued in the physical memory allocator's 2903 * free page queues. However, it is not the first 2904 * page in a run of contiguous free pages. (This case 2905 * rarely occurs because the scan is performed in 2906 * ascending order.) (2) It is not reserved, and it is 2907 * transitioning from free to allocated. (Conversely, 2908 * the transition from allocated to free for managed 2909 * pages is blocked by the page busy lock.) (3) It is 2910 * allocated but not contained by an object and not 2911 * wired, e.g., allocated by Xen's balloon driver. 
2912 */ 2913 run_ext = 0; 2914 } 2915 2916 /* 2917 * Extend or reset the current run of pages. 2918 */ 2919 if (run_ext > 0) { 2920 if (run_len == 0) 2921 m_run = m; 2922 run_len += run_ext; 2923 } else { 2924 if (run_len > 0) { 2925 m_run = NULL; 2926 run_len = 0; 2927 } 2928 } 2929 } 2930 if (run_len >= npages) 2931 return (m_run); 2932 return (NULL); 2933 } 2934 2935 /* 2936 * vm_page_reclaim_run: 2937 * 2938 * Try to relocate each of the allocated virtual pages within the 2939 * specified run of physical pages to a new physical address. Free the 2940 * physical pages underlying the relocated virtual pages. A virtual page 2941 * is relocatable if and only if it could be laundered or reclaimed by 2942 * the page daemon. Whenever possible, a virtual page is relocated to a 2943 * physical address above "high". 2944 * 2945 * Returns 0 if every physical page within the run was already free or 2946 * just freed by a successful relocation. Otherwise, returns a non-zero 2947 * value indicating why the last attempt to relocate a virtual page was 2948 * unsuccessful. 2949 * 2950 * "req_class" must be an allocation class. 2951 */ 2952 static int 2953 vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, 2954 vm_paddr_t high) 2955 { 2956 struct vm_domain *vmd; 2957 struct spglist free; 2958 vm_object_t object; 2959 vm_paddr_t pa; 2960 vm_page_t m, m_end, m_new; 2961 int error, order, req; 2962 2963 KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, 2964 ("req_class is not an allocation class")); 2965 SLIST_INIT(&free); 2966 error = 0; 2967 m = m_run; 2968 m_end = m_run + npages; 2969 for (; error == 0 && m < m_end; m++) { 2970 KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, 2971 ("page %p is PG_FICTITIOUS or PG_MARKER", m)); 2972 2973 /* 2974 * Racily check for wirings. Races are handled once the object 2975 * lock is held and the page is unmapped. 2976 */ 2977 if (vm_page_wired(m)) 2978 error = EBUSY; 2979 else if ((object = atomic_load_ptr(&m->object)) != NULL) { 2980 /* 2981 * The page is relocated if and only if it could be 2982 * laundered or reclaimed by the page daemon. 2983 */ 2984 VM_OBJECT_WLOCK(object); 2985 /* Don't care: PG_NODUMP, PG_ZERO. */ 2986 if (m->object != object || 2987 ((object->flags & OBJ_SWAP) == 0 && 2988 object->type != OBJT_VNODE)) 2989 error = EINVAL; 2990 else if (object->memattr != VM_MEMATTR_DEFAULT) 2991 error = EINVAL; 2992 else if (vm_page_queue(m) != PQ_NONE && 2993 vm_page_tryxbusy(m) != 0) { 2994 if (vm_page_wired(m)) { 2995 vm_page_xunbusy(m); 2996 error = EBUSY; 2997 goto unlock; 2998 } 2999 KASSERT(pmap_page_get_memattr(m) == 3000 VM_MEMATTR_DEFAULT, 3001 ("page %p has an unexpected memattr", m)); 3002 KASSERT(m->oflags == 0, 3003 ("page %p has unexpected oflags", m)); 3004 /* Don't care: PGA_NOSYNC. */ 3005 if (!vm_page_none_valid(m)) { 3006 /* 3007 * First, try to allocate a new page 3008 * that is above "high". Failing 3009 * that, try to allocate a new page 3010 * that is below "m_run". Allocate 3011 * the new page between the end of 3012 * "m_run" and "high" only as a last 3013 * resort. 
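 * A replacement taken from within the
 * caller's [low, high] window would
 * consume a page that the subsequent
 * contiguous allocation may itself
 * need, so look outside that window
 * first.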
3014 */ 3015 req = req_class; 3016 if ((m->flags & PG_NODUMP) != 0) 3017 req |= VM_ALLOC_NODUMP; 3018 if (trunc_page(high) != 3019 ~(vm_paddr_t)PAGE_MASK) { 3020 m_new = 3021 vm_page_alloc_noobj_contig( 3022 req, 1, round_page(high), 3023 ~(vm_paddr_t)0, PAGE_SIZE, 3024 0, VM_MEMATTR_DEFAULT); 3025 } else 3026 m_new = NULL; 3027 if (m_new == NULL) { 3028 pa = VM_PAGE_TO_PHYS(m_run); 3029 m_new = 3030 vm_page_alloc_noobj_contig( 3031 req, 1, 0, pa - 1, 3032 PAGE_SIZE, 0, 3033 VM_MEMATTR_DEFAULT); 3034 } 3035 if (m_new == NULL) { 3036 pa += ptoa(npages); 3037 m_new = 3038 vm_page_alloc_noobj_contig( 3039 req, 1, pa, high, PAGE_SIZE, 3040 0, VM_MEMATTR_DEFAULT); 3041 } 3042 if (m_new == NULL) { 3043 vm_page_xunbusy(m); 3044 error = ENOMEM; 3045 goto unlock; 3046 } 3047 3048 /* 3049 * Unmap the page and check for new 3050 * wirings that may have been acquired 3051 * through a pmap lookup. 3052 */ 3053 if (object->ref_count != 0 && 3054 !vm_page_try_remove_all(m)) { 3055 vm_page_xunbusy(m); 3056 vm_page_free(m_new); 3057 error = EBUSY; 3058 goto unlock; 3059 } 3060 3061 /* 3062 * Replace "m" with the new page. For 3063 * vm_page_replace(), "m" must be busy 3064 * and dequeued. Finally, change "m" 3065 * as if vm_page_free() was called. 3066 */ 3067 m_new->a.flags = m->a.flags & 3068 ~PGA_QUEUE_STATE_MASK; 3069 KASSERT(m_new->oflags == VPO_UNMANAGED, 3070 ("page %p is managed", m_new)); 3071 m_new->oflags = 0; 3072 pmap_copy_page(m, m_new); 3073 m_new->valid = m->valid; 3074 m_new->dirty = m->dirty; 3075 m->flags &= ~PG_ZERO; 3076 vm_page_dequeue(m); 3077 if (vm_page_replace_hold(m_new, object, 3078 m->pindex, m) && 3079 vm_page_free_prep(m)) 3080 SLIST_INSERT_HEAD(&free, m, 3081 plinks.s.ss); 3082 3083 /* 3084 * The new page must be deactivated 3085 * before the object is unlocked. 3086 */ 3087 vm_page_deactivate(m_new); 3088 } else { 3089 m->flags &= ~PG_ZERO; 3090 vm_page_dequeue(m); 3091 if (vm_page_free_prep(m)) 3092 SLIST_INSERT_HEAD(&free, m, 3093 plinks.s.ss); 3094 KASSERT(m->dirty == 0, 3095 ("page %p is dirty", m)); 3096 } 3097 } else 3098 error = EBUSY; 3099 unlock: 3100 VM_OBJECT_WUNLOCK(object); 3101 } else { 3102 MPASS(vm_page_domain(m) == domain); 3103 vmd = VM_DOMAIN(domain); 3104 vm_domain_free_lock(vmd); 3105 order = m->order; 3106 if (order < VM_NFREEORDER) { 3107 /* 3108 * The page is enqueued in the physical memory 3109 * allocator's free page queues. Moreover, it 3110 * is the first page in a power-of-two-sized 3111 * run of contiguous free pages. Jump ahead 3112 * to the last page within that run, and 3113 * continue from there. 
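 * For example, an order-3 run covers
 * eight pages: advance by seven and let
 * the loop's m++ step onto the first
 * page past the run.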
3114 */ 3115 m += (1 << order) - 1; 3116 } 3117 #if VM_NRESERVLEVEL > 0 3118 else if (vm_reserv_is_page_free(m)) 3119 order = 0; 3120 #endif 3121 vm_domain_free_unlock(vmd); 3122 if (order == VM_NFREEORDER) 3123 error = EINVAL; 3124 } 3125 } 3126 if ((m = SLIST_FIRST(&free)) != NULL) { 3127 int cnt; 3128 3129 vmd = VM_DOMAIN(domain); 3130 cnt = 0; 3131 vm_domain_free_lock(vmd); 3132 do { 3133 MPASS(vm_page_domain(m) == domain); 3134 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3135 vm_phys_free_pages(m, m->pool, 0); 3136 cnt++; 3137 } while ((m = SLIST_FIRST(&free)) != NULL); 3138 vm_domain_free_unlock(vmd); 3139 vm_domain_freecnt_inc(vmd, cnt); 3140 } 3141 return (error); 3142 } 3143 3144 #define NRUNS 16 3145 3146 #define RUN_INDEX(count, nruns) ((count) % (nruns)) 3147 3148 #define MIN_RECLAIM 8 3149 3150 /* 3151 * vm_page_reclaim_contig: 3152 * 3153 * Reclaim allocated, contiguous physical memory satisfying the specified 3154 * conditions by relocating the virtual pages using that physical memory. 3155 * Returns 0 if reclamation is successful, ERANGE if the specified domain 3156 * can't possibly satisfy the reclamation request, or ENOMEM if not 3157 * currently able to reclaim the requested number of pages. Since 3158 * relocation requires the allocation of physical pages, reclamation may 3159 * fail with ENOMEM due to a shortage of free pages. When reclamation 3160 * fails in this manner, callers are expected to perform vm_wait() before 3161 * retrying a failed allocation operation, e.g., vm_page_alloc_contig(). 3162 * 3163 * The caller must always specify an allocation class through "req". 3164 * 3165 * allocation classes: 3166 * VM_ALLOC_NORMAL normal process request 3167 * VM_ALLOC_SYSTEM system *really* needs a page 3168 * VM_ALLOC_INTERRUPT interrupt time request 3169 * 3170 * The optional allocation flags are ignored. 3171 * 3172 * "npages" must be greater than zero. Both "alignment" and "boundary" 3173 * must be a power of two. 3174 */ 3175 int 3176 vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages, 3177 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, 3178 int desired_runs) 3179 { 3180 struct vm_domain *vmd; 3181 vm_page_t bounds[2], m_run, _m_runs[NRUNS], *m_runs; 3182 u_long count, minalign, reclaimed; 3183 int error, i, min_reclaim, nruns, options, req_class; 3184 int segind, start_segind; 3185 int ret; 3186 3187 KASSERT(npages > 0, ("npages is 0")); 3188 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 3189 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 3190 3191 ret = ENOMEM; 3192 3193 /* 3194 * If the caller wants to reclaim multiple runs, try to allocate 3195 * space to store the runs. If that fails, fall back to the old 3196 * behavior of just reclaiming MIN_RECLAIM pages. 3197 */ 3198 if (desired_runs > 1) 3199 m_runs = malloc((NRUNS + desired_runs) * sizeof(*m_runs), 3200 M_TEMP, M_NOWAIT); 3201 else 3202 m_runs = NULL; 3203 3204 if (m_runs == NULL) { 3205 m_runs = _m_runs; 3206 nruns = NRUNS; 3207 } else { 3208 nruns = NRUNS + desired_runs - 1; 3209 } 3210 min_reclaim = MAX(desired_runs * npages, MIN_RECLAIM); 3211 3212 /* 3213 * The caller will attempt an allocation after some runs have been 3214 * reclaimed and added to the vm_phys buddy lists. Due to limitations 3215 * of vm_phys_alloc_contig(), round up the requested length to the next 3216 * power of two or maximum chunk size, and ensure that each run is 3217 * suitably aligned. 
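 * For example, a request for 3 pages is rounded up to 4 (order 2), and
 * "alignment" is raised to at least ptoa(4), assuming VM_NFREEORDER
 * allows order-2 chunks.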
3218 */ 3219 minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1); 3220 npages = roundup2(npages, minalign); 3221 if (alignment < ptoa(minalign)) 3222 alignment = ptoa(minalign); 3223 3224 /* 3225 * The page daemon is allowed to dig deeper into the free page list. 3226 */ 3227 req_class = req & VM_ALLOC_CLASS_MASK; 3228 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 3229 req_class = VM_ALLOC_SYSTEM; 3230 3231 start_segind = vm_phys_lookup_segind(low); 3232 3233 /* 3234 * Return if the number of free pages cannot satisfy the requested 3235 * allocation. 3236 */ 3237 vmd = VM_DOMAIN(domain); 3238 count = vmd->vmd_free_count; 3239 if (count < npages + vmd->vmd_free_reserved || (count < npages + 3240 vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || 3241 (count < npages && req_class == VM_ALLOC_INTERRUPT)) 3242 goto done; 3243 3244 /* 3245 * Scan up to three times, relaxing the restrictions ("options") on 3246 * the reclamation of reservations and superpages each time. 3247 */ 3248 for (options = VPSC_NORESERV;;) { 3249 bool phys_range_exists = false; 3250 3251 /* 3252 * Find the highest runs that satisfy the given constraints 3253 * and restrictions, and record them in "m_runs". 3254 */ 3255 count = 0; 3256 segind = start_segind; 3257 while ((segind = vm_phys_find_range(bounds, segind, domain, 3258 npages, low, high)) != -1) { 3259 phys_range_exists = true; 3260 while ((m_run = vm_page_scan_contig(npages, bounds[0], 3261 bounds[1], alignment, boundary, options))) { 3262 bounds[0] = m_run + npages; 3263 m_runs[RUN_INDEX(count, nruns)] = m_run; 3264 count++; 3265 } 3266 segind++; 3267 } 3268 3269 if (!phys_range_exists) { 3270 ret = ERANGE; 3271 goto done; 3272 } 3273 3274 /* 3275 * Reclaim the highest runs in LIFO (descending) order until 3276 * the number of reclaimed pages, "reclaimed", is at least 3277 * "min_reclaim". Reset "reclaimed" each time because each 3278 * reclamation is idempotent, and runs will (likely) recur 3279 * from one scan to the next as restrictions are relaxed. 3280 */ 3281 reclaimed = 0; 3282 for (i = 0; count > 0 && i < nruns; i++) { 3283 count--; 3284 m_run = m_runs[RUN_INDEX(count, nruns)]; 3285 error = vm_page_reclaim_run(req_class, domain, npages, 3286 m_run, high); 3287 if (error == 0) { 3288 reclaimed += npages; 3289 if (reclaimed >= min_reclaim) { 3290 ret = 0; 3291 goto done; 3292 } 3293 } 3294 } 3295 3296 /* 3297 * Either relax the restrictions on the next scan or return if 3298 * the last scan had no restrictions. 
3299 */ 3300 if (options == VPSC_NORESERV) 3301 options = VPSC_NOSUPER; 3302 else if (options == VPSC_NOSUPER) 3303 options = VPSC_ANY; 3304 else if (options == VPSC_ANY) { 3305 if (reclaimed != 0) 3306 ret = 0; 3307 goto done; 3308 } 3309 } 3310 done: 3311 if (m_runs != _m_runs) 3312 free(m_runs, M_TEMP); 3313 return (ret); 3314 } 3315 3316 int 3317 vm_page_reclaim_contig_domain(int domain, int req, u_long npages, 3318 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 3319 { 3320 return (vm_page_reclaim_contig_domain_ext(domain, req, npages, low, 3321 high, alignment, boundary, 1)); 3322 } 3323 3324 int 3325 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, 3326 u_long alignment, vm_paddr_t boundary) 3327 { 3328 struct vm_domainset_iter di; 3329 int domain, ret, status; 3330 3331 ret = ERANGE; 3332 3333 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL); 3334 do { 3335 status = vm_page_reclaim_contig_domain(domain, req, npages, low, 3336 high, alignment, boundary); 3337 if (status == 0) 3338 return (0); 3339 else if (status == ERANGE) 3340 vm_domainset_iter_ignore(&di, domain); 3341 else { 3342 KASSERT(status == ENOMEM, ("Unrecognized error %d " 3343 "from vm_page_reclaim_contig_domain()", status)); 3344 ret = ENOMEM; 3345 } 3346 } while (vm_domainset_iter_page(&di, NULL, &domain, NULL) == 0); 3347 3348 return (ret); 3349 } 3350 3351 /* 3352 * Set the domain in the appropriate page level domainset. 3353 */ 3354 void 3355 vm_domain_set(struct vm_domain *vmd) 3356 { 3357 3358 mtx_lock(&vm_domainset_lock); 3359 if (!vmd->vmd_minset && vm_paging_min(vmd)) { 3360 vmd->vmd_minset = 1; 3361 DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); 3362 } 3363 if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { 3364 vmd->vmd_severeset = 1; 3365 DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); 3366 } 3367 mtx_unlock(&vm_domainset_lock); 3368 } 3369 3370 /* 3371 * Clear the domain from the appropriate page level domainset. 3372 */ 3373 void 3374 vm_domain_clear(struct vm_domain *vmd) 3375 { 3376 3377 mtx_lock(&vm_domainset_lock); 3378 if (vmd->vmd_minset && !vm_paging_min(vmd)) { 3379 vmd->vmd_minset = 0; 3380 DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); 3381 if (vm_min_waiters != 0) { 3382 vm_min_waiters = 0; 3383 wakeup(&vm_min_domains); 3384 } 3385 } 3386 if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { 3387 vmd->vmd_severeset = 0; 3388 DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); 3389 if (vm_severe_waiters != 0) { 3390 vm_severe_waiters = 0; 3391 wakeup(&vm_severe_domains); 3392 } 3393 } 3394 3395 /* 3396 * If pageout daemon needs pages, then tell it that there are 3397 * some free. 3398 */ 3399 if (vmd->vmd_pageout_pages_needed && 3400 vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { 3401 wakeup(&vmd->vmd_pageout_pages_needed); 3402 vmd->vmd_pageout_pages_needed = 0; 3403 } 3404 3405 /* See comments in vm_wait_doms(). */ 3406 if (vm_pageproc_waiters) { 3407 vm_pageproc_waiters = 0; 3408 wakeup(&vm_pageproc_waiters); 3409 } 3410 mtx_unlock(&vm_domainset_lock); 3411 } 3412 3413 /* 3414 * Wait for free pages to exceed the min threshold globally. 3415 */ 3416 void 3417 vm_wait_min(void) 3418 { 3419 3420 mtx_lock(&vm_domainset_lock); 3421 while (vm_page_count_min()) { 3422 vm_min_waiters++; 3423 msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); 3424 } 3425 mtx_unlock(&vm_domainset_lock); 3426 } 3427 3428 /* 3429 * Wait for free pages to exceed the severe threshold globally. 
*/ 3431 void 3432 vm_wait_severe(void) 3433 { 3434 3435 mtx_lock(&vm_domainset_lock); 3436 while (vm_page_count_severe()) { 3437 vm_severe_waiters++; 3438 msleep(&vm_severe_domains, &vm_domainset_lock, PVM, 3439 "vmwait", 0); 3440 } 3441 mtx_unlock(&vm_domainset_lock); 3442 } 3443 3444 u_int 3445 vm_wait_count(void) 3446 { 3447 3448 return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); 3449 } 3450 3451 int 3452 vm_wait_doms(const domainset_t *wdoms, int mflags) 3453 { 3454 int error; 3455 3456 error = 0; 3457 3458 /* 3459 * We use racy wakeup synchronization to avoid expensive global 3460 * locking for the pageproc when sleeping with a non-specific vm_wait. 3461 * To handle this, we only sleep for one tick in this instance. It 3462 * is expected that most allocations for the pageproc will come from 3463 * kmem or vm_page_grab* which will use the more specific and 3464 * race-free vm_wait_domain(). 3465 */ 3466 if (curproc == pageproc) { 3467 mtx_lock(&vm_domainset_lock); 3468 vm_pageproc_waiters++; 3469 error = msleep(&vm_pageproc_waiters, &vm_domainset_lock, 3470 PVM | PDROP | mflags, "pageprocwait", 1); 3471 } else { 3472 /* 3473 * XXX Ideally we would wait only until the allocation could 3474 * be satisfied. This condition can cause new allocators to 3475 * consume all freed pages while old allocators wait. 3476 */ 3477 mtx_lock(&vm_domainset_lock); 3478 if (vm_page_count_min_set(wdoms)) { 3479 if (pageproc == NULL) 3480 panic("vm_wait in early boot"); 3481 vm_min_waiters++; 3482 error = msleep(&vm_min_domains, &vm_domainset_lock, 3483 PVM | PDROP | mflags, "vmwait", 0); 3484 } else 3485 mtx_unlock(&vm_domainset_lock); 3486 } 3487 return (error); 3488 } 3489 3490 /* 3491 * vm_wait_domain: 3492 * 3493 * Sleep until free pages are available for allocation. 3494 * - Called in various places after failed memory allocations. 3495 */ 3496 void 3497 vm_wait_domain(int domain) 3498 { 3499 struct vm_domain *vmd; 3500 domainset_t wdom; 3501 3502 vmd = VM_DOMAIN(domain); 3503 vm_domain_free_assert_unlocked(vmd); 3504 3505 if (curproc == pageproc) { 3506 mtx_lock(&vm_domainset_lock); 3507 if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { 3508 vmd->vmd_pageout_pages_needed = 1; 3509 msleep(&vmd->vmd_pageout_pages_needed, 3510 &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); 3511 } else 3512 mtx_unlock(&vm_domainset_lock); 3513 } else { 3514 DOMAINSET_ZERO(&wdom); 3515 DOMAINSET_SET(vmd->vmd_domain, &wdom); 3516 vm_wait_doms(&wdom, 0); 3517 } 3518 } 3519 3520 static int 3521 vm_wait_flags(vm_object_t obj, int mflags) 3522 { 3523 struct domainset *d; 3524 3525 d = NULL; 3526 3527 /* 3528 * Carefully fetch pointers only once: the struct domainset 3529 * itself is immutable but the pointer might change. 3530 */ 3531 if (obj != NULL) 3532 d = obj->domain.dr_policy; 3533 if (d == NULL) 3534 d = curthread->td_domain.dr_policy; 3535 3536 return (vm_wait_doms(&d->ds_mask, mflags)); 3537 } 3538 3539 /* 3540 * vm_wait: 3541 * 3542 * Sleep until free pages are available for allocation in the 3543 * affinity domains of the obj. If obj is NULL, the domain set 3544 * for the calling thread is used. 3545 * Called in various places after failed memory allocations. 3546 */ 3547 void 3548 vm_wait(vm_object_t obj) 3549 { 3550 (void)vm_wait_flags(obj, 0); 3551 } 3552 3553 int 3554 vm_wait_intr(vm_object_t obj) 3555 { 3556 return (vm_wait_flags(obj, PCATCH)); 3557 } 3558 3559 /* 3560 * vm_domain_alloc_fail: 3561 * 3562 * Called when a page allocation function fails.
Informs the 3563 * pagedaemon and performs the requested wait. The domain free lock 3564 * must not be held on entry. The object lock, if any, must be held 3565 * on entry; it may be dropped and reacquired around the wait and is 3566 * held again on return. Returns an error when a retry is necessary. 3567 * 3568 */ 3569 static int 3570 vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) 3571 { 3572 3573 vm_domain_free_assert_unlocked(vmd); 3574 3575 atomic_add_int(&vmd->vmd_pageout_deficit, 3576 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); 3577 if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { 3578 if (object != NULL) 3579 VM_OBJECT_WUNLOCK(object); 3580 vm_wait_domain(vmd->vmd_domain); 3581 if (object != NULL) 3582 VM_OBJECT_WLOCK(object); 3583 if (req & VM_ALLOC_WAITOK) 3584 return (EAGAIN); 3585 } 3586 3587 return (0); 3588 } 3589 3590 /* 3591 * vm_waitpfault: 3592 * 3593 * Sleep until free pages are available for allocation. 3594 * - Called only in vm_fault so that processes page faulting 3595 * can be easily tracked. 3596 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing 3597 * processes will be able to grab memory first. Do not change 3598 * this balance without careful testing first. 3599 */ 3600 void 3601 vm_waitpfault(struct domainset *dset, int timo) 3602 { 3603 3604 /* 3605 * XXX Ideally we would wait only until the allocation could 3606 * be satisfied. This condition can cause new allocators to 3607 * consume all freed pages while old allocators wait. 3608 */ 3609 mtx_lock(&vm_domainset_lock); 3610 if (vm_page_count_min_set(&dset->ds_mask)) { 3611 vm_min_waiters++; 3612 msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, 3613 "pfault", timo); 3614 } else 3615 mtx_unlock(&vm_domainset_lock); 3616 } 3617 3618 static struct vm_pagequeue * 3619 _vm_page_pagequeue(vm_page_t m, uint8_t queue) 3620 { 3621 3622 return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); 3623 } 3624 3625 #ifdef INVARIANTS 3626 static struct vm_pagequeue * 3627 vm_page_pagequeue(vm_page_t m) 3628 { 3629 3630 return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue)); 3631 } 3632 #endif 3633 3634 static __always_inline bool 3635 vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old, 3636 vm_page_astate_t new) 3637 { 3638 vm_page_astate_t tmp; 3639 3640 tmp = *old; 3641 do { 3642 if (__predict_true(vm_page_astate_fcmpset(m, old, new))) 3643 return (true); 3644 counter_u64_add(pqstate_commit_retries, 1); 3645 } while (old->_bits == tmp._bits); 3646 3647 return (false); 3648 } 3649 3650 /* 3651 * Do the work of committing a queue state update that moves the page out of 3652 * its current queue. 3653 */ 3654 static bool 3655 _vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m, 3656 vm_page_astate_t *old, vm_page_astate_t new) 3657 { 3658 vm_page_t next; 3659 3660 vm_pagequeue_assert_locked(pq); 3661 KASSERT(vm_page_pagequeue(m) == pq, 3662 ("%s: queue %p does not match page %p", __func__, pq, m)); 3663 KASSERT(old->queue != PQ_NONE && new.queue != old->queue, 3664 ("%s: invalid queue indices %d %d", 3665 __func__, old->queue, new.queue)); 3666 3667 /* 3668 * Once the queue index of the page changes there is nothing 3669 * synchronizing with further updates to the page's physical 3670 * queue state. Therefore we must speculatively remove the page 3671 * from the queue now and be prepared to roll back if the queue 3672 * state update fails. If the page is not physically enqueued then 3673 * we just update its queue index.
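 * The rollback reinserts the page in front of its former successor (or
 * at the tail if it was last) so that the queue's ordering is preserved.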
3674 */ 3675 if ((old->flags & PGA_ENQUEUED) != 0) { 3676 new.flags &= ~PGA_ENQUEUED; 3677 next = TAILQ_NEXT(m, plinks.q); 3678 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3679 vm_pagequeue_cnt_dec(pq); 3680 if (!vm_page_pqstate_fcmpset(m, old, new)) { 3681 if (next == NULL) 3682 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3683 else 3684 TAILQ_INSERT_BEFORE(next, m, plinks.q); 3685 vm_pagequeue_cnt_inc(pq); 3686 return (false); 3687 } else { 3688 return (true); 3689 } 3690 } else { 3691 return (vm_page_pqstate_fcmpset(m, old, new)); 3692 } 3693 } 3694 3695 static bool 3696 vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old, 3697 vm_page_astate_t new) 3698 { 3699 struct vm_pagequeue *pq; 3700 vm_page_astate_t as; 3701 bool ret; 3702 3703 pq = _vm_page_pagequeue(m, old->queue); 3704 3705 /* 3706 * The queue field and PGA_ENQUEUED flag are stable only so long as the 3707 * corresponding page queue lock is held. 3708 */ 3709 vm_pagequeue_lock(pq); 3710 as = vm_page_astate_load(m); 3711 if (__predict_false(as._bits != old->_bits)) { 3712 *old = as; 3713 ret = false; 3714 } else { 3715 ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new); 3716 } 3717 vm_pagequeue_unlock(pq); 3718 return (ret); 3719 } 3720 3721 /* 3722 * Commit a queue state update that enqueues or requeues a page. 3723 */ 3724 static bool 3725 _vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m, 3726 vm_page_astate_t *old, vm_page_astate_t new) 3727 { 3728 struct vm_domain *vmd; 3729 3730 vm_pagequeue_assert_locked(pq); 3731 KASSERT(old->queue != PQ_NONE && new.queue == old->queue, 3732 ("%s: invalid queue indices %d %d", 3733 __func__, old->queue, new.queue)); 3734 3735 new.flags |= PGA_ENQUEUED; 3736 if (!vm_page_pqstate_fcmpset(m, old, new)) 3737 return (false); 3738 3739 if ((old->flags & PGA_ENQUEUED) != 0) 3740 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3741 else 3742 vm_pagequeue_cnt_inc(pq); 3743 3744 /* 3745 * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In particular, if 3746 * both flags are set in close succession, only PGA_REQUEUE_HEAD will be 3747 * applied, even if it was set first. 3748 */ 3749 if ((old->flags & PGA_REQUEUE_HEAD) != 0) { 3750 vmd = vm_pagequeue_domain(m); 3751 KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE], 3752 ("%s: invalid page queue for page %p", __func__, m)); 3753 TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); 3754 } else { 3755 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3756 } 3757 return (true); 3758 } 3759 3760 /* 3761 * Commit a queue state update that encodes a request for a deferred queue 3762 * operation. 3763 */ 3764 static bool 3765 vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old, 3766 vm_page_astate_t new) 3767 { 3768 3769 KASSERT(old->queue == new.queue || new.queue != PQ_NONE, 3770 ("%s: invalid state, queue %d flags %x", 3771 __func__, new.queue, new.flags)); 3772 3773 if (old->_bits != new._bits && 3774 !vm_page_pqstate_fcmpset(m, old, new)) 3775 return (false); 3776 vm_page_pqbatch_submit(m, new.queue); 3777 return (true); 3778 } 3779 3780 /* 3781 * A generic queue state update function. This handles more cases than the 3782 * specialized functions above. 
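 *
 * Moving a page out of a queue goes through the locked dequeue path and,
 * if the page is bound for another queue, a batched submission to that
 * queue. Other updates are applied with a simple compare-and-set, followed
 * by a batched submission only when they set new queue operation flags.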
3783 */ 3784 bool 3785 vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) 3786 { 3787 3788 if (old->_bits == new._bits) 3789 return (true); 3790 3791 if (old->queue != PQ_NONE && new.queue != old->queue) { 3792 if (!vm_page_pqstate_commit_dequeue(m, old, new)) 3793 return (false); 3794 if (new.queue != PQ_NONE) 3795 vm_page_pqbatch_submit(m, new.queue); 3796 } else { 3797 if (!vm_page_pqstate_fcmpset(m, old, new)) 3798 return (false); 3799 if (new.queue != PQ_NONE && 3800 ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0) 3801 vm_page_pqbatch_submit(m, new.queue); 3802 } 3803 return (true); 3804 } 3805 3806 /* 3807 * Apply deferred queue state updates to a page. 3808 */ 3809 static inline void 3810 vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) 3811 { 3812 vm_page_astate_t new, old; 3813 3814 CRITICAL_ASSERT(curthread); 3815 vm_pagequeue_assert_locked(pq); 3816 KASSERT(queue < PQ_COUNT, 3817 ("%s: invalid queue index %d", __func__, queue)); 3818 KASSERT(pq == _vm_page_pagequeue(m, queue), 3819 ("%s: page %p does not belong to queue %p", __func__, m, pq)); 3820 3821 for (old = vm_page_astate_load(m);;) { 3822 if (__predict_false(old.queue != queue || 3823 (old.flags & PGA_QUEUE_OP_MASK) == 0)) { 3824 counter_u64_add(queue_nops, 1); 3825 break; 3826 } 3827 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3828 ("%s: page %p is unmanaged", __func__, m)); 3829 3830 new = old; 3831 if ((old.flags & PGA_DEQUEUE) != 0) { 3832 new.flags &= ~PGA_QUEUE_OP_MASK; 3833 new.queue = PQ_NONE; 3834 if (__predict_true(_vm_page_pqstate_commit_dequeue(pq, 3835 m, &old, new))) { 3836 counter_u64_add(queue_ops, 1); 3837 break; 3838 } 3839 } else { 3840 new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); 3841 if (__predict_true(_vm_page_pqstate_commit_requeue(pq, 3842 m, &old, new))) { 3843 counter_u64_add(queue_ops, 1); 3844 break; 3845 } 3846 } 3847 } 3848 } 3849 3850 static void 3851 vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, 3852 uint8_t queue) 3853 { 3854 int i; 3855 3856 for (i = 0; i < bq->bq_cnt; i++) 3857 vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); 3858 vm_batchqueue_init(bq); 3859 } 3860 3861 /* 3862 * vm_page_pqbatch_submit: [ internal use only ] 3863 * 3864 * Enqueue a page in the specified page queue's batched work queue. 3865 * The caller must have encoded the requested operation in the page 3866 * structure's a.flags field. 
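 *
 * The page is placed in a per-CPU batch queue. While more than half of
 * the batch's slots remain free nothing further happens; past that point
 * the page queue lock is merely tried to drain the batch opportunistically,
 * and if the batch was already full, the lock is taken unconditionally,
 * the batch is drained, and this page is processed directly.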
3867 */ 3868 void 3869 vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) 3870 { 3871 struct vm_batchqueue *bq; 3872 struct vm_pagequeue *pq; 3873 int domain, slots_remaining; 3874 3875 KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); 3876 3877 domain = vm_page_domain(m); 3878 critical_enter(); 3879 bq = DPCPU_PTR(pqbatch[domain][queue]); 3880 slots_remaining = vm_batchqueue_insert(bq, m); 3881 if (slots_remaining > (VM_BATCHQUEUE_SIZE >> 1)) { 3882 /* keep building the bq */ 3883 critical_exit(); 3884 return; 3885 } else if (slots_remaining > 0 ) { 3886 /* Try to process the bq if we can get the lock */ 3887 pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue]; 3888 if (vm_pagequeue_trylock(pq)) { 3889 vm_pqbatch_process(pq, bq, queue); 3890 vm_pagequeue_unlock(pq); 3891 } 3892 critical_exit(); 3893 return; 3894 } 3895 critical_exit(); 3896 3897 /* if we make it here, the bq is full so wait for the lock */ 3898 3899 pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue]; 3900 vm_pagequeue_lock(pq); 3901 critical_enter(); 3902 bq = DPCPU_PTR(pqbatch[domain][queue]); 3903 vm_pqbatch_process(pq, bq, queue); 3904 vm_pqbatch_process_page(pq, m, queue); 3905 vm_pagequeue_unlock(pq); 3906 critical_exit(); 3907 } 3908 3909 /* 3910 * vm_page_pqbatch_drain: [ internal use only ] 3911 * 3912 * Force all per-CPU page queue batch queues to be drained. This is 3913 * intended for use in severe memory shortages, to ensure that pages 3914 * do not remain stuck in the batch queues. 3915 */ 3916 void 3917 vm_page_pqbatch_drain(void) 3918 { 3919 struct thread *td; 3920 struct vm_domain *vmd; 3921 struct vm_pagequeue *pq; 3922 int cpu, domain, queue; 3923 3924 td = curthread; 3925 CPU_FOREACH(cpu) { 3926 thread_lock(td); 3927 sched_bind(td, cpu); 3928 thread_unlock(td); 3929 3930 for (domain = 0; domain < vm_ndomains; domain++) { 3931 vmd = VM_DOMAIN(domain); 3932 for (queue = 0; queue < PQ_COUNT; queue++) { 3933 pq = &vmd->vmd_pagequeues[queue]; 3934 vm_pagequeue_lock(pq); 3935 critical_enter(); 3936 vm_pqbatch_process(pq, 3937 DPCPU_PTR(pqbatch[domain][queue]), queue); 3938 critical_exit(); 3939 vm_pagequeue_unlock(pq); 3940 } 3941 } 3942 } 3943 thread_lock(td); 3944 sched_unbind(td); 3945 thread_unlock(td); 3946 } 3947 3948 /* 3949 * vm_page_dequeue_deferred: [ internal use only ] 3950 * 3951 * Request removal of the given page from its current page 3952 * queue. Physical removal from the queue may be deferred 3953 * indefinitely. 3954 */ 3955 void 3956 vm_page_dequeue_deferred(vm_page_t m) 3957 { 3958 vm_page_astate_t new, old; 3959 3960 old = vm_page_astate_load(m); 3961 do { 3962 if (old.queue == PQ_NONE) { 3963 KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, 3964 ("%s: page %p has unexpected queue state", 3965 __func__, m)); 3966 break; 3967 } 3968 new = old; 3969 new.flags |= PGA_DEQUEUE; 3970 } while (!vm_page_pqstate_commit_request(m, &old, new)); 3971 } 3972 3973 /* 3974 * vm_page_dequeue: 3975 * 3976 * Remove the page from whichever page queue it's in, if any, before 3977 * returning. 
3978 */ 3979 void 3980 vm_page_dequeue(vm_page_t m) 3981 { 3982 vm_page_astate_t new, old; 3983 3984 old = vm_page_astate_load(m); 3985 do { 3986 if (__predict_true(old.queue == PQ_NONE)) { 3987 KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, 3988 ("%s: page %p has unexpected queue state", 3989 __func__, m)); 3990 break; 3991 } 3992 new = old; 3993 new.flags &= ~PGA_QUEUE_OP_MASK; 3994 new.queue = PQ_NONE; 3995 } while (!vm_page_pqstate_commit_dequeue(m, &old, new)); 3996 3997 } 3998 3999 /* 4000 * Schedule the given page for insertion into the specified page queue. 4001 * Physical insertion of the page may be deferred indefinitely. 4002 */ 4003 static void 4004 vm_page_enqueue(vm_page_t m, uint8_t queue) 4005 { 4006 4007 KASSERT(m->a.queue == PQ_NONE && 4008 (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, 4009 ("%s: page %p is already enqueued", __func__, m)); 4010 KASSERT(m->ref_count > 0, 4011 ("%s: page %p does not carry any references", __func__, m)); 4012 4013 m->a.queue = queue; 4014 if ((m->a.flags & PGA_REQUEUE) == 0) 4015 vm_page_aflag_set(m, PGA_REQUEUE); 4016 vm_page_pqbatch_submit(m, queue); 4017 } 4018 4019 /* 4020 * vm_page_free_prep: 4021 * 4022 * Prepares the given page to be put on the free list, 4023 * disassociating it from any VM object. The caller may return 4024 * the page to the free list only if this function returns true. 4025 * 4026 * The object, if it exists, must be locked, and then the page must 4027 * be xbusy. Otherwise the page must be not busied. A managed 4028 * page must be unmapped. 4029 */ 4030 static bool 4031 vm_page_free_prep(vm_page_t m) 4032 { 4033 4034 /* 4035 * Synchronize with threads that have dropped a reference to this 4036 * page. 4037 */ 4038 atomic_thread_fence_acq(); 4039 4040 #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) 4041 if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { 4042 uint64_t *p; 4043 int i; 4044 p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4045 for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) 4046 KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", 4047 m, i, (uintmax_t)*p)); 4048 } 4049 #endif 4050 if ((m->oflags & VPO_UNMANAGED) == 0) { 4051 KASSERT(!pmap_page_is_mapped(m), 4052 ("vm_page_free_prep: freeing mapped page %p", m)); 4053 KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, 4054 ("vm_page_free_prep: mapping flags set in page %p", m)); 4055 } else { 4056 KASSERT(m->a.queue == PQ_NONE, 4057 ("vm_page_free_prep: unmanaged page %p is queued", m)); 4058 } 4059 VM_CNT_INC(v_tfree); 4060 4061 if (m->object != NULL) { 4062 vm_page_radix_remove(m); 4063 vm_page_free_object_prep(m); 4064 } else 4065 vm_page_assert_unbusied(m); 4066 4067 vm_page_busy_free(m); 4068 4069 /* 4070 * If fictitious remove object association and 4071 * return. 4072 */ 4073 if ((m->flags & PG_FICTITIOUS) != 0) { 4074 KASSERT(m->ref_count == 1, 4075 ("fictitious page %p is referenced", m)); 4076 KASSERT(m->a.queue == PQ_NONE, 4077 ("fictitious page %p is queued", m)); 4078 return (false); 4079 } 4080 4081 /* 4082 * Pages need not be dequeued before they are returned to the physical 4083 * memory allocator, but they must at least be marked for a deferred 4084 * dequeue. 4085 */ 4086 if ((m->oflags & VPO_UNMANAGED) == 0) 4087 vm_page_dequeue_deferred(m); 4088 4089 m->valid = 0; 4090 vm_page_undirty(m); 4091 4092 if (m->ref_count != 0) 4093 panic("vm_page_free_prep: page %p has references", m); 4094 4095 /* 4096 * Restore the default memory attribute to the page. 
4097 */ 4098 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 4099 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 4100 4101 #if VM_NRESERVLEVEL > 0 4102 /* 4103 * Determine whether the page belongs to a reservation. If the page was 4104 * allocated from a per-CPU cache, it cannot belong to a reservation, so 4105 * as an optimization, we avoid the check in that case. 4106 */ 4107 if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m)) 4108 return (false); 4109 #endif 4110 4111 return (true); 4112 } 4113 4114 /* 4115 * vm_page_free_toq: 4116 * 4117 * Returns the given page to the free list, disassociating it 4118 * from any VM object. 4119 * 4120 * The object must be locked. The page must be exclusively busied if it 4121 * belongs to an object. 4122 */ 4123 static void 4124 vm_page_free_toq(vm_page_t m) 4125 { 4126 struct vm_domain *vmd; 4127 uma_zone_t zone; 4128 4129 if (!vm_page_free_prep(m)) 4130 return; 4131 4132 vmd = vm_pagequeue_domain(m); 4133 if (__predict_false((m->flags & PG_NOFREE) != 0)) { 4134 vm_page_free_nofree(vmd, m); 4135 return; 4136 } 4137 zone = vmd->vmd_pgcache[m->pool].zone; 4138 if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) { 4139 uma_zfree(zone, m); 4140 return; 4141 } 4142 vm_domain_free_lock(vmd); 4143 vm_phys_free_pages(m, m->pool, 0); 4144 vm_domain_free_unlock(vmd); 4145 vm_domain_freecnt_inc(vmd, 1); 4146 } 4147 4148 /* 4149 * vm_page_free_pages_toq: 4150 * 4151 * Returns a list of pages to the free list, disassociating it 4152 * from any VM object. In other words, this is equivalent to 4153 * calling vm_page_free_toq() for each page of a list of VM objects. 4154 */ 4155 int 4156 vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) 4157 { 4158 vm_page_t m; 4159 int count; 4160 4161 if (SLIST_EMPTY(free)) 4162 return (0); 4163 4164 count = 0; 4165 while ((m = SLIST_FIRST(free)) != NULL) { 4166 count++; 4167 SLIST_REMOVE_HEAD(free, plinks.s.ss); 4168 vm_page_free_toq(m); 4169 } 4170 4171 if (update_wire_count) 4172 vm_wire_sub(count); 4173 return (count); 4174 } 4175 4176 /* 4177 * Mark this page as wired down. For managed pages, this prevents reclamation 4178 * by the page daemon, or when the containing object, if any, is destroyed. 4179 */ 4180 void 4181 vm_page_wire(vm_page_t m) 4182 { 4183 u_int old; 4184 4185 #ifdef INVARIANTS 4186 if (m->object != NULL && !vm_page_busied(m) && 4187 !vm_object_busied(m->object)) 4188 VM_OBJECT_ASSERT_LOCKED(m->object); 4189 #endif 4190 KASSERT((m->flags & PG_FICTITIOUS) == 0 || 4191 VPRC_WIRE_COUNT(m->ref_count) >= 1, 4192 ("vm_page_wire: fictitious page %p has zero wirings", m)); 4193 4194 old = atomic_fetchadd_int(&m->ref_count, 1); 4195 KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, 4196 ("vm_page_wire: counter overflow for page %p", m)); 4197 if (VPRC_WIRE_COUNT(old) == 0) { 4198 if ((m->oflags & VPO_UNMANAGED) == 0) 4199 vm_page_aflag_set(m, PGA_DEQUEUE); 4200 vm_wire_add(1); 4201 } 4202 } 4203 4204 /* 4205 * Attempt to wire a mapped page following a pmap lookup of that page. 4206 * This may fail if a thread is concurrently tearing down mappings of the page. 4207 * The transient failure is acceptable because it translates to the 4208 * failure of the caller pmap_extract_and_hold(), which should be then 4209 * followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages(). 
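 *
 *	An illustrative sketch of that pattern (not copied from any
 *	particular pmap), assuming a pmap lookup produced the physical
 *	address pa:
 *
 *		m = PHYS_TO_VM_PAGE(pa);
 *		if (!vm_page_wire_mapped(m))
 *			m = NULL;
 *
 *	A NULL result then makes the caller fail and fall back to vm_fault().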
4210 */ 4211 bool 4212 vm_page_wire_mapped(vm_page_t m) 4213 { 4214 u_int old; 4215 4216 old = atomic_load_int(&m->ref_count); 4217 do { 4218 KASSERT(old > 0, 4219 ("vm_page_wire_mapped: wiring unreferenced page %p", m)); 4220 if ((old & VPRC_BLOCKED) != 0) 4221 return (false); 4222 } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); 4223 4224 if (VPRC_WIRE_COUNT(old) == 0) { 4225 if ((m->oflags & VPO_UNMANAGED) == 0) 4226 vm_page_aflag_set(m, PGA_DEQUEUE); 4227 vm_wire_add(1); 4228 } 4229 return (true); 4230 } 4231 4232 /* 4233 * Release a wiring reference to a managed page. If the page still belongs to 4234 * an object, update its position in the page queues to reflect the reference. 4235 * If the wiring was the last reference to the page, free the page. 4236 */ 4237 static void 4238 vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse) 4239 { 4240 u_int old; 4241 4242 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4243 ("%s: page %p is unmanaged", __func__, m)); 4244 4245 /* 4246 * Update LRU state before releasing the wiring reference. 4247 * Use a release store when updating the reference count to 4248 * synchronize with vm_page_free_prep(). 4249 */ 4250 old = atomic_load_int(&m->ref_count); 4251 do { 4252 u_int count; 4253 4254 KASSERT(VPRC_WIRE_COUNT(old) > 0, 4255 ("vm_page_unwire: wire count underflow for page %p", m)); 4256 4257 count = old & ~VPRC_BLOCKED; 4258 if (count > VPRC_OBJREF + 1) { 4259 /* 4260 * The page has at least one other wiring reference. An 4261 * earlier iteration of this loop may have called 4262 * vm_page_release_toq() and cleared PGA_DEQUEUE, so 4263 * re-set it if necessary. 4264 */ 4265 if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0) 4266 vm_page_aflag_set(m, PGA_DEQUEUE); 4267 } else if (count == VPRC_OBJREF + 1) { 4268 /* 4269 * This is the last wiring. Clear PGA_DEQUEUE and 4270 * update the page's queue state to reflect the 4271 * reference. If the page does not belong to an object 4272 * (i.e., the VPRC_OBJREF bit is clear), we only need to 4273 * clear leftover queue state. 4274 */ 4275 vm_page_release_toq(m, nqueue, noreuse); 4276 } else if (count == 1) { 4277 vm_page_aflag_clear(m, PGA_DEQUEUE); 4278 } 4279 } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); 4280 4281 if (VPRC_WIRE_COUNT(old) == 1) { 4282 vm_wire_sub(1); 4283 if (old == 1) 4284 vm_page_free(m); 4285 } 4286 } 4287 4288 /* 4289 * Release one wiring of the specified page, potentially allowing it to be 4290 * paged out. 4291 * 4292 * Only managed pages belonging to an object can be paged out. If the number 4293 * of wirings transitions to zero and the page is eligible for page out, then 4294 * the page is added to the specified paging queue. If the released wiring 4295 * represented the last reference to the page, the page is freed. 4296 */ 4297 void 4298 vm_page_unwire(vm_page_t m, uint8_t nqueue) 4299 { 4300 4301 KASSERT(nqueue < PQ_COUNT, 4302 ("vm_page_unwire: invalid queue %u request for page %p", 4303 nqueue, m)); 4304 4305 if ((m->oflags & VPO_UNMANAGED) != 0) { 4306 if (vm_page_unwire_noq(m) && m->ref_count == 0) 4307 vm_page_free(m); 4308 return; 4309 } 4310 vm_page_unwire_managed(m, nqueue, false); 4311 } 4312 4313 /* 4314 * Unwire a page without (re-)inserting it into a page queue. It is up 4315 * to the caller to enqueue, requeue, or free the page as appropriate. 4316 * In most cases involving managed pages, vm_page_unwire() should be used 4317 * instead. 
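 *
 *	For example, a caller that may hold the last reference can free the
 *	page once the final wiring is dropped, just as vm_page_unwire() does
 *	for unmanaged pages:
 *
 *		if (vm_page_unwire_noq(m) && m->ref_count == 0)
 *			vm_page_free(m);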
4318 */ 4319 bool 4320 vm_page_unwire_noq(vm_page_t m) 4321 { 4322 u_int old; 4323 4324 old = vm_page_drop(m, 1); 4325 KASSERT(VPRC_WIRE_COUNT(old) != 0, 4326 ("%s: counter underflow for page %p", __func__, m)); 4327 KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1, 4328 ("%s: missing ref on fictitious page %p", __func__, m)); 4329 4330 if (VPRC_WIRE_COUNT(old) > 1) 4331 return (false); 4332 if ((m->oflags & VPO_UNMANAGED) == 0) 4333 vm_page_aflag_clear(m, PGA_DEQUEUE); 4334 vm_wire_sub(1); 4335 return (true); 4336 } 4337 4338 /* 4339 * Ensure that the page ends up in the specified page queue. If the page is 4340 * active or being moved to the active queue, ensure that its act_count is 4341 * at least ACT_INIT but do not otherwise mess with it. 4342 */ 4343 static __always_inline void 4344 vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) 4345 { 4346 vm_page_astate_t old, new; 4347 4348 KASSERT(m->ref_count > 0, 4349 ("%s: page %p does not carry any references", __func__, m)); 4350 KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, 4351 ("%s: invalid flags %x", __func__, nflag)); 4352 4353 if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) 4354 return; 4355 4356 old = vm_page_astate_load(m); 4357 do { 4358 if ((old.flags & PGA_DEQUEUE) != 0) 4359 break; 4360 new = old; 4361 new.flags &= ~PGA_QUEUE_OP_MASK; 4362 if (nqueue == PQ_ACTIVE) 4363 new.act_count = max(old.act_count, ACT_INIT); 4364 if (old.queue == nqueue) { 4365 /* 4366 * There is no need to requeue pages already in the 4367 * active queue. 4368 */ 4369 if (nqueue != PQ_ACTIVE || 4370 (old.flags & PGA_ENQUEUED) == 0) 4371 new.flags |= nflag; 4372 } else { 4373 new.flags |= nflag; 4374 new.queue = nqueue; 4375 } 4376 } while (!vm_page_pqstate_commit(m, &old, new)); 4377 } 4378 4379 /* 4380 * Put the specified page on the active list (if appropriate). 4381 */ 4382 void 4383 vm_page_activate(vm_page_t m) 4384 { 4385 4386 vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); 4387 } 4388 4389 /* 4390 * Move the specified page to the tail of the inactive queue, or requeue 4391 * the page if it is already in the inactive queue. 4392 */ 4393 void 4394 vm_page_deactivate(vm_page_t m) 4395 { 4396 4397 vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); 4398 } 4399 4400 void 4401 vm_page_deactivate_noreuse(vm_page_t m) 4402 { 4403 4404 vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); 4405 } 4406 4407 /* 4408 * Put a page in the laundry, or requeue it if it is already there. 4409 */ 4410 void 4411 vm_page_launder(vm_page_t m) 4412 { 4413 4414 vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); 4415 } 4416 4417 /* 4418 * Put a page in the PQ_UNSWAPPABLE holding queue. 4419 */ 4420 void 4421 vm_page_unswappable(vm_page_t m) 4422 { 4423 4424 VM_OBJECT_ASSERT_LOCKED(m->object); 4425 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4426 ("page %p already unswappable", m)); 4427 4428 vm_page_dequeue(m); 4429 vm_page_enqueue(m, PQ_UNSWAPPABLE); 4430 } 4431 4432 /* 4433 * Release a page back to the page queues in preparation for unwiring. 4434 */ 4435 static void 4436 vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse) 4437 { 4438 vm_page_astate_t old, new; 4439 uint16_t nflag; 4440 4441 /* 4442 * Use a check of the valid bits to determine whether we should 4443 * accelerate reclamation of the page. The object lock might not be 4444 * held here, in which case the check is racy. 
At worst we will either 4445 * accelerate reclamation of a valid page and violate LRU, or 4446 * unnecessarily defer reclamation of an invalid page. 4447 * 4448 * If we were asked to not cache the page, place it near the head of the 4449 * inactive queue so that is reclaimed sooner. 4450 */ 4451 if (noreuse || vm_page_none_valid(m)) { 4452 nqueue = PQ_INACTIVE; 4453 nflag = PGA_REQUEUE_HEAD; 4454 } else { 4455 nflag = PGA_REQUEUE; 4456 } 4457 4458 old = vm_page_astate_load(m); 4459 do { 4460 new = old; 4461 4462 /* 4463 * If the page is already in the active queue and we are not 4464 * trying to accelerate reclamation, simply mark it as 4465 * referenced and avoid any queue operations. 4466 */ 4467 new.flags &= ~PGA_QUEUE_OP_MASK; 4468 if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE && 4469 (old.flags & PGA_ENQUEUED) != 0) 4470 new.flags |= PGA_REFERENCED; 4471 else { 4472 new.flags |= nflag; 4473 new.queue = nqueue; 4474 } 4475 } while (!vm_page_pqstate_commit(m, &old, new)); 4476 } 4477 4478 /* 4479 * Unwire a page and either attempt to free it or re-add it to the page queues. 4480 */ 4481 void 4482 vm_page_release(vm_page_t m, int flags) 4483 { 4484 vm_object_t object; 4485 4486 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4487 ("vm_page_release: page %p is unmanaged", m)); 4488 4489 if ((flags & VPR_TRYFREE) != 0) { 4490 for (;;) { 4491 object = atomic_load_ptr(&m->object); 4492 if (object == NULL) 4493 break; 4494 /* Depends on type-stability. */ 4495 if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object)) 4496 break; 4497 if (object == m->object) { 4498 vm_page_release_locked(m, flags); 4499 VM_OBJECT_WUNLOCK(object); 4500 return; 4501 } 4502 VM_OBJECT_WUNLOCK(object); 4503 } 4504 } 4505 vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); 4506 } 4507 4508 /* See vm_page_release(). */ 4509 void 4510 vm_page_release_locked(vm_page_t m, int flags) 4511 { 4512 4513 VM_OBJECT_ASSERT_WLOCKED(m->object); 4514 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4515 ("vm_page_release_locked: page %p is unmanaged", m)); 4516 4517 if (vm_page_unwire_noq(m)) { 4518 if ((flags & VPR_TRYFREE) != 0 && 4519 (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) && 4520 m->dirty == 0 && vm_page_tryxbusy(m)) { 4521 /* 4522 * An unlocked lookup may have wired the page before the 4523 * busy lock was acquired, in which case the page must 4524 * not be freed. 4525 */ 4526 if (__predict_true(!vm_page_wired(m))) { 4527 vm_page_free(m); 4528 return; 4529 } 4530 vm_page_xunbusy(m); 4531 } else { 4532 vm_page_release_toq(m, PQ_INACTIVE, flags != 0); 4533 } 4534 } 4535 } 4536 4537 static bool 4538 vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t)) 4539 { 4540 u_int old; 4541 4542 KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0, 4543 ("vm_page_try_blocked_op: page %p has no object", m)); 4544 KASSERT(vm_page_busied(m), 4545 ("vm_page_try_blocked_op: page %p is not busy", m)); 4546 VM_OBJECT_ASSERT_LOCKED(m->object); 4547 4548 old = atomic_load_int(&m->ref_count); 4549 do { 4550 KASSERT(old != 0, 4551 ("vm_page_try_blocked_op: page %p has no references", m)); 4552 KASSERT((old & VPRC_BLOCKED) == 0, 4553 ("vm_page_try_blocked_op: page %p blocks wirings", m)); 4554 if (VPRC_WIRE_COUNT(old) != 0) 4555 return (false); 4556 } while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED)); 4557 4558 (op)(m); 4559 4560 /* 4561 * If the object is read-locked, new wirings may be created via an 4562 * object lookup. 
4563 */ 4564 old = vm_page_drop(m, VPRC_BLOCKED); 4565 KASSERT(!VM_OBJECT_WOWNED(m->object) || 4566 old == (VPRC_BLOCKED | VPRC_OBJREF), 4567 ("vm_page_try_blocked_op: unexpected refcount value %u for %p", 4568 old, m)); 4569 return (true); 4570 } 4571 4572 /* 4573 * Atomically check for wirings and remove all mappings of the page. 4574 */ 4575 bool 4576 vm_page_try_remove_all(vm_page_t m) 4577 { 4578 4579 return (vm_page_try_blocked_op(m, pmap_remove_all)); 4580 } 4581 4582 /* 4583 * Atomically check for wirings and remove all writeable mappings of the page. 4584 */ 4585 bool 4586 vm_page_try_remove_write(vm_page_t m) 4587 { 4588 4589 return (vm_page_try_blocked_op(m, pmap_remove_write)); 4590 } 4591 4592 /* 4593 * vm_page_advise 4594 * 4595 * Apply the specified advice to the given page. 4596 */ 4597 void 4598 vm_page_advise(vm_page_t m, int advice) 4599 { 4600 4601 VM_OBJECT_ASSERT_WLOCKED(m->object); 4602 vm_page_assert_xbusied(m); 4603 4604 if (advice == MADV_FREE) 4605 /* 4606 * Mark the page clean. This will allow the page to be freed 4607 * without first paging it out. MADV_FREE pages are often 4608 * quickly reused by malloc(3), so we do not do anything that 4609 * would result in a page fault on a later access. 4610 */ 4611 vm_page_undirty(m); 4612 else if (advice != MADV_DONTNEED) { 4613 if (advice == MADV_WILLNEED) 4614 vm_page_activate(m); 4615 return; 4616 } 4617 4618 if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) 4619 vm_page_dirty(m); 4620 4621 /* 4622 * Clear any references to the page. Otherwise, the page daemon will 4623 * immediately reactivate the page. 4624 */ 4625 vm_page_aflag_clear(m, PGA_REFERENCED); 4626 4627 /* 4628 * Place clean pages near the head of the inactive queue rather than 4629 * the tail, thus defeating the queue's LRU operation and ensuring that 4630 * the page will be reused quickly. Dirty pages not already in the 4631 * laundry are moved there. 4632 */ 4633 if (m->dirty == 0) 4634 vm_page_deactivate_noreuse(m); 4635 else if (!vm_page_in_laundry(m)) 4636 vm_page_launder(m); 4637 } 4638 4639 /* 4640 * vm_page_grab_release 4641 * 4642 * Helper routine for grab functions to release busy on return. 4643 */ 4644 static inline void 4645 vm_page_grab_release(vm_page_t m, int allocflags) 4646 { 4647 4648 if ((allocflags & VM_ALLOC_NOBUSY) != 0) { 4649 if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) 4650 vm_page_sunbusy(m); 4651 else 4652 vm_page_xunbusy(m); 4653 } 4654 } 4655 4656 /* 4657 * vm_page_grab_sleep 4658 * 4659 * Sleep for busy according to VM_ALLOC_ parameters. Returns true 4660 * if the caller should retry and false otherwise. 4661 * 4662 * If the object is locked on entry the object will be unlocked with 4663 * false returns and still locked but possibly having been dropped 4664 * with true returns. 4665 */ 4666 static bool 4667 vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex, 4668 const char *wmesg, int allocflags, bool locked) 4669 { 4670 4671 if ((allocflags & VM_ALLOC_NOWAIT) != 0) 4672 return (false); 4673 4674 /* 4675 * Reference the page before unlocking and sleeping so that 4676 * the page daemon is less likely to reclaim it. 4677 */ 4678 if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0) 4679 vm_page_reference(m); 4680 4681 if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) && 4682 locked) 4683 VM_OBJECT_WLOCK(object); 4684 if ((allocflags & VM_ALLOC_WAITFAIL) != 0) 4685 return (false); 4686 4687 return (true); 4688 } 4689 4690 /* 4691 * Assert that the grab flags are valid. 
4692 */ 4693 static inline void 4694 vm_page_grab_check(int allocflags) 4695 { 4696 4697 KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || 4698 (allocflags & VM_ALLOC_WIRED) != 0, 4699 ("vm_page_grab*: the pages must be busied or wired")); 4700 4701 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 4702 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 4703 ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); 4704 } 4705 4706 /* 4707 * Calculate the page allocation flags for grab. 4708 */ 4709 static inline int 4710 vm_page_grab_pflags(int allocflags) 4711 { 4712 int pflags; 4713 4714 pflags = allocflags & 4715 ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | 4716 VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY); 4717 if ((allocflags & VM_ALLOC_NOWAIT) == 0) 4718 pflags |= VM_ALLOC_WAITFAIL; 4719 if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) 4720 pflags |= VM_ALLOC_SBUSY; 4721 4722 return (pflags); 4723 } 4724 4725 /* 4726 * Grab a page, waiting until we are woken up due to the page changing state. 4727 * We keep on waiting, if the page continues to be in the object, unless 4728 * allocflags forbid waiting. 4729 * 4730 * The object must be locked on entry. This routine may sleep. The lock will, 4731 * however, be released and reacquired if the routine sleeps. 4732 * 4733 * Return a grabbed page, or NULL. Set *found if a page was found, whether or 4734 * not it was grabbed. 4735 */ 4736 static inline vm_page_t 4737 vm_page_grab_lookup(vm_object_t object, vm_pindex_t pindex, int allocflags, 4738 bool *found, struct pctrie_iter *pages) 4739 { 4740 vm_page_t m; 4741 4742 while ((*found = (m = vm_radix_iter_lookup(pages, pindex)) != NULL) && 4743 !vm_page_tryacquire(m, allocflags)) { 4744 if (!vm_page_grab_sleep(object, m, pindex, "pgrbwt", 4745 allocflags, true)) 4746 return (NULL); 4747 pctrie_iter_reset(pages); 4748 } 4749 return (m); 4750 } 4751 4752 /* 4753 * Grab a page. Use an iterator parameter. Keep on waiting, as long as the page 4754 * exists in the object. If the page doesn't exist, first allocate it and then 4755 * conditionally zero it. 4756 * 4757 * The object must be locked on entry. This routine may sleep. The lock will, 4758 * however, be released and reacquired if the routine sleeps. 4759 */ 4760 vm_page_t 4761 vm_page_grab_iter(vm_object_t object, vm_pindex_t pindex, int allocflags, 4762 struct pctrie_iter *pages) 4763 { 4764 vm_page_t m; 4765 bool found; 4766 4767 VM_OBJECT_ASSERT_WLOCKED(object); 4768 vm_page_grab_check(allocflags); 4769 4770 while ((m = vm_page_grab_lookup( 4771 object, pindex, allocflags, &found, pages)) == NULL) { 4772 if ((allocflags & VM_ALLOC_NOCREAT) != 0) 4773 return (NULL); 4774 if (found && 4775 (allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) 4776 return (NULL); 4777 m = vm_page_alloc_iter(object, pindex, 4778 vm_page_grab_pflags(allocflags), pages); 4779 if (m != NULL) { 4780 if ((allocflags & VM_ALLOC_ZERO) != 0 && 4781 (m->flags & PG_ZERO) == 0) 4782 pmap_zero_page(m); 4783 break; 4784 } 4785 if ((allocflags & 4786 (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) 4787 return (NULL); 4788 } 4789 vm_page_grab_release(m, allocflags); 4790 4791 return (m); 4792 } 4793 4794 /* 4795 * Grab a page. Keep on waiting, as long as the page exists in the object. If 4796 * the page doesn't exist, first allocate it and then conditionally zero it. 4797 * 4798 * The object must be locked on entry. This routine may sleep. The lock will, 4799 * however, be released and reacquired if the routine sleeps. 
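 *
 *	A minimal usage sketch (illustrative only; with these flags the page
 *	is returned exclusively busied):
 *
 *		VM_OBJECT_WLOCK(object);
 *		m = vm_page_grab(object, pindex,
 *		    VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
 *		... use the busied page ...
 *		vm_page_xunbusy(m);
 *		VM_OBJECT_WUNLOCK(object);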
4800  */
4801 vm_page_t
4802 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
4803 {
4804     struct pctrie_iter pages;
4805 
4806     VM_OBJECT_ASSERT_WLOCKED(object);
4807     vm_page_iter_init(&pages, object);
4808     return (vm_page_grab_iter(object, pindex, allocflags, &pages));
4809 }
4810 
4811 /*
4812  * Attempt to validate a page, locklessly acquiring it if necessary, given a
4813  * (object, pindex) tuple and either an unvalidated page or NULL.  The resulting
4814  * page will be validated against the identity tuple, and busied or wired as
4815  * requested.  A NULL page returned guarantees that the page was not in radix at
4816  * the time of the call, but callers must perform higher-level synchronization or
4817  * retry the operation under a lock if they require an atomic answer.  This is
4818  * the only lock-free validation routine; other routines can depend on the
4819  * resulting page state.
4820  *
4821  * The return value PAGE_NOT_ACQUIRED indicates that the operation failed due to
4822  * caller flags.
4823  */
4824 #define PAGE_NOT_ACQUIRED ((vm_page_t)1)
4825 static vm_page_t
4826 vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex, vm_page_t m,
4827     int allocflags)
4828 {
4829     if (m == NULL)
4830         m = vm_page_lookup_unlocked(object, pindex);
4831     for (; m != NULL; m = vm_page_lookup_unlocked(object, pindex)) {
4832         if (vm_page_trybusy(m, allocflags)) {
4833             if (m->object == object && m->pindex == pindex) {
4834                 if ((allocflags & VM_ALLOC_WIRED) != 0)
4835                     vm_page_wire(m);
4836                 vm_page_grab_release(m, allocflags);
4837                 break;
4838             }
4839             /* relookup. */
4840             vm_page_busy_release(m);
4841             cpu_spinwait();
4842             continue;
4843         }
4844         if (!vm_page_grab_sleep(object, m, pindex, "pgnslp",
4845             allocflags, false))
4846             return (PAGE_NOT_ACQUIRED);
4847     }
4848     return (m);
4849 }
4850 
4851 /*
4852  * Try to locklessly grab a page and fall back to the object lock if NOCREAT
4853  * is not set.
4854  */
4855 vm_page_t
4856 vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags)
4857 {
4858     vm_page_t m;
4859 
4860     vm_page_grab_check(allocflags);
4861     m = vm_page_acquire_unlocked(object, pindex, NULL, allocflags);
4862     if (m == PAGE_NOT_ACQUIRED)
4863         return (NULL);
4864     if (m != NULL)
4865         return (m);
4866 
4867     /*
4868      * The radix lockless lookup should never return a false negative.
4869      * If the user specifies NOCREAT, they are guaranteed that there
4870      * was no page present at the instant of the call.  A NOCREAT caller
4871      * must handle create races gracefully.
4872      */
4873     if ((allocflags & VM_ALLOC_NOCREAT) != 0)
4874         return (NULL);
4875 
4876     VM_OBJECT_WLOCK(object);
4877     m = vm_page_grab(object, pindex, allocflags);
4878     VM_OBJECT_WUNLOCK(object);
4879 
4880     return (m);
4881 }
4882 
4883 /*
4884  * Grab a page and make it valid, paging in if necessary.  Use an iterator
4885  * parameter.  Pages missing from their pager are zero filled and validated.  If
4886  * a VM_ALLOC_COUNT is supplied and the page is not valid, as many as
4887  * VM_INITIAL_PAGEIN pages can be brought in simultaneously.  Additional pages
4888  * will be left on a paging queue but will neither be wired nor busy regardless
4889  * of allocflags.
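 *
 *	Most callers use the vm_page_grab_valid() wrapper below.  An
 *	illustrative sketch:
 *
 *		rv = vm_page_grab_valid(&m, object, pindex,
 *		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 *		if (rv != VM_PAGER_OK)
 *			... handle the paging error ...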
4890 */ 4891 int 4892 vm_page_grab_valid_iter(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, 4893 int allocflags, struct pctrie_iter *pages) 4894 { 4895 vm_page_t m; 4896 vm_page_t ma[VM_INITIAL_PAGEIN]; 4897 int after, ahead, i, pflags, rv; 4898 4899 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 4900 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 4901 ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); 4902 KASSERT((allocflags & 4903 (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, 4904 ("vm_page_grab_valid: Invalid flags 0x%X", allocflags)); 4905 VM_OBJECT_ASSERT_WLOCKED(object); 4906 pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY | 4907 VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY); 4908 pflags |= VM_ALLOC_WAITFAIL; 4909 4910 retrylookup: 4911 if ((m = vm_radix_iter_lookup(pages, pindex)) != NULL) { 4912 /* 4913 * If the page is fully valid it can only become invalid 4914 * with the object lock held. If it is not valid it can 4915 * become valid with the busy lock held. Therefore, we 4916 * may unnecessarily lock the exclusive busy here if we 4917 * race with I/O completion not using the object lock. 4918 * However, we will not end up with an invalid page and a 4919 * shared lock. 4920 */ 4921 if (!vm_page_trybusy(m, 4922 vm_page_all_valid(m) ? allocflags : 0)) { 4923 (void)vm_page_grab_sleep(object, m, pindex, "pgrbwt", 4924 allocflags, true); 4925 pctrie_iter_reset(pages); 4926 goto retrylookup; 4927 } 4928 if (vm_page_all_valid(m)) 4929 goto out; 4930 if ((allocflags & VM_ALLOC_NOCREAT) != 0) { 4931 vm_page_busy_release(m); 4932 *mp = NULL; 4933 return (VM_PAGER_FAIL); 4934 } 4935 } else if ((allocflags & VM_ALLOC_NOCREAT) != 0) { 4936 *mp = NULL; 4937 return (VM_PAGER_FAIL); 4938 } else { 4939 m = vm_page_alloc_iter(object, pindex, pflags, pages); 4940 if (m == NULL) { 4941 if (!vm_pager_can_alloc_page(object, pindex)) { 4942 *mp = NULL; 4943 return (VM_PAGER_AGAIN); 4944 } 4945 goto retrylookup; 4946 } 4947 } 4948 4949 vm_page_assert_xbusied(m); 4950 if (vm_pager_has_page(object, pindex, NULL, &after)) { 4951 after = MIN(after, VM_INITIAL_PAGEIN); 4952 after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT); 4953 after = MAX(after, 1); 4954 ma[0] = m; 4955 pctrie_iter_reset(pages); 4956 for (i = 1; i < after; i++) { 4957 m = vm_radix_iter_lookup_ge(pages, pindex + i); 4958 ahead = after; 4959 if (m != NULL) 4960 ahead = MIN(ahead, m->pindex - pindex); 4961 for (; i < ahead; i++) { 4962 ma[i] = vm_page_alloc_iter(object, pindex + i, 4963 VM_ALLOC_NORMAL, pages); 4964 if (ma[i] == NULL) 4965 break; 4966 } 4967 if (m == NULL || m->pindex != pindex + i || 4968 vm_page_any_valid(m) || !vm_page_tryxbusy(m)) 4969 break; 4970 ma[i] = m; 4971 } 4972 after = i; 4973 vm_object_pip_add(object, after); 4974 VM_OBJECT_WUNLOCK(object); 4975 rv = vm_pager_get_pages(object, ma, after, NULL, NULL); 4976 pctrie_iter_reset(pages); 4977 VM_OBJECT_WLOCK(object); 4978 vm_object_pip_wakeupn(object, after); 4979 /* Pager may have replaced a page. 
*/ 4980 m = ma[0]; 4981 if (rv != VM_PAGER_OK) { 4982 for (i = 0; i < after; i++) { 4983 if (!vm_page_wired(ma[i])) 4984 vm_page_free(ma[i]); 4985 else 4986 vm_page_xunbusy(ma[i]); 4987 } 4988 *mp = NULL; 4989 return (rv); 4990 } 4991 for (i = 1; i < after; i++) 4992 vm_page_readahead_finish(ma[i]); 4993 MPASS(vm_page_all_valid(m)); 4994 } else { 4995 vm_page_zero_invalid(m, TRUE); 4996 pctrie_iter_reset(pages); 4997 } 4998 out: 4999 if ((allocflags & VM_ALLOC_WIRED) != 0) 5000 vm_page_wire(m); 5001 if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m)) 5002 vm_page_busy_downgrade(m); 5003 else if ((allocflags & VM_ALLOC_NOBUSY) != 0) 5004 vm_page_busy_release(m); 5005 *mp = m; 5006 return (VM_PAGER_OK); 5007 } 5008 5009 /* 5010 * Grab a page and make it valid, paging in if necessary. Pages missing from 5011 * their pager are zero filled and validated. If a VM_ALLOC_COUNT is supplied 5012 * and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought 5013 * in simultaneously. Additional pages will be left on a paging queue but 5014 * will neither be wired nor busy regardless of allocflags. 5015 */ 5016 int 5017 vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, 5018 int allocflags) 5019 { 5020 struct pctrie_iter pages; 5021 5022 VM_OBJECT_ASSERT_WLOCKED(object); 5023 vm_page_iter_init(&pages, object); 5024 return (vm_page_grab_valid_iter(mp, object, pindex, allocflags, 5025 &pages)); 5026 } 5027 5028 /* 5029 * Grab a page. Keep on waiting, as long as the page exists in the object. If 5030 * the page doesn't exist, and the pager has it, allocate it and zero part of 5031 * it. 5032 * 5033 * The object must be locked on entry. This routine may sleep. The lock will, 5034 * however, be released and reacquired if the routine sleeps. 5035 */ 5036 int 5037 vm_page_grab_zero_partial(vm_object_t object, vm_pindex_t pindex, int base, 5038 int end) 5039 { 5040 struct pctrie_iter pages; 5041 vm_page_t m; 5042 int allocflags, rv; 5043 bool found; 5044 5045 VM_OBJECT_ASSERT_WLOCKED(object); 5046 KASSERT(base >= 0, ("%s: base %d", __func__, base)); 5047 KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, 5048 end)); 5049 5050 allocflags = VM_ALLOC_NOCREAT | VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL; 5051 vm_page_iter_init(&pages, object); 5052 while ((m = vm_page_grab_lookup( 5053 object, pindex, allocflags, &found, &pages)) == NULL) { 5054 if (!vm_pager_has_page(object, pindex, NULL, NULL)) 5055 return (0); 5056 m = vm_page_alloc_iter(object, pindex, 5057 vm_page_grab_pflags(allocflags), &pages); 5058 if (m != NULL) { 5059 vm_object_pip_add(object, 1); 5060 VM_OBJECT_WUNLOCK(object); 5061 rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); 5062 VM_OBJECT_WLOCK(object); 5063 vm_object_pip_wakeup(object); 5064 if (rv != VM_PAGER_OK) { 5065 vm_page_free(m); 5066 return (EIO); 5067 } 5068 5069 /* 5070 * Since the page was not resident, and therefore not 5071 * recently accessed, immediately enqueue it for 5072 * asynchronous laundering. The current operation is 5073 * not regarded as an access. 5074 */ 5075 vm_page_launder(m); 5076 break; 5077 } 5078 } 5079 5080 pmap_zero_page_area(m, base, end - base); 5081 KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", __func__, m)); 5082 vm_page_set_dirty(m); 5083 vm_page_xunbusy(m); 5084 return (0); 5085 } 5086 5087 /* 5088 * Locklessly grab a valid page. If the page is not valid or not yet 5089 * allocated this will fall back to the object lock method. 
5090 */ 5091 int 5092 vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object, 5093 vm_pindex_t pindex, int allocflags) 5094 { 5095 vm_page_t m; 5096 int flags; 5097 int error; 5098 5099 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 5100 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 5101 ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY " 5102 "mismatch")); 5103 KASSERT((allocflags & 5104 (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, 5105 ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags)); 5106 5107 /* 5108 * Attempt a lockless lookup and busy. We need at least an sbusy 5109 * before we can inspect the valid field and return a wired page. 5110 */ 5111 flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); 5112 vm_page_grab_check(flags); 5113 m = vm_page_acquire_unlocked(object, pindex, NULL, flags); 5114 if (m == PAGE_NOT_ACQUIRED) 5115 return (VM_PAGER_FAIL); 5116 if (m != NULL) { 5117 if (vm_page_all_valid(m)) { 5118 if ((allocflags & VM_ALLOC_WIRED) != 0) 5119 vm_page_wire(m); 5120 vm_page_grab_release(m, allocflags); 5121 *mp = m; 5122 return (VM_PAGER_OK); 5123 } 5124 vm_page_busy_release(m); 5125 } 5126 if ((allocflags & VM_ALLOC_NOCREAT) != 0) { 5127 *mp = NULL; 5128 return (VM_PAGER_FAIL); 5129 } 5130 VM_OBJECT_WLOCK(object); 5131 error = vm_page_grab_valid(mp, object, pindex, allocflags); 5132 VM_OBJECT_WUNLOCK(object); 5133 5134 return (error); 5135 } 5136 5137 /* 5138 * Return the specified range of pages from the given object. For each 5139 * page offset within the range, if a page already exists within the object 5140 * at that offset and it is busy, then wait for it to change state. If, 5141 * instead, the page doesn't exist, then allocate it. 5142 * 5143 * The caller must always specify an allocation class. 5144 * 5145 * allocation classes: 5146 * VM_ALLOC_NORMAL normal process request 5147 * VM_ALLOC_SYSTEM system *really* needs the pages 5148 * VM_ALLOC_INTERRUPT interrupt time request 5149 * 5150 * The caller must always specify that the pages are to be busied and/or 5151 * wired. 5152 * 5153 * optional allocation flags: 5154 * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages 5155 * VM_ALLOC_NOBUSY do not exclusive busy the pages 5156 * VM_ALLOC_NODUMP do not include the pages in a kernel core dump 5157 * VM_ALLOC_NOFREE pages will never be freed 5158 * VM_ALLOC_NOWAIT do not sleep 5159 * VM_ALLOC_SBUSY set pages to sbusy state 5160 * VM_ALLOC_WAITFAIL in case of failure, sleep before returning 5161 * VM_ALLOC_WAITOK ignored (default behavior) 5162 * VM_ALLOC_WIRED wire the pages 5163 * VM_ALLOC_ZERO zero and validate any invalid pages 5164 * 5165 * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it 5166 * may return a partial prefix of the requested range. 
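 *
 *	An illustrative sketch of populating and wiring a contiguous range:
 *
 *		VM_OBJECT_WLOCK(object);
 *		n = vm_page_grab_pages(object, pindex,
 *		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY,
 *		    ma, count);
 *		VM_OBJECT_WUNLOCK(object);
 *
 *	With these flags the full count is returned; a short count is
 *	possible only with VM_ALLOC_NOWAIT, VM_ALLOC_WAITFAIL, or
 *	VM_ALLOC_NOCREAT.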
5167 */ 5168 int 5169 vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, 5170 vm_page_t *ma, int count) 5171 { 5172 struct pctrie_iter pages; 5173 vm_page_t m; 5174 int pflags; 5175 int ahead, i; 5176 5177 VM_OBJECT_ASSERT_WLOCKED(object); 5178 KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, 5179 ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); 5180 KASSERT(count > 0, 5181 ("vm_page_grab_pages: invalid page count %d", count)); 5182 vm_page_grab_check(allocflags); 5183 5184 pflags = vm_page_grab_pflags(allocflags); 5185 i = 0; 5186 vm_page_iter_init(&pages, object); 5187 retrylookup: 5188 ahead = -1; 5189 for (; i < count; i++) { 5190 if (ahead < 0) { 5191 ahead = vm_radix_iter_lookup_range( 5192 &pages, pindex + i, &ma[i], count - i); 5193 } 5194 if (ahead-- > 0) { 5195 m = ma[i]; 5196 if (!vm_page_tryacquire(m, allocflags)) { 5197 if (vm_page_grab_sleep(object, m, pindex + i, 5198 "grbmaw", allocflags, true)) { 5199 pctrie_iter_reset(&pages); 5200 goto retrylookup; 5201 } 5202 break; 5203 } 5204 } else { 5205 if ((allocflags & VM_ALLOC_NOCREAT) != 0) 5206 break; 5207 m = vm_page_alloc_iter(object, pindex + i, 5208 pflags | VM_ALLOC_COUNT(count - i), &pages); 5209 /* pages was reset if alloc_iter lost the lock. */ 5210 if (m == NULL) { 5211 if ((allocflags & (VM_ALLOC_NOWAIT | 5212 VM_ALLOC_WAITFAIL)) != 0) 5213 break; 5214 goto retrylookup; 5215 } 5216 ma[i] = m; 5217 } 5218 if (vm_page_none_valid(m) && 5219 (allocflags & VM_ALLOC_ZERO) != 0) { 5220 if ((m->flags & PG_ZERO) == 0) 5221 pmap_zero_page(m); 5222 vm_page_valid(m); 5223 } 5224 vm_page_grab_release(m, allocflags); 5225 } 5226 return (i); 5227 } 5228 5229 /* 5230 * Unlocked variant of vm_page_grab_pages(). This accepts the same flags 5231 * and will fall back to the locked variant to handle allocation. 5232 */ 5233 int 5234 vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex, 5235 int allocflags, vm_page_t *ma, int count) 5236 { 5237 vm_page_t m; 5238 int flags; 5239 int i, num_fetched; 5240 5241 KASSERT(count > 0, 5242 ("vm_page_grab_pages_unlocked: invalid page count %d", count)); 5243 vm_page_grab_check(allocflags); 5244 5245 /* 5246 * Modify flags for lockless acquire to hold the page until we 5247 * set it valid if necessary. 5248 */ 5249 flags = allocflags & ~VM_ALLOC_NOBUSY; 5250 vm_page_grab_check(flags); 5251 num_fetched = vm_radix_lookup_range_unlocked(&object->rtree, pindex, 5252 ma, count); 5253 for (i = 0; i < num_fetched; i++, pindex++) { 5254 m = vm_page_acquire_unlocked(object, pindex, ma[i], flags); 5255 if (m == PAGE_NOT_ACQUIRED) 5256 return (i); 5257 if (m == NULL) 5258 break; 5259 if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) { 5260 if ((m->flags & PG_ZERO) == 0) 5261 pmap_zero_page(m); 5262 vm_page_valid(m); 5263 } 5264 /* m will still be wired or busy according to flags. */ 5265 vm_page_grab_release(m, allocflags); 5266 /* vm_page_acquire_unlocked() may not return ma[i]. */ 5267 ma[i] = m; 5268 } 5269 if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0) 5270 return (i); 5271 count -= i; 5272 VM_OBJECT_WLOCK(object); 5273 i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count); 5274 VM_OBJECT_WUNLOCK(object); 5275 5276 return (i); 5277 } 5278 5279 /* 5280 * Mapping function for valid or dirty bits in a page. 5281 * 5282 * Inputs are required to range within a page. 
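 *
 *	Worked example (assuming the usual DEV_BSIZE of 512, DEV_BSHIFT of 9):
 *	vm_page_bits(0, 1024) computes first_bit = 0 and last_bit = 1, so the
 *	result is (2 << 1) - (1 << 0) = 0x3, i.e. the bits for the first two
 *	DEV_BSIZE blocks of the page.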
5283 */ 5284 vm_page_bits_t 5285 vm_page_bits(int base, int size) 5286 { 5287 int first_bit; 5288 int last_bit; 5289 5290 KASSERT( 5291 base + size <= PAGE_SIZE, 5292 ("vm_page_bits: illegal base/size %d/%d", base, size) 5293 ); 5294 5295 if (size == 0) /* handle degenerate case */ 5296 return (0); 5297 5298 first_bit = base >> DEV_BSHIFT; 5299 last_bit = (base + size - 1) >> DEV_BSHIFT; 5300 5301 return (((vm_page_bits_t)2 << last_bit) - 5302 ((vm_page_bits_t)1 << first_bit)); 5303 } 5304 5305 void 5306 vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set) 5307 { 5308 5309 #if PAGE_SIZE == 32768 5310 atomic_set_64((uint64_t *)bits, set); 5311 #elif PAGE_SIZE == 16384 5312 atomic_set_32((uint32_t *)bits, set); 5313 #elif (PAGE_SIZE == 8192) && defined(atomic_set_16) 5314 atomic_set_16((uint16_t *)bits, set); 5315 #elif (PAGE_SIZE == 4096) && defined(atomic_set_8) 5316 atomic_set_8((uint8_t *)bits, set); 5317 #else /* PAGE_SIZE <= 8192 */ 5318 uintptr_t addr; 5319 int shift; 5320 5321 addr = (uintptr_t)bits; 5322 /* 5323 * Use a trick to perform a 32-bit atomic on the 5324 * containing aligned word, to not depend on the existence 5325 * of atomic_{set, clear}_{8, 16}. 5326 */ 5327 shift = addr & (sizeof(uint32_t) - 1); 5328 #if BYTE_ORDER == BIG_ENDIAN 5329 shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; 5330 #else 5331 shift *= NBBY; 5332 #endif 5333 addr &= ~(sizeof(uint32_t) - 1); 5334 atomic_set_32((uint32_t *)addr, set << shift); 5335 #endif /* PAGE_SIZE */ 5336 } 5337 5338 static inline void 5339 vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear) 5340 { 5341 5342 #if PAGE_SIZE == 32768 5343 atomic_clear_64((uint64_t *)bits, clear); 5344 #elif PAGE_SIZE == 16384 5345 atomic_clear_32((uint32_t *)bits, clear); 5346 #elif (PAGE_SIZE == 8192) && defined(atomic_clear_16) 5347 atomic_clear_16((uint16_t *)bits, clear); 5348 #elif (PAGE_SIZE == 4096) && defined(atomic_clear_8) 5349 atomic_clear_8((uint8_t *)bits, clear); 5350 #else /* PAGE_SIZE <= 8192 */ 5351 uintptr_t addr; 5352 int shift; 5353 5354 addr = (uintptr_t)bits; 5355 /* 5356 * Use a trick to perform a 32-bit atomic on the 5357 * containing aligned word, to not depend on the existence 5358 * of atomic_{set, clear}_{8, 16}. 
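 *
 *	For instance (little-endian, an 8-bit vm_page_bits_t at byte offset 2
 *	of its aligned 32-bit word), shift becomes 16 and the update is issued
 *	as atomic_clear_32(word, clear << 16), leaving the other bytes intact.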
5359 */ 5360 shift = addr & (sizeof(uint32_t) - 1); 5361 #if BYTE_ORDER == BIG_ENDIAN 5362 shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; 5363 #else 5364 shift *= NBBY; 5365 #endif 5366 addr &= ~(sizeof(uint32_t) - 1); 5367 atomic_clear_32((uint32_t *)addr, clear << shift); 5368 #endif /* PAGE_SIZE */ 5369 } 5370 5371 static inline vm_page_bits_t 5372 vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits) 5373 { 5374 #if PAGE_SIZE == 32768 5375 uint64_t old; 5376 5377 old = *bits; 5378 while (atomic_fcmpset_64(bits, &old, newbits) == 0); 5379 return (old); 5380 #elif PAGE_SIZE == 16384 5381 uint32_t old; 5382 5383 old = *bits; 5384 while (atomic_fcmpset_32(bits, &old, newbits) == 0); 5385 return (old); 5386 #elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16) 5387 uint16_t old; 5388 5389 old = *bits; 5390 while (atomic_fcmpset_16(bits, &old, newbits) == 0); 5391 return (old); 5392 #elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8) 5393 uint8_t old; 5394 5395 old = *bits; 5396 while (atomic_fcmpset_8(bits, &old, newbits) == 0); 5397 return (old); 5398 #else /* PAGE_SIZE <= 4096*/ 5399 uintptr_t addr; 5400 uint32_t old, new, mask; 5401 int shift; 5402 5403 addr = (uintptr_t)bits; 5404 /* 5405 * Use a trick to perform a 32-bit atomic on the 5406 * containing aligned word, to not depend on the existence 5407 * of atomic_{set, swap, clear}_{8, 16}. 5408 */ 5409 shift = addr & (sizeof(uint32_t) - 1); 5410 #if BYTE_ORDER == BIG_ENDIAN 5411 shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; 5412 #else 5413 shift *= NBBY; 5414 #endif 5415 addr &= ~(sizeof(uint32_t) - 1); 5416 mask = VM_PAGE_BITS_ALL << shift; 5417 5418 old = *bits; 5419 do { 5420 new = old & ~mask; 5421 new |= newbits << shift; 5422 } while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0); 5423 return (old >> shift); 5424 #endif /* PAGE_SIZE */ 5425 } 5426 5427 /* 5428 * vm_page_set_valid_range: 5429 * 5430 * Sets portions of a page valid. The arguments are expected 5431 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 5432 * of any partial chunks touched by the range. The invalid portion of 5433 * such chunks will be zeroed. 5434 * 5435 * (base + size) must be less then or equal to PAGE_SIZE. 5436 */ 5437 void 5438 vm_page_set_valid_range(vm_page_t m, int base, int size) 5439 { 5440 int endoff, frag; 5441 vm_page_bits_t pagebits; 5442 5443 vm_page_assert_busied(m); 5444 if (size == 0) /* handle degenerate case */ 5445 return; 5446 5447 /* 5448 * If the base is not DEV_BSIZE aligned and the valid 5449 * bit is clear, we have to zero out a portion of the 5450 * first block. 5451 */ 5452 if ((frag = rounddown2(base, DEV_BSIZE)) != base && 5453 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) 5454 pmap_zero_page_area(m, frag, base - frag); 5455 5456 /* 5457 * If the ending offset is not DEV_BSIZE aligned and the 5458 * valid bit is clear, we have to zero out a portion of 5459 * the last block. 5460 */ 5461 endoff = base + size; 5462 if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && 5463 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) 5464 pmap_zero_page_area(m, endoff, 5465 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 5466 5467 /* 5468 * Assert that no previously invalid block that is now being validated 5469 * is already dirty. 5470 */ 5471 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, 5472 ("vm_page_set_valid_range: page %p is dirty", m)); 5473 5474 /* 5475 * Set valid bits inclusive of any overlap. 
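 *
 *	As an illustration (assuming DEV_BSIZE is 512), a call of
 *	vm_page_set_valid_range(m, 100, 500) zeroes bytes 0-99 and 600-1023
 *	above when those blocks were still invalid, and vm_page_bits(100, 500)
 *	is 0x3 here, so blocks 0 and 1 are marked valid.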
5476 */ 5477 pagebits = vm_page_bits(base, size); 5478 if (vm_page_xbusied(m)) 5479 m->valid |= pagebits; 5480 else 5481 vm_page_bits_set(m, &m->valid, pagebits); 5482 } 5483 5484 /* 5485 * Set the page dirty bits and free the invalid swap space if 5486 * present. Returns the previous dirty bits. 5487 */ 5488 vm_page_bits_t 5489 vm_page_set_dirty(vm_page_t m) 5490 { 5491 vm_page_bits_t old; 5492 5493 VM_PAGE_OBJECT_BUSY_ASSERT(m); 5494 5495 if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) { 5496 old = m->dirty; 5497 m->dirty = VM_PAGE_BITS_ALL; 5498 } else 5499 old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL); 5500 if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0) 5501 vm_pager_page_unswapped(m); 5502 5503 return (old); 5504 } 5505 5506 /* 5507 * Clear the given bits from the specified page's dirty field. 5508 */ 5509 static __inline void 5510 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) 5511 { 5512 5513 vm_page_assert_busied(m); 5514 5515 /* 5516 * If the page is xbusied and not write mapped we are the 5517 * only thread that can modify dirty bits. Otherwise, The pmap 5518 * layer can call vm_page_dirty() without holding a distinguished 5519 * lock. The combination of page busy and atomic operations 5520 * suffice to guarantee consistency of the page dirty field. 5521 */ 5522 if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) 5523 m->dirty &= ~pagebits; 5524 else 5525 vm_page_bits_clear(m, &m->dirty, pagebits); 5526 } 5527 5528 /* 5529 * vm_page_set_validclean: 5530 * 5531 * Sets portions of a page valid and clean. The arguments are expected 5532 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 5533 * of any partial chunks touched by the range. The invalid portion of 5534 * such chunks will be zero'd. 5535 * 5536 * (base + size) must be less then or equal to PAGE_SIZE. 5537 */ 5538 void 5539 vm_page_set_validclean(vm_page_t m, int base, int size) 5540 { 5541 vm_page_bits_t oldvalid, pagebits; 5542 int endoff, frag; 5543 5544 vm_page_assert_busied(m); 5545 if (size == 0) /* handle degenerate case */ 5546 return; 5547 5548 /* 5549 * If the base is not DEV_BSIZE aligned and the valid 5550 * bit is clear, we have to zero out a portion of the 5551 * first block. 5552 */ 5553 if ((frag = rounddown2(base, DEV_BSIZE)) != base && 5554 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) 5555 pmap_zero_page_area(m, frag, base - frag); 5556 5557 /* 5558 * If the ending offset is not DEV_BSIZE aligned and the 5559 * valid bit is clear, we have to zero out a portion of 5560 * the last block. 5561 */ 5562 endoff = base + size; 5563 if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && 5564 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) 5565 pmap_zero_page_area(m, endoff, 5566 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 5567 5568 /* 5569 * Set valid, clear dirty bits. If validating the entire 5570 * page we can safely clear the pmap modify bit. We also 5571 * use this opportunity to clear the PGA_NOSYNC flag. If a process 5572 * takes a write fault on a MAP_NOSYNC memory area the flag will 5573 * be set again. 5574 * 5575 * We set valid bits inclusive of any overlap, but we can only 5576 * clear dirty bits for DEV_BSIZE chunks that are fully within 5577 * the range. 
5578  */
5579     oldvalid = m->valid;
5580     pagebits = vm_page_bits(base, size);
5581     if (vm_page_xbusied(m))
5582         m->valid |= pagebits;
5583     else
5584         vm_page_bits_set(m, &m->valid, pagebits);
5585 #if 0	/* NOT YET */
5586     if ((frag = base & (DEV_BSIZE - 1)) != 0) {
5587         frag = DEV_BSIZE - frag;
5588         base += frag;
5589         size -= frag;
5590         if (size < 0)
5591             size = 0;
5592     }
5593     pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
5594 #endif
5595     if (base == 0 && size == PAGE_SIZE) {
5596         /*
5597          * The page can only be modified within the pmap if it is
5598          * mapped, and it can only be mapped if it was previously
5599          * fully valid.
5600          */
5601         if (oldvalid == VM_PAGE_BITS_ALL)
5602             /*
5603              * Perform the pmap_clear_modify() first.  Otherwise,
5604              * a concurrent pmap operation, such as
5605              * pmap_protect(), could clear a modification in the
5606              * pmap and set the dirty field on the page before
5607              * pmap_clear_modify() had begun and after the dirty
5608              * field was cleared here.
5609              */
5610             pmap_clear_modify(m);
5611         m->dirty = 0;
5612         vm_page_aflag_clear(m, PGA_NOSYNC);
5613     } else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m))
5614         m->dirty &= ~pagebits;
5615     else
5616         vm_page_clear_dirty_mask(m, pagebits);
5617 }
5618 
5619 void
5620 vm_page_clear_dirty(vm_page_t m, int base, int size)
5621 {
5622 
5623     vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
5624 }
5625 
5626 /*
5627  * vm_page_set_invalid:
5628  *
5629  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
5630  * valid and dirty bits for the affected areas are cleared.
5631  */
5632 void
5633 vm_page_set_invalid(vm_page_t m, int base, int size)
5634 {
5635     vm_page_bits_t bits;
5636     vm_object_t object;
5637 
5638     /*
5639      * The object lock is required so that pages can't be mapped
5640      * read-only while we're in the process of invalidating them.
5641      */
5642     object = m->object;
5643     VM_OBJECT_ASSERT_WLOCKED(object);
5644     vm_page_assert_busied(m);
5645 
5646     if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
5647         size >= object->un_pager.vnp.vnp_size)
5648         bits = VM_PAGE_BITS_ALL;
5649     else
5650         bits = vm_page_bits(base, size);
5651     if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0)
5652         pmap_remove_all(m);
5653     KASSERT((bits == 0 && vm_page_all_valid(m)) ||
5654         !pmap_page_is_mapped(m),
5655         ("vm_page_set_invalid: page %p is mapped", m));
5656     if (vm_page_xbusied(m)) {
5657         m->valid &= ~bits;
5658         m->dirty &= ~bits;
5659     } else {
5660         vm_page_bits_clear(m, &m->valid, bits);
5661         vm_page_bits_clear(m, &m->dirty, bits);
5662     }
5663 }
5664 
5665 /*
5666  * vm_page_invalid:
5667  *
5668  * Invalidates the entire page.  The page must be busy, unmapped, and
5669  * the enclosing object must be locked.  The object lock protects
5670  * against a concurrent read-only pmap enter, which is done without
5671  * busy.
5672  */
5673 void
5674 vm_page_invalid(vm_page_t m)
5675 {
5676 
5677     vm_page_assert_busied(m);
5678     VM_OBJECT_ASSERT_WLOCKED(m->object);
5679     MPASS(!pmap_page_is_mapped(m));
5680 
5681     if (vm_page_xbusied(m))
5682         m->valid = 0;
5683     else
5684         vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL);
5685 }
5686 
5687 /*
5688  * vm_page_zero_invalid()
5689  *
5690  * The kernel assumes that the invalid portions of a page contain
5691  * garbage, but such pages can be mapped into memory by user code.
5692  * When this occurs, we must zero out the non-valid portions of the
5693  * page so user code sees what it expects.
5694  *
5695  * Pages are most often semi-valid when the end of a file is mapped
5696  * into memory and the file's size is not page aligned.
5697  */
5698 void
5699 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
5700 {
5701     int b;
5702     int i;
5703 
5704     /*
5705      * Scan the valid bits looking for invalid sections that
5706      * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
5707      * valid bit may be set) have already been zeroed by
5708      * vm_page_set_validclean().
5709      */
5710     for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
5711         if (i == (PAGE_SIZE / DEV_BSIZE) ||
5712             (m->valid & ((vm_page_bits_t)1 << i))) {
5713             if (i > b) {
5714                 pmap_zero_page_area(m,
5715                     b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
5716             }
5717             b = i + 1;
5718         }
5719     }
5720 
5721     /*
5722      * setvalid is TRUE when we can safely set the zeroed areas
5723      * as being valid.  We can do this if there are no cache consistency
5724      * issues, e.g. it is ok to do with UFS, but not ok to do with NFS.
5725      */
5726     if (setvalid)
5727         vm_page_valid(m);
5728 }
5729 
5730 /*
5731  * vm_page_is_valid:
5732  *
5733  * Is (partial) page valid?  Note that in the degenerate case where
5734  * size == 0, the result is FALSE if the page is entirely invalid
5735  * and TRUE otherwise.
5736  *
5737  * Some callers invoke this routine without the busy lock held and
5738  * handle races via higher-level locks.  Typical callers should
5739  * hold a busy lock to prevent invalidation.
5740  */
5741 int
5742 vm_page_is_valid(vm_page_t m, int base, int size)
5743 {
5744     vm_page_bits_t bits;
5745 
5746     bits = vm_page_bits(base, size);
5747     return (vm_page_any_valid(m) && (m->valid & bits) == bits);
5748 }
5749 
5750 /*
5751  * Returns true if all of the specified predicates are true for the entire
5752  * (super)page and false otherwise.
5753  */
5754 bool
5755 vm_page_ps_test(vm_page_t m, int psind, int flags, vm_page_t skip_m)
5756 {
5757     vm_object_t object;
5758     int i, npages;
5759 
5760     object = m->object;
5761     if (skip_m != NULL && skip_m->object != object)
5762         return (false);
5763     VM_OBJECT_ASSERT_LOCKED(object);
5764     KASSERT(psind <= m->psind,
5765         ("psind %d > psind %d of m %p", psind, m->psind, m));
5766     npages = atop(pagesizes[psind]);
5767 
5768     /*
5769      * The physically contiguous pages that make up a superpage, i.e., a
5770      * page with a page size index ("psind") greater than zero, will
5771      * occupy adjacent entries in vm_page_array[].
5772      */
5773     for (i = 0; i < npages; i++) {
5774         /* Always test object consistency, including "skip_m". */
5775         if (m[i].object != object)
5776             return (false);
5777         if (&m[i] == skip_m)
5778             continue;
5779         if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
5780             return (false);
5781         if ((flags & PS_ALL_DIRTY) != 0) {
5782             /*
5783              * Calling vm_page_test_dirty() or pmap_is_modified()
5784              * might stop this case from spuriously returning
5785              * "false".  However, that would require a write lock
5786              * on the object containing "m[i]".
5787              */
5788             if (m[i].dirty != VM_PAGE_BITS_ALL)
5789                 return (false);
5790         }
5791         if ((flags & PS_ALL_VALID) != 0 &&
5792             m[i].valid != VM_PAGE_BITS_ALL)
5793             return (false);
5794     }
5795     return (true);
5796 }
5797 
5798 /*
5799  * Set the page's dirty bits if the page is modified.
5800 */ 5801 void 5802 vm_page_test_dirty(vm_page_t m) 5803 { 5804 5805 vm_page_assert_busied(m); 5806 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) 5807 vm_page_dirty(m); 5808 } 5809 5810 void 5811 vm_page_valid(vm_page_t m) 5812 { 5813 5814 vm_page_assert_busied(m); 5815 if (vm_page_xbusied(m)) 5816 m->valid = VM_PAGE_BITS_ALL; 5817 else 5818 vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL); 5819 } 5820 5821 #ifdef INVARIANTS 5822 void 5823 vm_page_object_busy_assert(vm_page_t m) 5824 { 5825 5826 /* 5827 * Certain of the page's fields may only be modified by the 5828 * holder of a page or object busy. 5829 */ 5830 if (m->object != NULL && !vm_page_busied(m)) 5831 VM_OBJECT_ASSERT_BUSY(m->object); 5832 } 5833 5834 void 5835 vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits) 5836 { 5837 5838 if ((bits & PGA_WRITEABLE) == 0) 5839 return; 5840 5841 /* 5842 * The PGA_WRITEABLE flag can only be set if the page is 5843 * managed, is exclusively busied or the object is locked. 5844 * Currently, this flag is only set by pmap_enter(). 5845 */ 5846 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5847 ("PGA_WRITEABLE on unmanaged page")); 5848 if (!vm_page_xbusied(m)) 5849 VM_OBJECT_ASSERT_BUSY(m->object); 5850 } 5851 #endif 5852 5853 #include "opt_ddb.h" 5854 #ifdef DDB 5855 #include <sys/kernel.h> 5856 5857 #include <ddb/ddb.h> 5858 5859 DB_SHOW_COMMAND_FLAGS(page, vm_page_print_page_info, DB_CMD_MEMSAFE) 5860 { 5861 5862 db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); 5863 db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); 5864 db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); 5865 db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); 5866 db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); 5867 db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); 5868 db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); 5869 db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); 5870 db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); 5871 } 5872 5873 DB_SHOW_COMMAND_FLAGS(pageq, vm_page_print_pageq_info, DB_CMD_MEMSAFE) 5874 { 5875 int dom; 5876 5877 db_printf("pq_free %d\n", vm_free_count()); 5878 for (dom = 0; dom < vm_ndomains; dom++) { 5879 db_printf( 5880 "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", 5881 dom, 5882 vm_dom[dom].vmd_page_count, 5883 vm_dom[dom].vmd_free_count, 5884 vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, 5885 vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, 5886 vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 5887 vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); 5888 } 5889 } 5890 5891 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) 5892 { 5893 vm_page_t m; 5894 boolean_t phys, virt; 5895 5896 if (!have_addr) { 5897 db_printf("show pginfo addr\n"); 5898 return; 5899 } 5900 5901 phys = strchr(modif, 'p') != NULL; 5902 virt = strchr(modif, 'v') != NULL; 5903 if (virt) 5904 m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); 5905 else if (phys) 5906 m = PHYS_TO_VM_PAGE(addr); 5907 else 5908 m = (vm_page_t)addr; 5909 db_printf( 5910 "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n" 5911 " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", 5912 m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, 5913 m->a.queue, m->ref_count, m->a.flags, m->oflags, 5914 m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty); 5915 } 5916 #endif /* DDB */ 5917