/*-
 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Resident memory management module.
 */

#include <sys/cdefs.h>
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/domainset.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sleepqueue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_domainset.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_pager.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>
#include <vm/uma_int.h>

#include <machine/md_var.h>

struct vm_domain vm_dom[MAXMEMDOM];

DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);

struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];

struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
/* The following fields are protected by the domainset lock. */
domainset_t __exclusive_cache_line vm_min_domains;
domainset_t __exclusive_cache_line vm_severe_domains;
static int vm_min_waiters;
static int vm_severe_waiters;
static int vm_pageproc_waiters;

static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM page statistics");

static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries,
    CTLFLAG_RD, &pqstate_commit_retries,
    "Number of failed per-page atomic queue state updates");

static COUNTER_U64_DEFINE_EARLY(queue_ops);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops,
    CTLFLAG_RD, &queue_ops,
    "Number of batched queue operations");

static COUNTER_U64_DEFINE_EARLY(queue_nops);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops,
    CTLFLAG_RD, &queue_nops,
    "Number of batched queue operations with no effects");

/*
 * bogus page -- for I/O to/from partially complete buffers,
 * or for paging into sparsely invalid regions.
 */
vm_page_t bogus_page;

vm_page_t vm_page_array;
long vm_page_array_size;
long first_page;

struct bitset *vm_page_dump;
long vm_page_dump_pages;

static TAILQ_HEAD(, vm_page) blacklist_head;
static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");

static uma_zone_t fakepg_zone;

static void vm_page_alloc_check(vm_page_t m);
static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
    vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(vm_page_t m, uint8_t queue);
static bool vm_page_free_prep(vm_page_t m);
static void vm_page_free_toq(vm_page_t m);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
    vm_pindex_t pindex, vm_page_t mpred);
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
    vm_page_t mpred);
static void vm_page_mvqueue(vm_page_t m, const uint8_t queue,
    const uint16_t nflag);
static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
    vm_page_t m_run, vm_paddr_t high);
static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse);
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
    int req);
static int vm_page_zone_import(void *arg, void **store, int cnt, int domain,
    int flags);
static void vm_page_zone_release(void *arg, void **store, int cnt);

SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);

static void
vm_page_init(void *dummy)
{

        fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
            NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
        bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED);
}

static int pgcache_zone_max_pcpu;
SYSCTL_INT(_vm, OID_AUTO, pgcache_zone_max_pcpu,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pgcache_zone_max_pcpu, 0,
    "Per-CPU page cache size");

/*
 * The cache page zone is initialized later since we need to be able to
 * allocate pages before UMA is fully initialized.
 */
static void
vm_page_init_cache_zones(void *dummy __unused)
{
        struct vm_domain *vmd;
        struct vm_pgcache *pgcache;
        int cache, domain, maxcache, pool;

        TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &pgcache_zone_max_pcpu);
        maxcache = pgcache_zone_max_pcpu * mp_ncpus;
        for (domain = 0; domain < vm_ndomains; domain++) {
                vmd = VM_DOMAIN(domain);
                for (pool = 0; pool < VM_NFREEPOOL; pool++) {
                        pgcache = &vmd->vmd_pgcache[pool];
                        pgcache->domain = domain;
                        pgcache->pool = pool;
                        pgcache->zone = uma_zcache_create("vm pgcache",
                            PAGE_SIZE, NULL, NULL, NULL, NULL,
                            vm_page_zone_import, vm_page_zone_release, pgcache,
                            UMA_ZONE_VM);

                        /*
                         * Limit each pool's zone to 0.1% of the pages in the
                         * domain.
                         */
                        cache = maxcache != 0 ? maxcache :
                            vmd->vmd_page_count / 1000;
                        uma_zone_set_maxcache(pgcache->zone, cache);
                }
        }
}
SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
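/*
 * Worked example (illustrative, not part of the original source): with the
 * "vm.pgcache_zone_max_pcpu" tunable left at 0, each pool's cache zone above
 * is capped at vmd_page_count / 1000, i.e. roughly 0.1% of the domain's
 * pages.  On a hypothetical single-domain machine with 16 GB of RAM and
 * 4 KB pages (about 4,194,304 pages) that is a cap of about 4,194 cached
 * pages (~16 MB) per pool.  Setting, for example,
 *
 *      vm.pgcache_zone_max_pcpu=64
 *
 * in loader.conf would instead cap each zone at 64 * mp_ncpus items.
 */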

/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
CTASSERT(sizeof(u_long) >= 8);
#endif
#endif

/*
 * vm_set_page_size:
 *
 *      Sets the page size, perhaps based upon the memory
 *      size.  Must be called before any use of page-size
 *      dependent functions.
 */
void
vm_set_page_size(void)
{
        if (vm_cnt.v_page_size == 0)
                vm_cnt.v_page_size = PAGE_SIZE;
        if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
                panic("vm_set_page_size: page size not a power of two");
}

/*
 * vm_page_blacklist_next:
 *
 *      Find the next entry in the provided string of blacklist
 *      addresses.  Entries are separated by space, comma, or newline.
 *      If an invalid integer is encountered then the rest of the
 *      string is skipped.  Updates the list pointer to the next
 *      character, or NULL if the string is exhausted or invalid.
 */
static vm_paddr_t
vm_page_blacklist_next(char **list, char *end)
{
        vm_paddr_t bad;
        char *cp, *pos;

        if (list == NULL || *list == NULL)
                return (0);
        if (**list == '\0') {
                *list = NULL;
                return (0);
        }

        /*
         * If there's no end pointer then the buffer is coming from
         * the kenv and we know it's null-terminated.
         */
        if (end == NULL)
                end = *list + strlen(*list);

        /* Ensure that strtoq() won't walk off the end */
        if (*end != '\0') {
                if (*end == '\n' || *end == ' ' || *end == ',')
                        *end = '\0';
                else {
                        printf("Blacklist not terminated, skipping\n");
                        *list = NULL;
                        return (0);
                }
        }

        for (pos = *list; *pos != '\0'; pos = cp) {
                bad = strtoq(pos, &cp, 0);
                if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
                        if (bad == 0) {
                                if (++cp < end)
                                        continue;
                                else
                                        break;
                        }
                } else
                        break;
                if (*cp == '\0' || ++cp >= end)
                        *list = NULL;
                else
                        *list = cp;
                return (trunc_page(bad));
        }
        printf("Garbage in RAM blacklist, skipping\n");
        *list = NULL;
        return (0);
}

bool
vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
{
        struct vm_domain *vmd;
        vm_page_t m;
        bool found;

        m = vm_phys_paddr_to_vm_page(pa);
        if (m == NULL)
                return (true); /* page does not exist, no failure */

        vmd = vm_pagequeue_domain(m);
        vm_domain_free_lock(vmd);
        found = vm_phys_unfree_page(m);
        vm_domain_free_unlock(vmd);
        if (found) {
                vm_domain_freecnt_inc(vmd, -1);
                TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
                if (verbose)
                        printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
        }
        return (found);
}

/*
 * vm_page_blacklist_check:
 *
 *      Iterate through the provided string of blacklist addresses, pulling
 *      each entry out of the physical allocator free list and putting it
 *      onto a list for reporting via the vm.page_blacklist sysctl.
 */
static void
vm_page_blacklist_check(char *list, char *end)
{
        vm_paddr_t pa;
        char *next;

        next = list;
        while (next != NULL) {
                if ((pa = vm_page_blacklist_next(&next, end)) == 0)
                        continue;
                vm_page_blacklist_add(pa, bootverbose);
        }
}
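/*
 * Illustrative example (not part of the original source): the blacklist
 * parsed above is a plain list of physical addresses separated by spaces,
 * commas, or newlines, e.g. a hypothetical loader.conf entry
 *
 *      vm.blacklist="0x7d4e000,0x7d50000 0x12345000"
 *
 * Each address is truncated to a page boundary by trunc_page() before the
 * page is pulled from the free lists, so on a PAGE_SIZE == 4096 kernel
 * 0x12345678 and 0x12345000 would blacklist the same page.
 */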

/*
 * vm_page_blacklist_load:
 *
 *      Search for a special module named "ram_blacklist".  It'll be a
 *      plain text file provided by the user via the loader directive
 *      of the same name.
 */
static void
vm_page_blacklist_load(char **list, char **end)
{
        void *mod;
        u_char *ptr;
        u_int len;

        mod = NULL;
        ptr = NULL;

        mod = preload_search_by_type("ram_blacklist");
        if (mod != NULL) {
                ptr = preload_fetch_addr(mod);
                len = preload_fetch_size(mod);
        }
        *list = ptr;
        if (ptr != NULL)
                *end = ptr + len;
        else
                *end = NULL;
        return;
}

static int
sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
{
        vm_page_t m;
        struct sbuf sbuf;
        int error, first;

        first = 1;
        error = sysctl_wire_old_buffer(req, 0);
        if (error != 0)
                return (error);
        sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
        TAILQ_FOREACH(m, &blacklist_head, listq) {
                sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
                    (uintmax_t)m->phys_addr);
                first = 0;
        }
        error = sbuf_finish(&sbuf);
        sbuf_delete(&sbuf);
        return (error);
}

/*
 * Initialize a dummy page for use in scans of the specified paging queue.
 * In principle, this function only needs to set the flag PG_MARKER.
 * Nonetheless, it write busies the page as a safety precaution.
 */
void
vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags)
{

        bzero(marker, sizeof(*marker));
        marker->flags = PG_MARKER;
        marker->a.flags = aflags;
        marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
        marker->a.queue = queue;
}

static void
vm_page_domain_init(int domain)
{
        struct vm_domain *vmd;
        struct vm_pagequeue *pq;
        int i;

        vmd = VM_DOMAIN(domain);
        bzero(vmd, sizeof(*vmd));
        *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
            "vm inactive pagequeue";
        *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
            "vm active pagequeue";
        *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
            "vm laundry pagequeue";
        *__DECONST(const char **,
            &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
            "vm unswappable pagequeue";
        vmd->vmd_domain = domain;
        vmd->vmd_page_count = 0;
        vmd->vmd_free_count = 0;
        vmd->vmd_segs = 0;
        vmd->vmd_oom = FALSE;
        for (i = 0; i < PQ_COUNT; i++) {
                pq = &vmd->vmd_pagequeues[i];
                TAILQ_INIT(&pq->pq_pl);
                mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
                    MTX_DEF | MTX_DUPOK);
                pq->pq_pdpages = 0;
                vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
        }
        mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
        mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
        snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);

        /*
         * inacthead is used to provide FIFO ordering for LRU-bypassing
         * insertions.
         */
        vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
        TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
            &vmd->vmd_inacthead, plinks.q);

        /*
         * The clock pages are used to implement active queue scanning without
         * requeues.  Scans start at clock[0], which is advanced after the scan
         * ends.  When the two clock hands meet, they are reset and scanning
         * resumes from the head of the queue.
         */
        vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
        vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
        TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
            &vmd->vmd_clock[0], plinks.q);
        TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
            &vmd->vmd_clock[1], plinks.q);
}
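/*
 * Usage sketch (illustrative, not part of the original source): the markers
 * initialized above let a queue scan drop the page queue lock without losing
 * its place.  The general pattern looks roughly like:
 *
 *      vm_pagequeue_lock(pq);
 *      TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
 *      vm_pagequeue_unlock(pq);
 *      ... do work that may block ...
 *      vm_pagequeue_lock(pq);
 *      m = TAILQ_NEXT(&marker, plinks.q);      /* resume after the marker */
 *      TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
 *
 * Because the marker has PG_MARKER set, other scans skip over it.
 */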

/*
 * Initialize a physical page in preparation for adding it to the free
 * lists.
 */
void
vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
{

        m->object = NULL;
        m->ref_count = 0;
        m->busy_lock = VPB_FREED;
        m->flags = m->a.flags = 0;
        m->phys_addr = pa;
        m->a.queue = PQ_NONE;
        m->psind = 0;
        m->segind = segind;
        m->order = VM_NFREEORDER;
        m->pool = VM_FREEPOOL_DEFAULT;
        m->valid = m->dirty = 0;
        pmap_page_init(m);
}

#ifndef PMAP_HAS_PAGE_ARRAY
static vm_paddr_t
vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range)
{
        vm_paddr_t new_end;

        /*
         * Reserve an unmapped guard page to trap access to vm_page_array[-1].
         * However, because this page is allocated from KVM, out-of-bounds
         * accesses using the direct map will not be trapped.
         */
        *vaddr += PAGE_SIZE;

        /*
         * Allocate physical memory for the page structures, and map it.
         */
        new_end = trunc_page(end - page_range * sizeof(struct vm_page));
        vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end,
            VM_PROT_READ | VM_PROT_WRITE);
        vm_page_array_size = page_range;

        return (new_end);
}
#endif

/*
 * vm_page_startup:
 *
 *      Initializes the resident memory module.  Allocates physical memory for
 *      bootstrapping UMA and some data structures that are used to manage
 *      physical pages.  Initializes these structures, and populates the free
 *      page queues.
 */
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
        struct vm_phys_seg *seg;
        struct vm_domain *vmd;
        vm_page_t m;
        char *list, *listend;
        vm_paddr_t end, high_avail, low_avail, new_end, size;
        vm_paddr_t page_range __unused;
        vm_paddr_t last_pa, pa, startp, endp;
        u_long pagecount;
#if MINIDUMP_PAGE_TRACKING
        u_long vm_page_dump_size;
#endif
        int biggestone, i, segind;
#ifdef WITNESS
        vm_offset_t mapped;
        int witness_size;
#endif
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
        long ii;
#endif

        vaddr = round_page(vaddr);

        vm_phys_early_startup();
        biggestone = vm_phys_avail_largest();
        end = phys_avail[biggestone+1];

        /*
         * Initialize the page and queue locks.
         */
        mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF);
        for (i = 0; i < PA_LOCK_COUNT; i++)
                mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
        for (i = 0; i < vm_ndomains; i++)
                vm_page_domain_init(i);

        new_end = end;
#ifdef WITNESS
        witness_size = round_page(witness_startup_count());
        new_end -= witness_size;
        mapped = pmap_map(&vaddr, new_end, new_end + witness_size,
            VM_PROT_READ | VM_PROT_WRITE);
        bzero((void *)mapped, witness_size);
        witness_startup((void *)mapped);
#endif

#if MINIDUMP_PAGE_TRACKING
        /*
         * Allocate a bitmap to indicate that a random physical page
         * needs to be included in a minidump.
         *
         * The amd64 port needs this to indicate which direct map pages
         * need to be dumped, via calls to dump_add_page()/dump_drop_page().
         *
         * However, i386 still needs this workspace internally within the
         * minidump code.  In theory, they are not needed on i386, but are
         * included should the sf_buf code decide to use them.
         */
        last_pa = 0;
        vm_page_dump_pages = 0;
        for (i = 0; dump_avail[i + 1] != 0; i += 2) {
                vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) -
                    dump_avail[i] / PAGE_SIZE;
                if (dump_avail[i + 1] > last_pa)
                        last_pa = dump_avail[i + 1];
        }
        vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages));
        new_end -= vm_page_dump_size;
        vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
            new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
        bzero((void *)vm_page_dump, vm_page_dump_size);
#if MINIDUMP_STARTUP_PAGE_TRACKING
        /*
         * Include the UMA bootstrap pages, witness pages and vm_page_dump
         * in a crash dump.  When pmap_map() uses the direct map, they are
         * not automatically included.
         */
        for (pa = new_end; pa < end; pa += PAGE_SIZE)
                dump_add_page(pa);
#endif
#else
        (void)last_pa;
#endif
        phys_avail[biggestone + 1] = new_end;
#ifdef __amd64__
        /*
         * Request that the physical pages underlying the message buffer be
         * included in a crash dump.  Since the message buffer is accessed
         * through the direct map, they are not automatically included.
         */
        pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
        last_pa = pa + round_page(msgbufsize);
        while (pa < last_pa) {
                dump_add_page(pa);
                pa += PAGE_SIZE;
        }
#endif
        /*
         * Compute the number of pages of memory that will be available for
         * use, taking into account the overhead of a page structure per page.
         * In other words, solve
         *      "available physical memory" - round_page(page_range *
         *          sizeof(struct vm_page)) = page_range * PAGE_SIZE
         * for page_range.
         */
        low_avail = phys_avail[0];
        high_avail = phys_avail[1];
        for (i = 0; i < vm_phys_nsegs; i++) {
                if (vm_phys_segs[i].start < low_avail)
                        low_avail = vm_phys_segs[i].start;
                if (vm_phys_segs[i].end > high_avail)
                        high_avail = vm_phys_segs[i].end;
        }
        /* Skip the first chunk.  It is already accounted for. */
        for (i = 2; phys_avail[i + 1] != 0; i += 2) {
                if (phys_avail[i] < low_avail)
                        low_avail = phys_avail[i];
                if (phys_avail[i + 1] > high_avail)
                        high_avail = phys_avail[i + 1];
        }
        first_page = low_avail / PAGE_SIZE;
#ifdef VM_PHYSSEG_SPARSE
        size = 0;
        for (i = 0; i < vm_phys_nsegs; i++)
                size += vm_phys_segs[i].end - vm_phys_segs[i].start;
        for (i = 0; phys_avail[i + 1] != 0; i += 2)
                size += phys_avail[i + 1] - phys_avail[i];
#elif defined(VM_PHYSSEG_DENSE)
        size = high_avail - low_avail;
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif

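        /*
         * Worked example (illustrative, with assumed numbers): ignoring
         * rounding, the equation above is solved below as
         *      page_range = size / (PAGE_SIZE + sizeof(struct vm_page))
         * If size were 1 GB (1073741824 bytes), PAGE_SIZE 4096 and
         * sizeof(struct vm_page) 104, page_range would be about
         * 1073741824 / 4200 = 255652 usable pages, with roughly 25 MB of
         * the chunk consumed by the page structures themselves.
         */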
#ifdef PMAP_HAS_PAGE_ARRAY
        pmap_page_array_startup(size / PAGE_SIZE);
        biggestone = vm_phys_avail_largest();
        end = new_end = phys_avail[biggestone + 1];
#else
#ifdef VM_PHYSSEG_DENSE
        /*
         * In the VM_PHYSSEG_DENSE case, the number of pages can account for
         * the overhead of a page structure per page only if vm_page_array is
         * allocated from the last physical memory chunk.  Otherwise, we must
         * allocate page structures representing the physical memory
         * underlying vm_page_array, even though they will not be used.
         */
        if (new_end != high_avail)
                page_range = size / PAGE_SIZE;
        else
#endif
        {
                page_range = size / (PAGE_SIZE + sizeof(struct vm_page));

                /*
                 * If the partial bytes remaining are large enough for
                 * a page (PAGE_SIZE) without a corresponding
                 * 'struct vm_page', then new_end will contain an
                 * extra page after subtracting the length of the VM
                 * page array.  Compensate by subtracting an extra
                 * page from new_end.
                 */
                if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
                        if (new_end == high_avail)
                                high_avail -= PAGE_SIZE;
                        new_end -= PAGE_SIZE;
                }
        }
        end = new_end;
        new_end = vm_page_array_alloc(&vaddr, end, page_range);
#endif

#if VM_NRESERVLEVEL > 0
        /*
         * Allocate physical memory for the reservation management system's
         * data structures, and map it.
         */
        new_end = vm_reserv_startup(&vaddr, new_end);
#endif
#if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING
        /*
         * Include vm_page_array and vm_reserv_array in a crash dump.
         */
        for (pa = new_end; pa < end; pa += PAGE_SIZE)
                dump_add_page(pa);
#endif
        phys_avail[biggestone + 1] = new_end;

        /*
         * Add physical memory segments corresponding to the available
         * physical pages.
         */
        for (i = 0; phys_avail[i + 1] != 0; i += 2)
                if (vm_phys_avail_size(i) != 0)
                        vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);

        /*
         * Initialize the physical memory allocator.
         */
        vm_phys_init();

        /*
         * Initialize the page structures and add every available page to the
         * physical memory allocator's free lists.
         */
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
        for (ii = 0; ii < vm_page_array_size; ii++) {
                m = &vm_page_array[ii];
                vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0);
                m->flags = PG_FICTITIOUS;
        }
#endif
        vm_cnt.v_page_count = 0;
        for (segind = 0; segind < vm_phys_nsegs; segind++) {
                seg = &vm_phys_segs[segind];
                for (m = seg->first_page, pa = seg->start; pa < seg->end;
                    m++, pa += PAGE_SIZE)
                        vm_page_init_page(m, pa, segind);

                /*
                 * Add the segment's pages that are covered by one of
                 * phys_avail's ranges to the free lists.
                 */
                for (i = 0; phys_avail[i + 1] != 0; i += 2) {
                        if (seg->end <= phys_avail[i] ||
                            seg->start >= phys_avail[i + 1])
                                continue;

                        startp = MAX(seg->start, phys_avail[i]);
                        endp = MIN(seg->end, phys_avail[i + 1]);
                        pagecount = (u_long)atop(endp - startp);
                        if (pagecount == 0)
                                continue;

                        m = seg->first_page + atop(startp - seg->start);
                        vmd = VM_DOMAIN(seg->domain);
                        vm_domain_free_lock(vmd);
                        vm_phys_enqueue_contig(m, pagecount);
                        vm_domain_free_unlock(vmd);
                        vm_domain_freecnt_inc(vmd, pagecount);
                        vm_cnt.v_page_count += (u_int)pagecount;
                        vmd->vmd_page_count += (u_int)pagecount;
                        vmd->vmd_segs |= 1UL << segind;
                }
        }

        /*
         * Remove blacklisted pages from the physical memory allocator.
         */
        TAILQ_INIT(&blacklist_head);
        vm_page_blacklist_load(&list, &listend);
        vm_page_blacklist_check(list, listend);

        list = kern_getenv("vm.blacklist");
        vm_page_blacklist_check(list, NULL);

        freeenv(list);
#if VM_NRESERVLEVEL > 0
        /*
         * Initialize the reservation management system.
         */
        vm_reserv_init();
#endif

        return (vaddr);
}

void
vm_page_reference(vm_page_t m)
{

        vm_page_aflag_set(m, PGA_REFERENCED);
}

/*
 * vm_page_trybusy
 *
 *      Helper routine for grab functions to trylock busy.
 *
 *      Returns true on success and false on failure.
 */
static bool
vm_page_trybusy(vm_page_t m, int allocflags)
{

        if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0)
                return (vm_page_trysbusy(m));
        else
                return (vm_page_tryxbusy(m));
}

/*
 * vm_page_tryacquire
 *
 *      Helper routine for grab functions to trylock busy and wire.
 *
 *      Returns true on success and false on failure.
 */
static inline bool
vm_page_tryacquire(vm_page_t m, int allocflags)
{
        bool locked;

        locked = vm_page_trybusy(m, allocflags);
        if (locked && (allocflags & VM_ALLOC_WIRED) != 0)
                vm_page_wire(m);
        return (locked);
}

/*
 * vm_page_busy_acquire:
 *
 *      Acquire the busy lock as described by VM_ALLOC_* flags.  Will loop
 *      and drop the object lock if necessary.
 */
bool
vm_page_busy_acquire(vm_page_t m, int allocflags)
{
        vm_object_t obj;
        bool locked;

        /*
         * The page-specific object must be cached because page
         * identity can change during the sleep, causing the
         * re-lock of a different object.
         * It is assumed that a reference to the object is already
         * held by the callers.
         */
        obj = atomic_load_ptr(&m->object);
        for (;;) {
                if (vm_page_tryacquire(m, allocflags))
                        return (true);
                if ((allocflags & VM_ALLOC_NOWAIT) != 0)
                        return (false);
                if (obj != NULL)
                        locked = VM_OBJECT_WOWNED(obj);
                else
                        locked = false;
                MPASS(locked || vm_page_wired(m));
                if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags,
                    locked) && locked)
                        VM_OBJECT_WLOCK(obj);
                if ((allocflags & VM_ALLOC_WAITFAIL) != 0)
                        return (false);
                KASSERT(m->object == obj || m->object == NULL,
                    ("vm_page_busy_acquire: page %p does not belong to %p",
                    m, obj));
        }
}

/*
 * vm_page_busy_downgrade:
 *
 *      Downgrade an exclusive busy page into a single shared busy page.
 */
void
vm_page_busy_downgrade(vm_page_t m)
{
        u_int x;

        vm_page_assert_xbusied(m);

        x = vm_page_busy_fetch(m);
        for (;;) {
                if (atomic_fcmpset_rel_int(&m->busy_lock,
                    &x, VPB_SHARERS_WORD(1)))
                        break;
        }
        if ((x & VPB_BIT_WAITERS) != 0)
                wakeup(m);
}

/*
 * vm_page_busy_tryupgrade:
 *
 *      Attempt to upgrade a single shared busy into an exclusive busy.
 */
int
vm_page_busy_tryupgrade(vm_page_t m)
{
        u_int ce, x;

        vm_page_assert_sbusied(m);

        x = vm_page_busy_fetch(m);
        ce = VPB_CURTHREAD_EXCLUSIVE;
        for (;;) {
                if (VPB_SHARERS(x) > 1)
                        return (0);
                KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
                    ("vm_page_busy_tryupgrade: invalid lock state"));
                if (!atomic_fcmpset_acq_int(&m->busy_lock, &x,
                    ce | (x & VPB_BIT_WAITERS)))
                        continue;
                return (1);
        }
}

/*
 * vm_page_sbusied:
 *
 *      Return a positive value if the page is shared busied, 0 otherwise.
 */
int
vm_page_sbusied(vm_page_t m)
{
        u_int x;

        x = vm_page_busy_fetch(m);
        return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
}
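/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * that holds the object write lock and wants exclusive access to a resident
 * page might do, roughly:
 *
 *      VM_OBJECT_WLOCK(object);
 *      m = vm_page_lookup(object, pindex);
 *      if (m != NULL && vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL)) {
 *              ... operate on the exclusively busied page ...
 *              vm_page_xunbusy(m);
 *      }
 *      VM_OBJECT_WUNLOCK(object);
 *
 * With VM_ALLOC_WAITFAIL the acquire may sleep, drop and retake the object
 * lock, and return false so that the caller can revalidate its state.
 */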

/*
 * vm_page_sunbusy:
 *
 *      Shared unbusy a page.
 */
void
vm_page_sunbusy(vm_page_t m)
{
        u_int x;

        vm_page_assert_sbusied(m);

        x = vm_page_busy_fetch(m);
        for (;;) {
                KASSERT(x != VPB_FREED,
                    ("vm_page_sunbusy: Unlocking freed page."));
                if (VPB_SHARERS(x) > 1) {
                        if (atomic_fcmpset_int(&m->busy_lock, &x,
                            x - VPB_ONE_SHARER))
                                break;
                        continue;
                }
                KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1),
                    ("vm_page_sunbusy: invalid lock state"));
                if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
                        continue;
                if ((x & VPB_BIT_WAITERS) == 0)
                        break;
                wakeup(m);
                break;
        }
}

/*
 * vm_page_busy_sleep:
 *
 *      Sleep if the page is busy, using the page pointer as wchan.
 *      This is used to implement the hard-path of the busying mechanism.
 *
 *      If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
 *      will not sleep if the page is shared-busy.
 *
 *      The object lock must be held on entry.
 *
 *      Returns true if it slept and dropped the object lock, or false
 *      if there was no sleep and the lock is still held.
 */
bool
vm_page_busy_sleep(vm_page_t m, const char *wmesg, int allocflags)
{
        vm_object_t obj;

        obj = m->object;
        VM_OBJECT_ASSERT_LOCKED(obj);

        return (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags,
            true));
}

/*
 * vm_page_busy_sleep_unlocked:
 *
 *      Sleep if the page is busy, using the page pointer as wchan.
 *      This is used to implement the hard-path of the busying mechanism.
 *
 *      If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function
 *      will not sleep if the page is shared-busy.
 *
 *      The object lock must not be held on entry.  The operation will
 *      return if the page changes identity.
 */
void
vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
    const char *wmesg, int allocflags)
{
        VM_OBJECT_ASSERT_UNLOCKED(obj);

        (void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false);
}

/*
 * _vm_page_busy_sleep:
 *
 *      Internal busy sleep function.  Verifies the page identity and
 *      lockstate against parameters.  Returns true if it sleeps and
 *      false otherwise.
 *
 *      allocflags uses VM_ALLOC_* flags to specify the lock required.
 *
 *      If locked is true the lock will be dropped for any true returns
 *      and held for any false returns.
 */
static bool
_vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex,
    const char *wmesg, int allocflags, bool locked)
{
        bool xsleep;
        u_int x;

        /*
         * If the object is busy we must wait for that to drain to zero
         * before trying the page again.
         */
        if (obj != NULL && vm_object_busied(obj)) {
                if (locked)
                        VM_OBJECT_DROP(obj);
                vm_object_busy_wait(obj, wmesg);
                return (true);
        }

        if (!vm_page_busied(m))
                return (false);

        xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0;
        sleepq_lock(m);
        x = vm_page_busy_fetch(m);
        do {
                /*
                 * If the page changes objects or becomes unlocked we can
                 * simply return.
                 */
                if (x == VPB_UNBUSIED ||
                    (xsleep && (x & VPB_BIT_SHARED) != 0) ||
                    m->object != obj || m->pindex != pindex) {
                        sleepq_release(m);
                        return (false);
                }
                if ((x & VPB_BIT_WAITERS) != 0)
                        break;
        } while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS));
        if (locked)
                VM_OBJECT_DROP(obj);
        DROP_GIANT();
        sleepq_add(m, NULL, wmesg, 0, 0);
        sleepq_wait(m, PVM);
        PICKUP_GIANT();
        return (true);
}

/*
 * vm_page_trysbusy:
 *
 *      Try to shared busy a page.
 *      If the operation succeeds 1 is returned otherwise 0.
 *      The operation never sleeps.
 */
int
vm_page_trysbusy(vm_page_t m)
{
        vm_object_t obj;
        u_int x;

        obj = m->object;
        x = vm_page_busy_fetch(m);
        for (;;) {
                if ((x & VPB_BIT_SHARED) == 0)
                        return (0);
                /*
                 * Reduce the window for transient busies that will trigger
                 * false negatives in vm_page_ps_test().
                 */
                if (obj != NULL && vm_object_busied(obj))
                        return (0);
                if (atomic_fcmpset_acq_int(&m->busy_lock, &x,
                    x + VPB_ONE_SHARER))
                        break;
        }

        /* Refetch the object now that we're guaranteed that it is stable. */
        obj = m->object;
        if (obj != NULL && vm_object_busied(obj)) {
                vm_page_sunbusy(m);
                return (0);
        }
        return (1);
}

/*
 * vm_page_tryxbusy:
 *
 *      Try to exclusive busy a page.
 *      If the operation succeeds 1 is returned otherwise 0.
 *      The operation never sleeps.
 */
int
vm_page_tryxbusy(vm_page_t m)
{
        vm_object_t obj;

        if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,
            VPB_CURTHREAD_EXCLUSIVE) == 0)
                return (0);

        obj = m->object;
        if (obj != NULL && vm_object_busied(obj)) {
                vm_page_xunbusy(m);
                return (0);
        }
        return (1);
}

static void
vm_page_xunbusy_hard_tail(vm_page_t m)
{
        atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
        /* Wake the waiter. */
        wakeup(m);
}

/*
 * vm_page_xunbusy_hard:
 *
 *      Called when unbusy has failed because there is a waiter.
 */
void
vm_page_xunbusy_hard(vm_page_t m)
{
        vm_page_assert_xbusied(m);
        vm_page_xunbusy_hard_tail(m);
}

void
vm_page_xunbusy_hard_unchecked(vm_page_t m)
{
        vm_page_assert_xbusied_unchecked(m);
        vm_page_xunbusy_hard_tail(m);
}

static void
vm_page_busy_free(vm_page_t m)
{
        u_int x;

        atomic_thread_fence_rel();
        x = atomic_swap_int(&m->busy_lock, VPB_FREED);
        if ((x & VPB_BIT_WAITERS) != 0)
                wakeup(m);
}

/*
 * vm_page_unhold_pages:
 *
 *      Unhold each of the pages that is referenced by the given array.
 */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{

        for (; count != 0; count--) {
                vm_page_unwire(*ma, PQ_ACTIVE);
                ma++;
        }
}

vm_page_t
PHYS_TO_VM_PAGE(vm_paddr_t pa)
{
        vm_page_t m;

#ifdef VM_PHYSSEG_SPARSE
        m = vm_phys_paddr_to_vm_page(pa);
        if (m == NULL)
                m = vm_phys_fictitious_to_vm_page(pa);
        return (m);
#elif defined(VM_PHYSSEG_DENSE)
        long pi;

        pi = atop(pa);
        if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
                m = &vm_page_array[pi - first_page];
                return (m);
        }
        return (vm_phys_fictitious_to_vm_page(pa));
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
}

/*
 * vm_page_getfake:
 *
 *      Create a fictitious page with the specified physical address and
 *      memory attribute.  The memory attribute is the only machine-
 *      dependent aspect of a fictitious page that must be initialized.
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
        vm_page_t m;

        m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
        vm_page_initfake(m, paddr, memattr);
        return (m);
}

void
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

        if ((m->flags & PG_FICTITIOUS) != 0) {
                /*
                 * The page's memattr might have changed since the
                 * previous initialization.  Update the pmap to the
                 * new memattr.
                 */
                goto memattr;
        }
        m->phys_addr = paddr;
        m->a.queue = PQ_NONE;
        /* Fictitious pages don't use "segind". */
        m->flags = PG_FICTITIOUS;
        /* Fictitious pages don't use "order" or "pool". */
        m->oflags = VPO_UNMANAGED;
        m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
        /* Fictitious pages are unevictable. */
        m->ref_count = 1;
        pmap_page_init(m);
memattr:
        pmap_page_set_memattr(m, memattr);
}

/*
 * vm_page_putfake:
 *
 *      Release a fictitious page.
 */
void
vm_page_putfake(vm_page_t m)
{

        KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
        KASSERT((m->flags & PG_FICTITIOUS) != 0,
            ("vm_page_putfake: bad page %p", m));
        vm_page_assert_xbusied(m);
        vm_page_busy_free(m);
        uma_zfree(fakepg_zone, m);
}

/*
 * vm_page_updatefake:
 *
 *      Update the given fictitious page to the specified physical address and
 *      memory attribute.
 */
void
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

        KASSERT((m->flags & PG_FICTITIOUS) != 0,
            ("vm_page_updatefake: bad page %p", m));
        m->phys_addr = paddr;
        pmap_page_set_memattr(m, memattr);
}

/*
 * vm_page_free:
 *
 *      Free a page.
 */
void
vm_page_free(vm_page_t m)
{

        m->flags &= ~PG_ZERO;
        vm_page_free_toq(m);
}

/*
 * vm_page_free_zero:
 *
 *      Free a page to the zeroed-pages queue.
 */
void
vm_page_free_zero(vm_page_t m)
{

        m->flags |= PG_ZERO;
        vm_page_free_toq(m);
}
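/*
 * Usage sketch (illustrative, not part of the original source): fictitious
 * pages are typically used to describe device memory that has no entry in
 * vm_page_array, for example in a device pager:
 *
 *      vm_page_t m;
 *
 *      m = vm_page_getfake(dev_paddr, VM_MEMATTR_UNCACHEABLE);
 *      ... hand the page to the consumer ...
 *      vm_page_putfake(m);
 *
 * "dev_paddr" is a hypothetical device physical address; the memory
 * attribute is the only knob the caller normally needs to choose.
 */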

/*
 * Unbusy and handle the page queueing for a page from a getpages request that
 * was optionally read ahead or behind.
 */
void
vm_page_readahead_finish(vm_page_t m)
{

        /* We shouldn't put invalid pages on queues. */
        KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m));

        /*
         * Since the page is not the actually needed one, whether it should
         * be activated or deactivated is not obvious.  Empirical results
         * have shown that deactivating the page is usually the best choice,
         * unless the page is wanted by another thread.
         */
        if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0)
                vm_page_activate(m);
        else
                vm_page_deactivate(m);
        vm_page_xunbusy_unchecked(m);
}

/*
 * Destroy the identity of an invalid page and free it if possible.
 * This is intended to be used when reading a page from backing store fails.
 */
void
vm_page_free_invalid(vm_page_t m)
{

        KASSERT(vm_page_none_valid(m), ("page %p is valid", m));
        KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m));
        KASSERT(m->object != NULL, ("page %p has no object", m));
        VM_OBJECT_ASSERT_WLOCKED(m->object);

        /*
         * We may be attempting to free the page as part of the handling for an
         * I/O error, in which case the page was xbusied by a different thread.
         */
        vm_page_xbusy_claim(m);

        /*
         * If someone has wired this page while the object lock
         * was not held, then the thread that unwires is responsible
         * for freeing the page.  Otherwise just free the page now.
         * The wire count of this unmapped page cannot change while
         * we have the page xbusy and the page's object wlocked.
         */
        if (vm_page_remove(m))
                vm_page_free(m);
}

/*
 * vm_page_dirty_KBI:           [ internal use only ]
 *
 *      Set all bits in the page's dirty field.
 *
 *      The object containing the specified page must be locked if the
 *      call is made from the machine-independent layer.
 *
 *      See vm_page_clear_dirty_mask().
 *
 *      This function should only be called by vm_page_dirty().
 */
void
vm_page_dirty_KBI(vm_page_t m)
{

        /* Refer to this operation by its public name. */
        KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
        m->dirty = VM_PAGE_BITS_ALL;
}

/*
 * Insert the given page into the given object at the given pindex.  mpred is
 * used for memq linkage.  From vm_page_insert, lookup is true, mpred is
 * initially NULL, and this procedure looks it up.  From vm_page_insert_after,
 * lookup is false and mpred is known to the caller to be valid, and may be
 * NULL if this will be the page with the lowest pindex.
 *
 * The procedure is marked __always_inline to suggest to the compiler to
 * eliminate the lookup parameter and the associated alternate branch.
 */
static __always_inline int
vm_page_insert_lookup(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mpred, bool lookup)
{
        int error;

        VM_OBJECT_ASSERT_WLOCKED(object);
        KASSERT(m->object == NULL,
            ("vm_page_insert: page %p already inserted", m));

        /*
         * Record the object/offset pair in this page.
         */
        m->object = object;
        m->pindex = pindex;
        m->ref_count |= VPRC_OBJREF;

        /*
         * Add this page to the object's radix tree, and look up mpred if
         * needed.
         */
        if (lookup)
                error = vm_radix_insert_lookup_lt(&object->rtree, m, &mpred);
        else
                error = vm_radix_insert(&object->rtree, m);
        if (__predict_false(error != 0)) {
                m->object = NULL;
                m->pindex = 0;
                m->ref_count &= ~VPRC_OBJREF;
                return (1);
        }

        /*
         * Now link into the object's ordered list of backed pages.
         */
        vm_page_insert_radixdone(m, object, mpred);
        vm_pager_page_inserted(object, m);
        return (0);
}

/*
 * vm_page_insert:              [ internal use only ]
 *
 *      Inserts the given mem entry into the object and object list.
 *
 *      The object must be locked.
 */
int
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
        return (vm_page_insert_lookup(m, object, pindex, NULL, true));
}

/*
 * vm_page_insert_after:
 *
 *      Inserts the page "m" into the specified object at offset "pindex".
 *
 *      The page "mpred" must immediately precede the offset "pindex" within
 *      the specified object.
 *
 *      The object must be locked.
 */
static int
vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mpred)
{
        return (vm_page_insert_lookup(m, object, pindex, mpred, false));
}

/*
 * vm_page_insert_radixdone:
 *
 *      Complete page "m" insertion into the specified object after the
 *      radix trie hooking.
 *
 *      The page "mpred" must precede the offset "m->pindex" within the
 *      specified object.
 *
 *      The object must be locked.
 */
static void
vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
{

        VM_OBJECT_ASSERT_WLOCKED(object);
        KASSERT(object != NULL && m->object == object,
            ("vm_page_insert_radixdone: page %p has inconsistent object", m));
        KASSERT((m->ref_count & VPRC_OBJREF) != 0,
            ("vm_page_insert_radixdone: page %p is missing object ref", m));
        if (mpred != NULL) {
                KASSERT(mpred->object == object,
                    ("vm_page_insert_radixdone: object doesn't contain mpred"));
                KASSERT(mpred->pindex < m->pindex,
                    ("vm_page_insert_radixdone: mpred doesn't precede pindex"));
                KASSERT(TAILQ_NEXT(mpred, listq) == NULL ||
                    m->pindex < TAILQ_NEXT(mpred, listq)->pindex,
                    ("vm_page_insert_radixdone: pindex doesn't precede msucc"));
        } else {
                KASSERT(TAILQ_EMPTY(&object->memq) ||
                    m->pindex < TAILQ_FIRST(&object->memq)->pindex,
                    ("vm_page_insert_radixdone: no mpred but not first page"));
        }

        if (mpred != NULL)
                TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
        else
                TAILQ_INSERT_HEAD(&object->memq, m, listq);

        /*
         * Show that the object has one more resident page.
         */
        object->resident_page_count++;

        /*
         * Hold the vnode until the last page is released.
         */
        if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
                vhold(object->handle);

        /*
         * Since we are inserting a new and possibly dirty page,
         * update the object's generation count.
         */
        if (pmap_page_is_write_mapped(m))
                vm_object_set_writeable_dirty(object);
}

/*
 * Do the work to remove a page from its object.  The caller is responsible for
 * updating the page's fields to reflect this removal.
 */
static void
vm_page_object_remove(vm_page_t m)
{
        vm_object_t object;
        vm_page_t mrem __diagused;

        vm_page_assert_xbusied(m);
        object = m->object;
        VM_OBJECT_ASSERT_WLOCKED(object);
        KASSERT((m->ref_count & VPRC_OBJREF) != 0,
            ("page %p is missing its object ref", m));

        /* Deferred free of swap space. */
        if ((m->a.flags & PGA_SWAP_FREE) != 0)
                vm_pager_page_unswapped(m);

        vm_pager_page_removed(object, m);

        m->object = NULL;
        mrem = vm_radix_remove(&object->rtree, m->pindex);
        KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));

        /*
         * Now remove from the object's list of backed pages.
         */
        TAILQ_REMOVE(&object->memq, m, listq);

        /*
         * And show that the object has one fewer resident page.
         */
        object->resident_page_count--;

        /*
         * The vnode may now be recycled.
         */
        if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
                vdrop(object->handle);
}

/*
 * vm_page_remove:
 *
 *      Removes the specified page from its containing object, but does not
 *      invalidate any backing storage.  Returns true if the object's reference
 *      was the last reference to the page, and false otherwise.
 *
 *      The object must be locked and the page must be exclusively busied.
 *      The exclusive busy will be released on return.  If this is not the
 *      final ref and the caller does not hold a wire reference it may not
 *      continue to access the page.
 */
bool
vm_page_remove(vm_page_t m)
{
        bool dropped;

        dropped = vm_page_remove_xbusy(m);
        vm_page_xunbusy(m);

        return (dropped);
}

/*
 * vm_page_remove_xbusy
 *
 *      Removes the page but leaves the xbusy held.  Returns true if this
 *      removed the final ref and false otherwise.
 */
bool
vm_page_remove_xbusy(vm_page_t m)
{

        vm_page_object_remove(m);
        return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
}

/*
 * vm_page_lookup:
 *
 *      Returns the page associated with the object/offset
 *      pair specified; if none is found, NULL is returned.
 *
 *      The object must be locked.
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{

        VM_OBJECT_ASSERT_LOCKED(object);
        return (vm_radix_lookup(&object->rtree, pindex));
}

/*
 * vm_page_lookup_unlocked:
 *
 *      Returns the page associated with the object/offset pair specified;
 *      if none is found, NULL is returned.  The page may no longer be
 *      present in the object at the time that this function returns.  Only
 *      useful for opportunistic checks such as inmem().
 */
vm_page_t
vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex)
{

        return (vm_radix_lookup_unlocked(&object->rtree, pindex));
}

/*
 * vm_page_relookup:
 *
 *      Returns a page that must already have been busied by
 *      the caller.  Used for bogus page replacement.
 */
vm_page_t
vm_page_relookup(vm_object_t object, vm_pindex_t pindex)
{
        vm_page_t m;

        m = vm_radix_lookup_unlocked(&object->rtree, pindex);
        KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) &&
            m->object == object && m->pindex == pindex,
            ("vm_page_relookup: Invalid page %p", m));
        return (m);
}
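/*
 * Usage sketch (illustrative, not part of the original source): removing a
 * resident page from an object combines the lookup, busy, and remove
 * primitives above, mirroring vm_page_free_invalid():
 *
 *      VM_OBJECT_WLOCK(object);
 *      m = vm_page_lookup(object, pindex);
 *      if (m != NULL && vm_page_tryxbusy(m) != 0) {
 *              if (vm_page_remove(m))          // releases the xbusy lock
 *                      vm_page_free(m);        // last reference: free it
 *      }
 *      VM_OBJECT_WUNLOCK(object);
 *
 * Callers that cannot tolerate the trybusy failing would instead use
 * vm_page_busy_acquire() and retry.
 */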

/*
 * This should only be used by lockless functions for releasing transient
 * incorrect acquires.  The page may have been freed after we acquired a
 * busy lock.  In this case busy_lock == VPB_FREED and we have nothing
 * further to do.
 */
static void
vm_page_busy_release(vm_page_t m)
{
        u_int x;

        x = vm_page_busy_fetch(m);
        for (;;) {
                if (x == VPB_FREED)
                        break;
                if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) {
                        if (atomic_fcmpset_int(&m->busy_lock, &x,
                            x - VPB_ONE_SHARER))
                                break;
                        continue;
                }
                KASSERT((x & VPB_BIT_SHARED) != 0 ||
                    (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE,
                    ("vm_page_busy_release: %p xbusy not owned.", m));
                if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED))
                        continue;
                if ((x & VPB_BIT_WAITERS) != 0)
                        wakeup(m);
                break;
        }
}

/*
 * vm_page_find_least:
 *
 *      Returns the page associated with the object with least pindex
 *      greater than or equal to the parameter pindex, or NULL.
 *
 *      The object must be locked.
 */
vm_page_t
vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
{
        vm_page_t m;

        VM_OBJECT_ASSERT_LOCKED(object);
        if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
                m = vm_radix_lookup_ge(&object->rtree, pindex);
        return (m);
}

/*
 * Returns the given page's successor (by pindex) within the object if it is
 * resident; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_next(vm_page_t m)
{
        vm_page_t next;

        VM_OBJECT_ASSERT_LOCKED(m->object);
        if ((next = TAILQ_NEXT(m, listq)) != NULL) {
                MPASS(next->object == m->object);
                if (next->pindex != m->pindex + 1)
                        next = NULL;
        }
        return (next);
}

/*
 * Returns the given page's predecessor (by pindex) within the object if it is
 * resident; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_prev(vm_page_t m)
{
        vm_page_t prev;

        VM_OBJECT_ASSERT_LOCKED(m->object);
        if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
                MPASS(prev->object == m->object);
                if (prev->pindex != m->pindex - 1)
                        prev = NULL;
        }
        return (prev);
}
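/*
 * Usage sketch (illustrative, not part of the original source): walking the
 * resident pages of a locked object within a pindex range can combine
 * vm_page_find_least() with the sorted memq, for example:
 *
 *      VM_OBJECT_ASSERT_LOCKED(object);
 *      for (m = vm_page_find_least(object, start);
 *          m != NULL && m->pindex < end;
 *          m = TAILQ_NEXT(m, listq)) {
 *              ... examine resident page m ...
 *      }
 *
 * "start" and "end" are hypothetical bounds; vm_page_next() could be used
 * instead when only physically consecutive pindexes are of interest.
 */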

/*
 * Uses the page mnew as a replacement for an existing page at index
 * pindex which must be already present in the object.
 *
 * Both pages must be exclusively busied on enter.  The old page is
 * unbusied on exit.
 *
 * A return value of true means mold is now free.  If this is not the
 * final ref and the caller does not hold a wire reference it may not
 * continue to access the page.
 */
static bool
vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mold)
{
        vm_page_t mret __diagused;
        bool dropped;

        VM_OBJECT_ASSERT_WLOCKED(object);
        vm_page_assert_xbusied(mold);
        KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0,
            ("vm_page_replace: page %p already in object", mnew));

        /*
         * This function mostly follows vm_page_insert() and
         * vm_page_remove() without the radix, object count and vnode
         * dance.  Double check such functions for more comments.
         */

        mnew->object = object;
        mnew->pindex = pindex;
        atomic_set_int(&mnew->ref_count, VPRC_OBJREF);
        mret = vm_radix_replace(&object->rtree, mnew);
        KASSERT(mret == mold,
            ("invalid page replacement, mold=%p, mret=%p", mold, mret));
        KASSERT((mold->oflags & VPO_UNMANAGED) ==
            (mnew->oflags & VPO_UNMANAGED),
            ("vm_page_replace: mismatched VPO_UNMANAGED"));

        /* Keep the resident page list in sorted order. */
        TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
        TAILQ_REMOVE(&object->memq, mold, listq);
        mold->object = NULL;

        /*
         * The object's resident_page_count does not change because we have
         * swapped one page for another, but the generation count should
         * change if the page is dirty.
         */
        if (pmap_page_is_write_mapped(mnew))
                vm_object_set_writeable_dirty(object);
        dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF;
        vm_page_xunbusy(mold);

        return (dropped);
}

void
vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mold)
{

        vm_page_assert_xbusied(mnew);

        if (vm_page_replace_hold(mnew, object, pindex, mold))
                vm_page_free(mold);
}

/*
 * vm_page_rename:
 *
 *      Move the given memory entry from its
 *      current object to the specified target object/offset.
 *
 *      Note: swap associated with the page must be invalidated by the move.
 *            We have to do this for several reasons: (1) we aren't freeing the
 *            page, (2) we are dirtying the page, (3) the VM system is probably
 *            moving the page from object A to B, and will then later move
 *            the backing store from A to B and we can't have a conflict.
 *
 *      Note: we *always* dirty the page.  It is necessary both for the
 *            fact that we moved it, and because we may be invalidating
 *            swap.
 *
 *      The objects must be locked.
 */
int
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
        vm_page_t mpred;
        vm_pindex_t opidx;

        VM_OBJECT_ASSERT_WLOCKED(new_object);

        KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m));

        /*
         * Create a custom version of vm_page_insert() which does not depend
         * on m_prev and can cheat on the implementation aspects of the
         * function.
         */
        opidx = m->pindex;
        m->pindex = new_pindex;
        if (vm_radix_insert_lookup_lt(&new_object->rtree, m, &mpred) != 0) {
                m->pindex = opidx;
                return (1);
        }

        /*
         * The operation cannot fail anymore.  The removal must happen before
         * the listq iterator is tainted.
         */
        m->pindex = opidx;
        vm_page_object_remove(m);

        /* Return back to the new pindex to complete vm_page_insert(). */
        m->pindex = new_pindex;
        m->object = new_object;

        vm_page_insert_radixdone(m, new_object, mpred);
        vm_page_dirty(m);
        vm_pager_page_inserted(new_object, m);
        return (0);
}
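/*
 * Usage sketch (illustrative, not part of the original source): replacing a
 * resident page with a new, already exclusively busied page "mnew" that
 * belongs to no object might look roughly like:
 *
 *      VM_OBJECT_WLOCK(object);
 *      mold = vm_page_lookup(object, pindex);
 *      vm_page_busy_acquire(mold, 0);          // xbusy the page being replaced
 *      pmap_copy_page(mold, mnew);             // preserve the old contents
 *      vm_page_replace(mnew, object, pindex, mold);
 *      VM_OBJECT_WUNLOCK(object);
 *
 * vm_page_replace() takes over mold's object reference, unbusies it and,
 * if that was the final reference, frees it.
 */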

/*
 * vm_page_alloc:
 *
 *      Allocate and return a page that is associated with the specified
 *      object and offset pair.  By default, this page is exclusive busied.
 *
 *      The caller must always specify an allocation class.
 *
 *      allocation classes:
 *      VM_ALLOC_NORMAL         normal process request
 *      VM_ALLOC_SYSTEM         system *really* needs a page
 *      VM_ALLOC_INTERRUPT      interrupt time request
 *
 *      optional allocation flags:
 *      VM_ALLOC_COUNT(number)  the number of additional pages that the caller
 *                              intends to allocate
 *      VM_ALLOC_NOBUSY         do not exclusive busy the page
 *      VM_ALLOC_NODUMP         do not include the page in a kernel core dump
 *      VM_ALLOC_SBUSY          shared busy the allocated page
 *      VM_ALLOC_WIRED          wire the allocated page
 *      VM_ALLOC_ZERO           prefer a zeroed page
 */
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{

        return (vm_page_alloc_after(object, pindex, req,
            vm_radix_lookup_le(&object->rtree, pindex)));
}

vm_page_t
vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain,
    int req)
{

        return (vm_page_alloc_domain_after(object, pindex, domain, req,
            vm_radix_lookup_le(&object->rtree, pindex)));
}

/*
 * Allocate a page in the specified object with the given page index.  To
 * optimize insertion of the page into the object, the caller must also specify
 * the resident page in the object with largest index smaller than the given
 * page index, or NULL if no such page exists.
 */
vm_page_t
vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
    int req, vm_page_t mpred)
{
        struct vm_domainset_iter di;
        vm_page_t m;
        int domain;

        vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
        do {
                m = vm_page_alloc_domain_after(object, pindex, domain, req,
                    mpred);
                if (m != NULL)
                        break;
        } while (vm_domainset_iter_page(&di, object, &domain) == 0);

        return (m);
}
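/*
 * Usage sketch (illustrative, not part of the original source): a typical
 * object-backed allocation combines an allocation class with optional
 * flags, for example:
 *
 *      VM_OBJECT_WLOCK(object);
 *      m = vm_page_alloc(object, pindex,
 *          VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 *      VM_OBJECT_WUNLOCK(object);
 *      if (m != NULL) {
 *              if ((m->flags & PG_ZERO) == 0)
 *                      pmap_zero_page(m);      // ZERO is a preference only
 *              vm_page_xunbusy(m);             // returned exclusive busied
 *      }
 *
 * With VM_ALLOC_NORMAL the call can still return NULL when the domain is
 * out of pages, so callers must handle allocation failure.
 */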
*/ 1998 if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || 1999 (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) 2000 vm_domain_set(vmd); 2001 2002 return (1); 2003 } 2004 2005 int 2006 vm_domain_allocate(struct vm_domain *vmd, int req, int npages) 2007 { 2008 int req_class; 2009 2010 /* 2011 * The page daemon is allowed to dig deeper into the free page list. 2012 */ 2013 req_class = req & VM_ALLOC_CLASS_MASK; 2014 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 2015 req_class = VM_ALLOC_SYSTEM; 2016 return (_vm_domain_allocate(vmd, req_class, npages)); 2017 } 2018 2019 vm_page_t 2020 vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, 2021 int req, vm_page_t mpred) 2022 { 2023 struct vm_domain *vmd; 2024 vm_page_t m; 2025 int flags; 2026 2027 #define VPA_FLAGS (VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL | \ 2028 VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY | \ 2029 VM_ALLOC_SBUSY | VM_ALLOC_WIRED | \ 2030 VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK) 2031 KASSERT((req & ~VPA_FLAGS) == 0, 2032 ("invalid request %#x", req)); 2033 KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != 2034 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), 2035 ("invalid request %#x", req)); 2036 KASSERT(mpred == NULL || mpred->pindex < pindex, 2037 ("mpred %p doesn't precede pindex 0x%jx", mpred, 2038 (uintmax_t)pindex)); 2039 VM_OBJECT_ASSERT_WLOCKED(object); 2040 2041 flags = 0; 2042 m = NULL; 2043 if (!vm_pager_can_alloc_page(object, pindex)) 2044 return (NULL); 2045 again: 2046 #if VM_NRESERVLEVEL > 0 2047 /* 2048 * Can we allocate the page from a reservation? 2049 */ 2050 if (vm_object_reserv(object) && 2051 (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) != 2052 NULL) { 2053 goto found; 2054 } 2055 #endif 2056 vmd = VM_DOMAIN(domain); 2057 if (vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone != NULL) { 2058 m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone, 2059 M_NOWAIT | M_NOVM); 2060 if (m != NULL) { 2061 flags |= PG_PCPU_CACHE; 2062 goto found; 2063 } 2064 } 2065 if (vm_domain_allocate(vmd, req, 1)) { 2066 /* 2067 * If not, allocate it from the free page queues. 2068 */ 2069 vm_domain_free_lock(vmd); 2070 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0); 2071 vm_domain_free_unlock(vmd); 2072 if (m == NULL) { 2073 vm_domain_freecnt_inc(vmd, 1); 2074 #if VM_NRESERVLEVEL > 0 2075 if (vm_reserv_reclaim_inactive(domain)) 2076 goto again; 2077 #endif 2078 } 2079 } 2080 if (m == NULL) { 2081 /* 2082 * Not allocatable, give up. 2083 */ 2084 if (vm_domain_alloc_fail(vmd, object, req)) 2085 goto again; 2086 return (NULL); 2087 } 2088 2089 /* 2090 * At this point we had better have found a good page. 2091 */ 2092 found: 2093 vm_page_dequeue(m); 2094 vm_page_alloc_check(m); 2095 2096 /* 2097 * Initialize the page. Only the PG_ZERO flag is inherited. 2098 */ 2099 flags |= m->flags & PG_ZERO; 2100 if ((req & VM_ALLOC_NODUMP) != 0) 2101 flags |= PG_NODUMP; 2102 m->flags = flags; 2103 m->a.flags = 0; 2104 m->oflags = (object->flags & OBJ_UNMANAGED) != 0 ? 
VPO_UNMANAGED : 0; 2105 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) 2106 m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; 2107 else if ((req & VM_ALLOC_SBUSY) != 0) 2108 m->busy_lock = VPB_SHARERS_WORD(1); 2109 else 2110 m->busy_lock = VPB_UNBUSIED; 2111 if (req & VM_ALLOC_WIRED) { 2112 vm_wire_add(1); 2113 m->ref_count = 1; 2114 } 2115 m->a.act_count = 0; 2116 2117 if (vm_page_insert_after(m, object, pindex, mpred)) { 2118 if (req & VM_ALLOC_WIRED) { 2119 vm_wire_sub(1); 2120 m->ref_count = 0; 2121 } 2122 KASSERT(m->object == NULL, ("page %p has object", m)); 2123 m->oflags = VPO_UNMANAGED; 2124 m->busy_lock = VPB_UNBUSIED; 2125 /* Don't change PG_ZERO. */ 2126 vm_page_free_toq(m); 2127 if (req & VM_ALLOC_WAITFAIL) { 2128 VM_OBJECT_WUNLOCK(object); 2129 vm_radix_wait(); 2130 VM_OBJECT_WLOCK(object); 2131 } 2132 return (NULL); 2133 } 2134 2135 /* Ignore device objects; the pager sets "memattr" for them. */ 2136 if (object->memattr != VM_MEMATTR_DEFAULT && 2137 (object->flags & OBJ_FICTITIOUS) == 0) 2138 pmap_page_set_memattr(m, object->memattr); 2139 2140 return (m); 2141 } 2142 2143 /* 2144 * vm_page_alloc_contig: 2145 * 2146 * Allocate a contiguous set of physical pages of the given size "npages" 2147 * from the free lists. All of the physical pages must be at or above 2148 * the given physical address "low" and below the given physical address 2149 * "high". The given value "alignment" determines the alignment of the 2150 * first physical page in the set. If the given value "boundary" is 2151 * non-zero, then the set of physical pages cannot cross any physical 2152 * address boundary that is a multiple of that value. Both "alignment" 2153 * and "boundary" must be a power of two. 2154 * 2155 * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, 2156 * then the memory attribute setting for the physical pages is configured 2157 * to the object's memory attribute setting. Otherwise, the memory 2158 * attribute setting for the physical pages is configured to "memattr", 2159 * overriding the object's memory attribute setting. However, if the 2160 * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the 2161 * memory attribute setting for the physical pages cannot be configured 2162 * to VM_MEMATTR_DEFAULT. 2163 * 2164 * The specified object may not contain fictitious pages. 2165 * 2166 * The caller must always specify an allocation class. 
2167 * 2168 * allocation classes: 2169 * VM_ALLOC_NORMAL normal process request 2170 * VM_ALLOC_SYSTEM system *really* needs a page 2171 * VM_ALLOC_INTERRUPT interrupt time request 2172 * 2173 * optional allocation flags: 2174 * VM_ALLOC_NOBUSY do not exclusive busy the page 2175 * VM_ALLOC_NODUMP do not include the page in a kernel core dump 2176 * VM_ALLOC_SBUSY shared busy the allocated page 2177 * VM_ALLOC_WIRED wire the allocated page 2178 * VM_ALLOC_ZERO prefer a zeroed page 2179 */ 2180 vm_page_t 2181 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, 2182 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 2183 vm_paddr_t boundary, vm_memattr_t memattr) 2184 { 2185 struct vm_domainset_iter di; 2186 vm_page_t bounds[2]; 2187 vm_page_t m; 2188 int domain; 2189 int start_segind; 2190 2191 start_segind = -1; 2192 2193 vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); 2194 do { 2195 m = vm_page_alloc_contig_domain(object, pindex, domain, req, 2196 npages, low, high, alignment, boundary, memattr); 2197 if (m != NULL) 2198 break; 2199 if (start_segind == -1) 2200 start_segind = vm_phys_lookup_segind(low); 2201 if (vm_phys_find_range(bounds, start_segind, domain, 2202 npages, low, high) == -1) { 2203 vm_domainset_iter_ignore(&di, domain); 2204 } 2205 } while (vm_domainset_iter_page(&di, object, &domain) == 0); 2206 2207 return (m); 2208 } 2209 2210 static vm_page_t 2211 vm_page_find_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, 2212 vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 2213 { 2214 struct vm_domain *vmd; 2215 vm_page_t m_ret; 2216 2217 /* 2218 * Can we allocate the pages without the number of free pages falling 2219 * below the lower bound for the allocation class? 2220 */ 2221 vmd = VM_DOMAIN(domain); 2222 if (!vm_domain_allocate(vmd, req, npages)) 2223 return (NULL); 2224 /* 2225 * Try to allocate the pages from the free page queues. 2226 */ 2227 vm_domain_free_lock(vmd); 2228 m_ret = vm_phys_alloc_contig(domain, npages, low, high, 2229 alignment, boundary); 2230 vm_domain_free_unlock(vmd); 2231 if (m_ret != NULL) 2232 return (m_ret); 2233 #if VM_NRESERVLEVEL > 0 2234 /* 2235 * Try to break a reservation to allocate the pages. 
2236 */ 2237 if ((req & VM_ALLOC_NORECLAIM) == 0) { 2238 m_ret = vm_reserv_reclaim_contig(domain, npages, low, 2239 high, alignment, boundary); 2240 if (m_ret != NULL) 2241 return (m_ret); 2242 } 2243 #endif 2244 vm_domain_freecnt_inc(vmd, npages); 2245 return (NULL); 2246 } 2247 2248 vm_page_t 2249 vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, 2250 int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 2251 vm_paddr_t boundary, vm_memattr_t memattr) 2252 { 2253 vm_page_t m, m_ret, mpred; 2254 u_int busy_lock, flags, oflags; 2255 2256 #define VPAC_FLAGS (VPA_FLAGS | VM_ALLOC_NORECLAIM) 2257 KASSERT((req & ~VPAC_FLAGS) == 0, 2258 ("invalid request %#x", req)); 2259 KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != 2260 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), 2261 ("invalid request %#x", req)); 2262 KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != 2263 (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM), 2264 ("invalid request %#x", req)); 2265 VM_OBJECT_ASSERT_WLOCKED(object); 2266 KASSERT((object->flags & OBJ_FICTITIOUS) == 0, 2267 ("vm_page_alloc_contig: object %p has fictitious pages", 2268 object)); 2269 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); 2270 2271 mpred = vm_radix_lookup_le(&object->rtree, pindex); 2272 KASSERT(mpred == NULL || mpred->pindex != pindex, 2273 ("vm_page_alloc_contig: pindex already allocated")); 2274 for (;;) { 2275 #if VM_NRESERVLEVEL > 0 2276 /* 2277 * Can we allocate the pages from a reservation? 2278 */ 2279 if (vm_object_reserv(object) && 2280 (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req, 2281 mpred, npages, low, high, alignment, boundary)) != NULL) { 2282 break; 2283 } 2284 #endif 2285 if ((m_ret = vm_page_find_contig_domain(domain, req, npages, 2286 low, high, alignment, boundary)) != NULL) 2287 break; 2288 if (!vm_domain_alloc_fail(VM_DOMAIN(domain), object, req)) 2289 return (NULL); 2290 } 2291 for (m = m_ret; m < &m_ret[npages]; m++) { 2292 vm_page_dequeue(m); 2293 vm_page_alloc_check(m); 2294 } 2295 2296 /* 2297 * Initialize the pages. Only the PG_ZERO flag is inherited. 2298 */ 2299 flags = PG_ZERO; 2300 if ((req & VM_ALLOC_NODUMP) != 0) 2301 flags |= PG_NODUMP; 2302 oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; 2303 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) 2304 busy_lock = VPB_CURTHREAD_EXCLUSIVE; 2305 else if ((req & VM_ALLOC_SBUSY) != 0) 2306 busy_lock = VPB_SHARERS_WORD(1); 2307 else 2308 busy_lock = VPB_UNBUSIED; 2309 if ((req & VM_ALLOC_WIRED) != 0) 2310 vm_wire_add(npages); 2311 if (object->memattr != VM_MEMATTR_DEFAULT && 2312 memattr == VM_MEMATTR_DEFAULT) 2313 memattr = object->memattr; 2314 for (m = m_ret; m < &m_ret[npages]; m++) { 2315 m->a.flags = 0; 2316 m->flags = (m->flags | PG_NODUMP) & flags; 2317 m->busy_lock = busy_lock; 2318 if ((req & VM_ALLOC_WIRED) != 0) 2319 m->ref_count = 1; 2320 m->a.act_count = 0; 2321 m->oflags = oflags; 2322 if (vm_page_insert_after(m, object, pindex, mpred)) { 2323 if ((req & VM_ALLOC_WIRED) != 0) 2324 vm_wire_sub(npages); 2325 KASSERT(m->object == NULL, 2326 ("page %p has object", m)); 2327 mpred = m; 2328 for (m = m_ret; m < &m_ret[npages]; m++) { 2329 if (m <= mpred && 2330 (req & VM_ALLOC_WIRED) != 0) 2331 m->ref_count = 0; 2332 m->oflags = VPO_UNMANAGED; 2333 m->busy_lock = VPB_UNBUSIED; 2334 /* Don't change PG_ZERO. 
*/ 2335 vm_page_free_toq(m); 2336 } 2337 if (req & VM_ALLOC_WAITFAIL) { 2338 VM_OBJECT_WUNLOCK(object); 2339 vm_radix_wait(); 2340 VM_OBJECT_WLOCK(object); 2341 } 2342 return (NULL); 2343 } 2344 mpred = m; 2345 if (memattr != VM_MEMATTR_DEFAULT) 2346 pmap_page_set_memattr(m, memattr); 2347 pindex++; 2348 } 2349 return (m_ret); 2350 } 2351 2352 /* 2353 * Allocate a physical page that is not intended to be inserted into a VM 2354 * object. If the "freelist" parameter is not equal to VM_NFREELIST, then only 2355 * pages from the specified vm_phys freelist will be returned. 2356 */ 2357 static __always_inline vm_page_t 2358 _vm_page_alloc_noobj_domain(int domain, const int freelist, int req) 2359 { 2360 struct vm_domain *vmd; 2361 vm_page_t m; 2362 int flags; 2363 2364 #define VPAN_FLAGS (VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL | \ 2365 VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | \ 2366 VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | \ 2367 VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK) 2368 KASSERT((req & ~VPAN_FLAGS) == 0, 2369 ("invalid request %#x", req)); 2370 2371 flags = (req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0; 2372 vmd = VM_DOMAIN(domain); 2373 again: 2374 if (freelist == VM_NFREELIST && 2375 vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) { 2376 m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone, 2377 M_NOWAIT | M_NOVM); 2378 if (m != NULL) { 2379 flags |= PG_PCPU_CACHE; 2380 goto found; 2381 } 2382 } 2383 2384 if (vm_domain_allocate(vmd, req, 1)) { 2385 vm_domain_free_lock(vmd); 2386 if (freelist == VM_NFREELIST) 2387 m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0); 2388 else 2389 m = vm_phys_alloc_freelist_pages(domain, freelist, 2390 VM_FREEPOOL_DIRECT, 0); 2391 vm_domain_free_unlock(vmd); 2392 if (m == NULL) { 2393 vm_domain_freecnt_inc(vmd, 1); 2394 #if VM_NRESERVLEVEL > 0 2395 if (freelist == VM_NFREELIST && 2396 vm_reserv_reclaim_inactive(domain)) 2397 goto again; 2398 #endif 2399 } 2400 } 2401 if (m == NULL) { 2402 if (vm_domain_alloc_fail(vmd, NULL, req)) 2403 goto again; 2404 return (NULL); 2405 } 2406 2407 found: 2408 vm_page_dequeue(m); 2409 vm_page_alloc_check(m); 2410 2411 /* 2412 * Consumers should not rely on a useful default pindex value. 
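 * The recognizable poison value assigned below is there only so that an
 * accidental use of the pindex of a no-object page stands out when
 * debugging; it carries no other meaning.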
2413 */ 2414 m->pindex = 0xdeadc0dedeadc0de; 2415 m->flags = (m->flags & PG_ZERO) | flags; 2416 m->a.flags = 0; 2417 m->oflags = VPO_UNMANAGED; 2418 m->busy_lock = VPB_UNBUSIED; 2419 if ((req & VM_ALLOC_WIRED) != 0) { 2420 vm_wire_add(1); 2421 m->ref_count = 1; 2422 } 2423 2424 if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) 2425 pmap_zero_page(m); 2426 2427 return (m); 2428 } 2429 2430 vm_page_t 2431 vm_page_alloc_freelist(int freelist, int req) 2432 { 2433 struct vm_domainset_iter di; 2434 vm_page_t m; 2435 int domain; 2436 2437 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); 2438 do { 2439 m = vm_page_alloc_freelist_domain(domain, freelist, req); 2440 if (m != NULL) 2441 break; 2442 } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); 2443 2444 return (m); 2445 } 2446 2447 vm_page_t 2448 vm_page_alloc_freelist_domain(int domain, int freelist, int req) 2449 { 2450 KASSERT(freelist >= 0 && freelist < VM_NFREELIST, 2451 ("%s: invalid freelist %d", __func__, freelist)); 2452 2453 return (_vm_page_alloc_noobj_domain(domain, freelist, req)); 2454 } 2455 2456 vm_page_t 2457 vm_page_alloc_noobj(int req) 2458 { 2459 struct vm_domainset_iter di; 2460 vm_page_t m; 2461 int domain; 2462 2463 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); 2464 do { 2465 m = vm_page_alloc_noobj_domain(domain, req); 2466 if (m != NULL) 2467 break; 2468 } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); 2469 2470 return (m); 2471 } 2472 2473 vm_page_t 2474 vm_page_alloc_noobj_domain(int domain, int req) 2475 { 2476 return (_vm_page_alloc_noobj_domain(domain, VM_NFREELIST, req)); 2477 } 2478 2479 vm_page_t 2480 vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low, 2481 vm_paddr_t high, u_long alignment, vm_paddr_t boundary, 2482 vm_memattr_t memattr) 2483 { 2484 struct vm_domainset_iter di; 2485 vm_page_t m; 2486 int domain; 2487 2488 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); 2489 do { 2490 m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low, 2491 high, alignment, boundary, memattr); 2492 if (m != NULL) 2493 break; 2494 } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); 2495 2496 return (m); 2497 } 2498 2499 vm_page_t 2500 vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages, 2501 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, 2502 vm_memattr_t memattr) 2503 { 2504 vm_page_t m, m_ret; 2505 u_int flags; 2506 2507 #define VPANC_FLAGS (VPAN_FLAGS | VM_ALLOC_NORECLAIM) 2508 KASSERT((req & ~VPANC_FLAGS) == 0, 2509 ("invalid request %#x", req)); 2510 KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != 2511 (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM), 2512 ("invalid request %#x", req)); 2513 KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != 2514 (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), 2515 ("invalid request %#x", req)); 2516 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); 2517 2518 while ((m_ret = vm_page_find_contig_domain(domain, req, npages, 2519 low, high, alignment, boundary)) == NULL) { 2520 if (!vm_domain_alloc_fail(VM_DOMAIN(domain), NULL, req)) 2521 return (NULL); 2522 } 2523 2524 /* 2525 * Initialize the pages. Only the PG_ZERO flag is inherited. 
2526 */ 2527 flags = PG_ZERO; 2528 if ((req & VM_ALLOC_NODUMP) != 0) 2529 flags |= PG_NODUMP; 2530 if ((req & VM_ALLOC_WIRED) != 0) 2531 vm_wire_add(npages); 2532 for (m = m_ret; m < &m_ret[npages]; m++) { 2533 vm_page_dequeue(m); 2534 vm_page_alloc_check(m); 2535 2536 /* 2537 * Consumers should not rely on a useful default pindex value. 2538 */ 2539 m->pindex = 0xdeadc0dedeadc0de; 2540 m->a.flags = 0; 2541 m->flags = (m->flags | PG_NODUMP) & flags; 2542 m->busy_lock = VPB_UNBUSIED; 2543 if ((req & VM_ALLOC_WIRED) != 0) 2544 m->ref_count = 1; 2545 m->a.act_count = 0; 2546 m->oflags = VPO_UNMANAGED; 2547 2548 /* 2549 * Zero the page before updating any mappings since the page is 2550 * not yet shared with any devices which might require the 2551 * non-default memory attribute. pmap_page_set_memattr() 2552 * flushes data caches before returning. 2553 */ 2554 if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) 2555 pmap_zero_page(m); 2556 if (memattr != VM_MEMATTR_DEFAULT) 2557 pmap_page_set_memattr(m, memattr); 2558 } 2559 return (m_ret); 2560 } 2561 2562 /* 2563 * Check a page that has been freshly dequeued from a freelist. 2564 */ 2565 static void 2566 vm_page_alloc_check(vm_page_t m) 2567 { 2568 2569 KASSERT(m->object == NULL, ("page %p has object", m)); 2570 KASSERT(m->a.queue == PQ_NONE && 2571 (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, 2572 ("page %p has unexpected queue %d, flags %#x", 2573 m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK))); 2574 KASSERT(m->ref_count == 0, ("page %p has references", m)); 2575 KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m)); 2576 KASSERT(m->dirty == 0, ("page %p is dirty", m)); 2577 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 2578 ("page %p has unexpected memattr %d", 2579 m, pmap_page_get_memattr(m))); 2580 KASSERT(vm_page_none_valid(m), ("free page %p is valid", m)); 2581 pmap_vm_page_alloc_check(m); 2582 } 2583 2584 static int 2585 vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) 2586 { 2587 struct vm_domain *vmd; 2588 struct vm_pgcache *pgcache; 2589 int i; 2590 2591 pgcache = arg; 2592 vmd = VM_DOMAIN(pgcache->domain); 2593 2594 /* 2595 * The page daemon should avoid creating extra memory pressure since its 2596 * main purpose is to replenish the store of free pages. 2597 */ 2598 if (vmd->vmd_severeset || curproc == pageproc || 2599 !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) 2600 return (0); 2601 domain = vmd->vmd_domain; 2602 vm_domain_free_lock(vmd); 2603 i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, 2604 (vm_page_t *)store); 2605 vm_domain_free_unlock(vmd); 2606 if (cnt != i) 2607 vm_domain_freecnt_inc(vmd, cnt - i); 2608 2609 return (i); 2610 } 2611 2612 static void 2613 vm_page_zone_release(void *arg, void **store, int cnt) 2614 { 2615 struct vm_domain *vmd; 2616 struct vm_pgcache *pgcache; 2617 vm_page_t m; 2618 int i; 2619 2620 pgcache = arg; 2621 vmd = VM_DOMAIN(pgcache->domain); 2622 vm_domain_free_lock(vmd); 2623 for (i = 0; i < cnt; i++) { 2624 m = (vm_page_t)store[i]; 2625 vm_phys_free_pages(m, 0); 2626 } 2627 vm_domain_free_unlock(vmd); 2628 vm_domain_freecnt_inc(vmd, cnt); 2629 } 2630 2631 #define VPSC_ANY 0 /* No restrictions. */ 2632 #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ 2633 #define VPSC_NOSUPER 2 /* Skip superpages. 
*/ 2634 2635 /* 2636 * vm_page_scan_contig: 2637 * 2638 * Scan vm_page_array[] between the specified entries "m_start" and 2639 * "m_end" for a run of contiguous physical pages that satisfy the 2640 * specified conditions, and return the lowest page in the run. The 2641 * specified "alignment" determines the alignment of the lowest physical 2642 * page in the run. If the specified "boundary" is non-zero, then the 2643 * run of physical pages cannot span a physical address that is a 2644 * multiple of "boundary". 2645 * 2646 * "m_end" is never dereferenced, so it need not point to a vm_page 2647 * structure within vm_page_array[]. 2648 * 2649 * "npages" must be greater than zero. "m_start" and "m_end" must not 2650 * span a hole (or discontiguity) in the physical address space. Both 2651 * "alignment" and "boundary" must be a power of two. 2652 */ 2653 static vm_page_t 2654 vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, 2655 u_long alignment, vm_paddr_t boundary, int options) 2656 { 2657 vm_object_t object; 2658 vm_paddr_t pa; 2659 vm_page_t m, m_run; 2660 #if VM_NRESERVLEVEL > 0 2661 int level; 2662 #endif 2663 int m_inc, order, run_ext, run_len; 2664 2665 KASSERT(npages > 0, ("npages is 0")); 2666 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 2667 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 2668 m_run = NULL; 2669 run_len = 0; 2670 for (m = m_start; m < m_end && run_len < npages; m += m_inc) { 2671 KASSERT((m->flags & PG_MARKER) == 0, 2672 ("page %p is PG_MARKER", m)); 2673 KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1, 2674 ("fictitious page %p has invalid ref count", m)); 2675 2676 /* 2677 * If the current page would be the start of a run, check its 2678 * physical address against the end, alignment, and boundary 2679 * conditions. If it doesn't satisfy these conditions, either 2680 * terminate the scan or advance to the next page that 2681 * satisfies the failed condition. 2682 */ 2683 if (run_len == 0) { 2684 KASSERT(m_run == NULL, ("m_run != NULL")); 2685 if (m + npages > m_end) 2686 break; 2687 pa = VM_PAGE_TO_PHYS(m); 2688 if (!vm_addr_align_ok(pa, alignment)) { 2689 m_inc = atop(roundup2(pa, alignment) - pa); 2690 continue; 2691 } 2692 if (!vm_addr_bound_ok(pa, ptoa(npages), boundary)) { 2693 m_inc = atop(roundup2(pa, boundary) - pa); 2694 continue; 2695 } 2696 } else 2697 KASSERT(m_run != NULL, ("m_run == NULL")); 2698 2699 retry: 2700 m_inc = 1; 2701 if (vm_page_wired(m)) 2702 run_ext = 0; 2703 #if VM_NRESERVLEVEL > 0 2704 else if ((level = vm_reserv_level(m)) >= 0 && 2705 (options & VPSC_NORESERV) != 0) { 2706 run_ext = 0; 2707 /* Advance to the end of the reservation. */ 2708 pa = VM_PAGE_TO_PHYS(m); 2709 m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - 2710 pa); 2711 } 2712 #endif 2713 else if ((object = atomic_load_ptr(&m->object)) != NULL) { 2714 /* 2715 * The page is considered eligible for relocation if 2716 * and only if it could be laundered or reclaimed by 2717 * the page daemon. 2718 */ 2719 VM_OBJECT_RLOCK(object); 2720 if (object != m->object) { 2721 VM_OBJECT_RUNLOCK(object); 2722 goto retry; 2723 } 2724 /* Don't care: PG_NODUMP, PG_ZERO. */ 2725 if ((object->flags & OBJ_SWAP) == 0 && 2726 object->type != OBJT_VNODE) { 2727 run_ext = 0; 2728 #if VM_NRESERVLEVEL > 0 2729 } else if ((options & VPSC_NOSUPER) != 0 && 2730 (level = vm_reserv_level_iffullpop(m)) >= 0) { 2731 run_ext = 0; 2732 /* Advance to the end of the superpage. 
*/ 2733 pa = VM_PAGE_TO_PHYS(m); 2734 m_inc = atop(roundup2(pa + 1, 2735 vm_reserv_size(level)) - pa); 2736 #endif 2737 } else if (object->memattr == VM_MEMATTR_DEFAULT && 2738 vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { 2739 /* 2740 * The page is allocated but eligible for 2741 * relocation. Extend the current run by one 2742 * page. 2743 */ 2744 KASSERT(pmap_page_get_memattr(m) == 2745 VM_MEMATTR_DEFAULT, 2746 ("page %p has an unexpected memattr", m)); 2747 KASSERT((m->oflags & (VPO_SWAPINPROG | 2748 VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, 2749 ("page %p has unexpected oflags", m)); 2750 /* Don't care: PGA_NOSYNC. */ 2751 run_ext = 1; 2752 } else 2753 run_ext = 0; 2754 VM_OBJECT_RUNLOCK(object); 2755 #if VM_NRESERVLEVEL > 0 2756 } else if (level >= 0) { 2757 /* 2758 * The page is reserved but not yet allocated. In 2759 * other words, it is still free. Extend the current 2760 * run by one page. 2761 */ 2762 run_ext = 1; 2763 #endif 2764 } else if ((order = m->order) < VM_NFREEORDER) { 2765 /* 2766 * The page is enqueued in the physical memory 2767 * allocator's free page queues. Moreover, it is the 2768 * first page in a power-of-two-sized run of 2769 * contiguous free pages. Add these pages to the end 2770 * of the current run, and jump ahead. 2771 */ 2772 run_ext = 1 << order; 2773 m_inc = 1 << order; 2774 } else { 2775 /* 2776 * Skip the page for one of the following reasons: (1) 2777 * It is enqueued in the physical memory allocator's 2778 * free page queues. However, it is not the first 2779 * page in a run of contiguous free pages. (This case 2780 * rarely occurs because the scan is performed in 2781 * ascending order.) (2) It is not reserved, and it is 2782 * transitioning from free to allocated. (Conversely, 2783 * the transition from allocated to free for managed 2784 * pages is blocked by the page busy lock.) (3) It is 2785 * allocated but not contained by an object and not 2786 * wired, e.g., allocated by Xen's balloon driver. 2787 */ 2788 run_ext = 0; 2789 } 2790 2791 /* 2792 * Extend or reset the current run of pages. 2793 */ 2794 if (run_ext > 0) { 2795 if (run_len == 0) 2796 m_run = m; 2797 run_len += run_ext; 2798 } else { 2799 if (run_len > 0) { 2800 m_run = NULL; 2801 run_len = 0; 2802 } 2803 } 2804 } 2805 if (run_len >= npages) 2806 return (m_run); 2807 return (NULL); 2808 } 2809 2810 /* 2811 * vm_page_reclaim_run: 2812 * 2813 * Try to relocate each of the allocated virtual pages within the 2814 * specified run of physical pages to a new physical address. Free the 2815 * physical pages underlying the relocated virtual pages. A virtual page 2816 * is relocatable if and only if it could be laundered or reclaimed by 2817 * the page daemon. Whenever possible, a virtual page is relocated to a 2818 * physical address above "high". 2819 * 2820 * Returns 0 if every physical page within the run was already free or 2821 * just freed by a successful relocation. Otherwise, returns a non-zero 2822 * value indicating why the last attempt to relocate a virtual page was 2823 * unsuccessful. 2824 * 2825 * "req_class" must be an allocation class. 
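 *
 * In outline, relocating one resident, valid page "m" in the body below
 * amounts to the following (error handling and the invalid-page case are
 * elided; see the function for the full protocol):
 *
 *	m_new = vm_page_alloc_noobj_contig(req, 1, ...);	(new frame)
 *	vm_page_try_remove_all(m);				(unmap the old page)
 *	pmap_copy_page(m, m_new);				(copy the contents)
 *	vm_page_replace_hold(m_new, object, m->pindex, m);	(swap into the object)
 *	vm_page_deactivate(m_new);
 *
 * with the old page collected on a free list and later returned to the
 * physical memory allocator via vm_page_free_prep() and
 * vm_phys_free_pages().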
2826 */ 2827 static int 2828 vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, 2829 vm_paddr_t high) 2830 { 2831 struct vm_domain *vmd; 2832 struct spglist free; 2833 vm_object_t object; 2834 vm_paddr_t pa; 2835 vm_page_t m, m_end, m_new; 2836 int error, order, req; 2837 2838 KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, 2839 ("req_class is not an allocation class")); 2840 SLIST_INIT(&free); 2841 error = 0; 2842 m = m_run; 2843 m_end = m_run + npages; 2844 for (; error == 0 && m < m_end; m++) { 2845 KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, 2846 ("page %p is PG_FICTITIOUS or PG_MARKER", m)); 2847 2848 /* 2849 * Racily check for wirings. Races are handled once the object 2850 * lock is held and the page is unmapped. 2851 */ 2852 if (vm_page_wired(m)) 2853 error = EBUSY; 2854 else if ((object = atomic_load_ptr(&m->object)) != NULL) { 2855 /* 2856 * The page is relocated if and only if it could be 2857 * laundered or reclaimed by the page daemon. 2858 */ 2859 VM_OBJECT_WLOCK(object); 2860 /* Don't care: PG_NODUMP, PG_ZERO. */ 2861 if (m->object != object || 2862 ((object->flags & OBJ_SWAP) == 0 && 2863 object->type != OBJT_VNODE)) 2864 error = EINVAL; 2865 else if (object->memattr != VM_MEMATTR_DEFAULT) 2866 error = EINVAL; 2867 else if (vm_page_queue(m) != PQ_NONE && 2868 vm_page_tryxbusy(m) != 0) { 2869 if (vm_page_wired(m)) { 2870 vm_page_xunbusy(m); 2871 error = EBUSY; 2872 goto unlock; 2873 } 2874 KASSERT(pmap_page_get_memattr(m) == 2875 VM_MEMATTR_DEFAULT, 2876 ("page %p has an unexpected memattr", m)); 2877 KASSERT(m->oflags == 0, 2878 ("page %p has unexpected oflags", m)); 2879 /* Don't care: PGA_NOSYNC. */ 2880 if (!vm_page_none_valid(m)) { 2881 /* 2882 * First, try to allocate a new page 2883 * that is above "high". Failing 2884 * that, try to allocate a new page 2885 * that is below "m_run". Allocate 2886 * the new page between the end of 2887 * "m_run" and "high" only as a last 2888 * resort. 2889 */ 2890 req = req_class; 2891 if ((m->flags & PG_NODUMP) != 0) 2892 req |= VM_ALLOC_NODUMP; 2893 if (trunc_page(high) != 2894 ~(vm_paddr_t)PAGE_MASK) { 2895 m_new = 2896 vm_page_alloc_noobj_contig( 2897 req, 1, round_page(high), 2898 ~(vm_paddr_t)0, PAGE_SIZE, 2899 0, VM_MEMATTR_DEFAULT); 2900 } else 2901 m_new = NULL; 2902 if (m_new == NULL) { 2903 pa = VM_PAGE_TO_PHYS(m_run); 2904 m_new = 2905 vm_page_alloc_noobj_contig( 2906 req, 1, 0, pa - 1, 2907 PAGE_SIZE, 0, 2908 VM_MEMATTR_DEFAULT); 2909 } 2910 if (m_new == NULL) { 2911 pa += ptoa(npages); 2912 m_new = 2913 vm_page_alloc_noobj_contig( 2914 req, 1, pa, high, PAGE_SIZE, 2915 0, VM_MEMATTR_DEFAULT); 2916 } 2917 if (m_new == NULL) { 2918 vm_page_xunbusy(m); 2919 error = ENOMEM; 2920 goto unlock; 2921 } 2922 2923 /* 2924 * Unmap the page and check for new 2925 * wirings that may have been acquired 2926 * through a pmap lookup. 2927 */ 2928 if (object->ref_count != 0 && 2929 !vm_page_try_remove_all(m)) { 2930 vm_page_xunbusy(m); 2931 vm_page_free(m_new); 2932 error = EBUSY; 2933 goto unlock; 2934 } 2935 2936 /* 2937 * Replace "m" with the new page. For 2938 * vm_page_replace(), "m" must be busy 2939 * and dequeued. Finally, change "m" 2940 * as if vm_page_free() was called. 
2941 */ 2942 m_new->a.flags = m->a.flags & 2943 ~PGA_QUEUE_STATE_MASK; 2944 KASSERT(m_new->oflags == VPO_UNMANAGED, 2945 ("page %p is managed", m_new)); 2946 m_new->oflags = 0; 2947 pmap_copy_page(m, m_new); 2948 m_new->valid = m->valid; 2949 m_new->dirty = m->dirty; 2950 m->flags &= ~PG_ZERO; 2951 vm_page_dequeue(m); 2952 if (vm_page_replace_hold(m_new, object, 2953 m->pindex, m) && 2954 vm_page_free_prep(m)) 2955 SLIST_INSERT_HEAD(&free, m, 2956 plinks.s.ss); 2957 2958 /* 2959 * The new page must be deactivated 2960 * before the object is unlocked. 2961 */ 2962 vm_page_deactivate(m_new); 2963 } else { 2964 m->flags &= ~PG_ZERO; 2965 vm_page_dequeue(m); 2966 if (vm_page_free_prep(m)) 2967 SLIST_INSERT_HEAD(&free, m, 2968 plinks.s.ss); 2969 KASSERT(m->dirty == 0, 2970 ("page %p is dirty", m)); 2971 } 2972 } else 2973 error = EBUSY; 2974 unlock: 2975 VM_OBJECT_WUNLOCK(object); 2976 } else { 2977 MPASS(vm_page_domain(m) == domain); 2978 vmd = VM_DOMAIN(domain); 2979 vm_domain_free_lock(vmd); 2980 order = m->order; 2981 if (order < VM_NFREEORDER) { 2982 /* 2983 * The page is enqueued in the physical memory 2984 * allocator's free page queues. Moreover, it 2985 * is the first page in a power-of-two-sized 2986 * run of contiguous free pages. Jump ahead 2987 * to the last page within that run, and 2988 * continue from there. 2989 */ 2990 m += (1 << order) - 1; 2991 } 2992 #if VM_NRESERVLEVEL > 0 2993 else if (vm_reserv_is_page_free(m)) 2994 order = 0; 2995 #endif 2996 vm_domain_free_unlock(vmd); 2997 if (order == VM_NFREEORDER) 2998 error = EINVAL; 2999 } 3000 } 3001 if ((m = SLIST_FIRST(&free)) != NULL) { 3002 int cnt; 3003 3004 vmd = VM_DOMAIN(domain); 3005 cnt = 0; 3006 vm_domain_free_lock(vmd); 3007 do { 3008 MPASS(vm_page_domain(m) == domain); 3009 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3010 vm_phys_free_pages(m, 0); 3011 cnt++; 3012 } while ((m = SLIST_FIRST(&free)) != NULL); 3013 vm_domain_free_unlock(vmd); 3014 vm_domain_freecnt_inc(vmd, cnt); 3015 } 3016 return (error); 3017 } 3018 3019 #define NRUNS 16 3020 3021 #define RUN_INDEX(count, nruns) ((count) % (nruns)) 3022 3023 #define MIN_RECLAIM 8 3024 3025 /* 3026 * vm_page_reclaim_contig: 3027 * 3028 * Reclaim allocated, contiguous physical memory satisfying the specified 3029 * conditions by relocating the virtual pages using that physical memory. 3030 * Returns 0 if reclamation is successful, ERANGE if the specified domain 3031 * can't possibly satisfy the reclamation request, or ENOMEM if not 3032 * currently able to reclaim the requested number of pages. Since 3033 * relocation requires the allocation of physical pages, reclamation may 3034 * fail with ENOMEM due to a shortage of free pages. When reclamation 3035 * fails in this manner, callers are expected to perform vm_wait() before 3036 * retrying a failed allocation operation, e.g., vm_page_alloc_contig(). 3037 * 3038 * The caller must always specify an allocation class through "req". 3039 * 3040 * allocation classes: 3041 * VM_ALLOC_NORMAL normal process request 3042 * VM_ALLOC_SYSTEM system *really* needs a page 3043 * VM_ALLOC_INTERRUPT interrupt time request 3044 * 3045 * The optional allocation flags are ignored. 3046 * 3047 * "npages" must be greater than zero. Both "alignment" and "boundary" 3048 * must be a power of two. 
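 *
 * A minimal caller sketch of the retry protocol described above (the
 * loop and its locals are illustrative only; the function and flag names
 * are the real ones used in this file):
 *
 *	for (;;) {
 *		m = vm_page_alloc_noobj_contig(VM_ALLOC_NORMAL, npages,
 *		    low, high, alignment, boundary, VM_MEMATTR_DEFAULT);
 *		if (m != NULL)
 *			break;			(success)
 *		error = vm_page_reclaim_contig(VM_ALLOC_NORMAL, npages,
 *		    low, high, alignment, boundary);
 *		if (error == ERANGE)
 *			break;			(the request can never be satisfied)
 *		if (error == ENOMEM)
 *			vm_wait(NULL);		(wait for pages to be freed)
 *	}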
3049 */ 3050 int 3051 vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages, 3052 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, 3053 int desired_runs) 3054 { 3055 struct vm_domain *vmd; 3056 vm_page_t bounds[2], m_run, _m_runs[NRUNS], *m_runs; 3057 u_long count, minalign, reclaimed; 3058 int error, i, min_reclaim, nruns, options, req_class; 3059 int segind, start_segind; 3060 int ret; 3061 3062 KASSERT(npages > 0, ("npages is 0")); 3063 KASSERT(powerof2(alignment), ("alignment is not a power of 2")); 3064 KASSERT(powerof2(boundary), ("boundary is not a power of 2")); 3065 3066 ret = ENOMEM; 3067 3068 /* 3069 * If the caller wants to reclaim multiple runs, try to allocate 3070 * space to store the runs. If that fails, fall back to the old 3071 * behavior of just reclaiming MIN_RECLAIM pages. 3072 */ 3073 if (desired_runs > 1) 3074 m_runs = malloc((NRUNS + desired_runs) * sizeof(*m_runs), 3075 M_TEMP, M_NOWAIT); 3076 else 3077 m_runs = NULL; 3078 3079 if (m_runs == NULL) { 3080 m_runs = _m_runs; 3081 nruns = NRUNS; 3082 } else { 3083 nruns = NRUNS + desired_runs - 1; 3084 } 3085 min_reclaim = MAX(desired_runs * npages, MIN_RECLAIM); 3086 3087 /* 3088 * The caller will attempt an allocation after some runs have been 3089 * reclaimed and added to the vm_phys buddy lists. Due to limitations 3090 * of vm_phys_alloc_contig(), round up the requested length to the next 3091 * power of two or maximum chunk size, and ensure that each run is 3092 * suitably aligned. 3093 */ 3094 minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1); 3095 npages = roundup2(npages, minalign); 3096 if (alignment < ptoa(minalign)) 3097 alignment = ptoa(minalign); 3098 3099 /* 3100 * The page daemon is allowed to dig deeper into the free page list. 3101 */ 3102 req_class = req & VM_ALLOC_CLASS_MASK; 3103 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 3104 req_class = VM_ALLOC_SYSTEM; 3105 3106 start_segind = vm_phys_lookup_segind(low); 3107 3108 /* 3109 * Return if the number of free pages cannot satisfy the requested 3110 * allocation. 3111 */ 3112 vmd = VM_DOMAIN(domain); 3113 count = vmd->vmd_free_count; 3114 if (count < npages + vmd->vmd_free_reserved || (count < npages + 3115 vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || 3116 (count < npages && req_class == VM_ALLOC_INTERRUPT)) 3117 goto done; 3118 3119 /* 3120 * Scan up to three times, relaxing the restrictions ("options") on 3121 * the reclamation of reservations and superpages each time. 3122 */ 3123 for (options = VPSC_NORESERV;;) { 3124 bool phys_range_exists = false; 3125 3126 /* 3127 * Find the highest runs that satisfy the given constraints 3128 * and restrictions, and record them in "m_runs". 3129 */ 3130 count = 0; 3131 segind = start_segind; 3132 while ((segind = vm_phys_find_range(bounds, segind, domain, 3133 npages, low, high)) != -1) { 3134 phys_range_exists = true; 3135 while ((m_run = vm_page_scan_contig(npages, bounds[0], 3136 bounds[1], alignment, boundary, options))) { 3137 bounds[0] = m_run + npages; 3138 m_runs[RUN_INDEX(count, nruns)] = m_run; 3139 count++; 3140 } 3141 segind++; 3142 } 3143 3144 if (!phys_range_exists) { 3145 ret = ERANGE; 3146 goto done; 3147 } 3148 3149 /* 3150 * Reclaim the highest runs in LIFO (descending) order until 3151 * the number of reclaimed pages, "reclaimed", is at least 3152 * "min_reclaim". 
Reset "reclaimed" each time because each 3153 * reclamation is idempotent, and runs will (likely) recur 3154 * from one scan to the next as restrictions are relaxed. 3155 */ 3156 reclaimed = 0; 3157 for (i = 0; count > 0 && i < nruns; i++) { 3158 count--; 3159 m_run = m_runs[RUN_INDEX(count, nruns)]; 3160 error = vm_page_reclaim_run(req_class, domain, npages, 3161 m_run, high); 3162 if (error == 0) { 3163 reclaimed += npages; 3164 if (reclaimed >= min_reclaim) { 3165 ret = 0; 3166 goto done; 3167 } 3168 } 3169 } 3170 3171 /* 3172 * Either relax the restrictions on the next scan or return if 3173 * the last scan had no restrictions. 3174 */ 3175 if (options == VPSC_NORESERV) 3176 options = VPSC_NOSUPER; 3177 else if (options == VPSC_NOSUPER) 3178 options = VPSC_ANY; 3179 else if (options == VPSC_ANY) { 3180 if (reclaimed != 0) 3181 ret = 0; 3182 goto done; 3183 } 3184 } 3185 done: 3186 if (m_runs != _m_runs) 3187 free(m_runs, M_TEMP); 3188 return (ret); 3189 } 3190 3191 int 3192 vm_page_reclaim_contig_domain(int domain, int req, u_long npages, 3193 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) 3194 { 3195 return (vm_page_reclaim_contig_domain_ext(domain, req, npages, low, high, 3196 alignment, boundary, 1)); 3197 } 3198 3199 int 3200 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, 3201 u_long alignment, vm_paddr_t boundary) 3202 { 3203 struct vm_domainset_iter di; 3204 int domain, ret, status; 3205 3206 ret = ERANGE; 3207 3208 vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); 3209 do { 3210 status = vm_page_reclaim_contig_domain(domain, req, npages, low, 3211 high, alignment, boundary); 3212 if (status == 0) 3213 return (0); 3214 else if (status == ERANGE) 3215 vm_domainset_iter_ignore(&di, domain); 3216 else { 3217 KASSERT(status == ENOMEM, ("Unrecognized error %d " 3218 "from vm_page_reclaim_contig_domain()", status)); 3219 ret = ENOMEM; 3220 } 3221 } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); 3222 3223 return (ret); 3224 } 3225 3226 /* 3227 * Set the domain in the appropriate page level domainset. 3228 */ 3229 void 3230 vm_domain_set(struct vm_domain *vmd) 3231 { 3232 3233 mtx_lock(&vm_domainset_lock); 3234 if (!vmd->vmd_minset && vm_paging_min(vmd)) { 3235 vmd->vmd_minset = 1; 3236 DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); 3237 } 3238 if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { 3239 vmd->vmd_severeset = 1; 3240 DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); 3241 } 3242 mtx_unlock(&vm_domainset_lock); 3243 } 3244 3245 /* 3246 * Clear the domain from the appropriate page level domainset. 3247 */ 3248 void 3249 vm_domain_clear(struct vm_domain *vmd) 3250 { 3251 3252 mtx_lock(&vm_domainset_lock); 3253 if (vmd->vmd_minset && !vm_paging_min(vmd)) { 3254 vmd->vmd_minset = 0; 3255 DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); 3256 if (vm_min_waiters != 0) { 3257 vm_min_waiters = 0; 3258 wakeup(&vm_min_domains); 3259 } 3260 } 3261 if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { 3262 vmd->vmd_severeset = 0; 3263 DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); 3264 if (vm_severe_waiters != 0) { 3265 vm_severe_waiters = 0; 3266 wakeup(&vm_severe_domains); 3267 } 3268 } 3269 3270 /* 3271 * If pageout daemon needs pages, then tell it that there are 3272 * some free. 
3273 */ 3274 if (vmd->vmd_pageout_pages_needed && 3275 vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { 3276 wakeup(&vmd->vmd_pageout_pages_needed); 3277 vmd->vmd_pageout_pages_needed = 0; 3278 } 3279 3280 /* See comments in vm_wait_doms(). */ 3281 if (vm_pageproc_waiters) { 3282 vm_pageproc_waiters = 0; 3283 wakeup(&vm_pageproc_waiters); 3284 } 3285 mtx_unlock(&vm_domainset_lock); 3286 } 3287 3288 /* 3289 * Wait for free pages to exceed the min threshold globally. 3290 */ 3291 void 3292 vm_wait_min(void) 3293 { 3294 3295 mtx_lock(&vm_domainset_lock); 3296 while (vm_page_count_min()) { 3297 vm_min_waiters++; 3298 msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); 3299 } 3300 mtx_unlock(&vm_domainset_lock); 3301 } 3302 3303 /* 3304 * Wait for free pages to exceed the severe threshold globally. 3305 */ 3306 void 3307 vm_wait_severe(void) 3308 { 3309 3310 mtx_lock(&vm_domainset_lock); 3311 while (vm_page_count_severe()) { 3312 vm_severe_waiters++; 3313 msleep(&vm_severe_domains, &vm_domainset_lock, PVM, 3314 "vmwait", 0); 3315 } 3316 mtx_unlock(&vm_domainset_lock); 3317 } 3318 3319 u_int 3320 vm_wait_count(void) 3321 { 3322 3323 return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); 3324 } 3325 3326 int 3327 vm_wait_doms(const domainset_t *wdoms, int mflags) 3328 { 3329 int error; 3330 3331 error = 0; 3332 3333 /* 3334 * We use racey wakeup synchronization to avoid expensive global 3335 * locking for the pageproc when sleeping with a non-specific vm_wait. 3336 * To handle this, we only sleep for one tick in this instance. It 3337 * is expected that most allocations for the pageproc will come from 3338 * kmem or vm_page_grab* which will use the more specific and 3339 * race-free vm_wait_domain(). 3340 */ 3341 if (curproc == pageproc) { 3342 mtx_lock(&vm_domainset_lock); 3343 vm_pageproc_waiters++; 3344 error = msleep(&vm_pageproc_waiters, &vm_domainset_lock, 3345 PVM | PDROP | mflags, "pageprocwait", 1); 3346 } else { 3347 /* 3348 * XXX Ideally we would wait only until the allocation could 3349 * be satisfied. This condition can cause new allocators to 3350 * consume all freed pages while old allocators wait. 3351 */ 3352 mtx_lock(&vm_domainset_lock); 3353 if (vm_page_count_min_set(wdoms)) { 3354 if (pageproc == NULL) 3355 panic("vm_wait in early boot"); 3356 vm_min_waiters++; 3357 error = msleep(&vm_min_domains, &vm_domainset_lock, 3358 PVM | PDROP | mflags, "vmwait", 0); 3359 } else 3360 mtx_unlock(&vm_domainset_lock); 3361 } 3362 return (error); 3363 } 3364 3365 /* 3366 * vm_wait_domain: 3367 * 3368 * Sleep until free pages are available for allocation. 3369 * - Called in various places after failed memory allocations. 
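 *
 * A simplified, hypothetical retry loop for a single-domain consumer
 * (real allocators reach this function through vm_domain_alloc_fail()
 * below, which additionally records a pageout deficit):
 *
 *	while (!vm_domain_allocate(vmd, VM_ALLOC_NORMAL, 1))
 *		vm_wait_domain(vmd->vmd_domain);
 *
 * after which the caller would take the domain free lock and carve the
 * reserved page out of the buddy queues with vm_phys_alloc_pages().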
3370 */ 3371 void 3372 vm_wait_domain(int domain) 3373 { 3374 struct vm_domain *vmd; 3375 domainset_t wdom; 3376 3377 vmd = VM_DOMAIN(domain); 3378 vm_domain_free_assert_unlocked(vmd); 3379 3380 if (curproc == pageproc) { 3381 mtx_lock(&vm_domainset_lock); 3382 if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { 3383 vmd->vmd_pageout_pages_needed = 1; 3384 msleep(&vmd->vmd_pageout_pages_needed, 3385 &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); 3386 } else 3387 mtx_unlock(&vm_domainset_lock); 3388 } else { 3389 DOMAINSET_ZERO(&wdom); 3390 DOMAINSET_SET(vmd->vmd_domain, &wdom); 3391 vm_wait_doms(&wdom, 0); 3392 } 3393 } 3394 3395 static int 3396 vm_wait_flags(vm_object_t obj, int mflags) 3397 { 3398 struct domainset *d; 3399 3400 d = NULL; 3401 3402 /* 3403 * Carefully fetch pointers only once: the struct domainset 3404 * itself is ummutable but the pointer might change. 3405 */ 3406 if (obj != NULL) 3407 d = obj->domain.dr_policy; 3408 if (d == NULL) 3409 d = curthread->td_domain.dr_policy; 3410 3411 return (vm_wait_doms(&d->ds_mask, mflags)); 3412 } 3413 3414 /* 3415 * vm_wait: 3416 * 3417 * Sleep until free pages are available for allocation in the 3418 * affinity domains of the obj. If obj is NULL, the domain set 3419 * for the calling thread is used. 3420 * Called in various places after failed memory allocations. 3421 */ 3422 void 3423 vm_wait(vm_object_t obj) 3424 { 3425 (void)vm_wait_flags(obj, 0); 3426 } 3427 3428 int 3429 vm_wait_intr(vm_object_t obj) 3430 { 3431 return (vm_wait_flags(obj, PCATCH)); 3432 } 3433 3434 /* 3435 * vm_domain_alloc_fail: 3436 * 3437 * Called when a page allocation function fails. Informs the 3438 * pagedaemon and performs the requested wait. Requires the 3439 * domain_free and object lock on entry. Returns with the 3440 * object lock held and free lock released. Returns an error when 3441 * retry is necessary. 3442 * 3443 */ 3444 static int 3445 vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) 3446 { 3447 3448 vm_domain_free_assert_unlocked(vmd); 3449 3450 atomic_add_int(&vmd->vmd_pageout_deficit, 3451 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); 3452 if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { 3453 if (object != NULL) 3454 VM_OBJECT_WUNLOCK(object); 3455 vm_wait_domain(vmd->vmd_domain); 3456 if (object != NULL) 3457 VM_OBJECT_WLOCK(object); 3458 if (req & VM_ALLOC_WAITOK) 3459 return (EAGAIN); 3460 } 3461 3462 return (0); 3463 } 3464 3465 /* 3466 * vm_waitpfault: 3467 * 3468 * Sleep until free pages are available for allocation. 3469 * - Called only in vm_fault so that processes page faulting 3470 * can be easily tracked. 3471 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing 3472 * processes will be able to grab memory first. Do not change 3473 * this balance without careful testing first. 3474 */ 3475 void 3476 vm_waitpfault(struct domainset *dset, int timo) 3477 { 3478 3479 /* 3480 * XXX Ideally we would wait only until the allocation could 3481 * be satisfied. This condition can cause new allocators to 3482 * consume all freed pages while old allocators wait. 
3483 */ 3484 mtx_lock(&vm_domainset_lock); 3485 if (vm_page_count_min_set(&dset->ds_mask)) { 3486 vm_min_waiters++; 3487 msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, 3488 "pfault", timo); 3489 } else 3490 mtx_unlock(&vm_domainset_lock); 3491 } 3492 3493 static struct vm_pagequeue * 3494 _vm_page_pagequeue(vm_page_t m, uint8_t queue) 3495 { 3496 3497 return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); 3498 } 3499 3500 #ifdef INVARIANTS 3501 static struct vm_pagequeue * 3502 vm_page_pagequeue(vm_page_t m) 3503 { 3504 3505 return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue)); 3506 } 3507 #endif 3508 3509 static __always_inline bool 3510 vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) 3511 { 3512 vm_page_astate_t tmp; 3513 3514 tmp = *old; 3515 do { 3516 if (__predict_true(vm_page_astate_fcmpset(m, old, new))) 3517 return (true); 3518 counter_u64_add(pqstate_commit_retries, 1); 3519 } while (old->_bits == tmp._bits); 3520 3521 return (false); 3522 } 3523 3524 /* 3525 * Do the work of committing a queue state update that moves the page out of 3526 * its current queue. 3527 */ 3528 static bool 3529 _vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m, 3530 vm_page_astate_t *old, vm_page_astate_t new) 3531 { 3532 vm_page_t next; 3533 3534 vm_pagequeue_assert_locked(pq); 3535 KASSERT(vm_page_pagequeue(m) == pq, 3536 ("%s: queue %p does not match page %p", __func__, pq, m)); 3537 KASSERT(old->queue != PQ_NONE && new.queue != old->queue, 3538 ("%s: invalid queue indices %d %d", 3539 __func__, old->queue, new.queue)); 3540 3541 /* 3542 * Once the queue index of the page changes there is nothing 3543 * synchronizing with further updates to the page's physical 3544 * queue state. Therefore we must speculatively remove the page 3545 * from the queue now and be prepared to roll back if the queue 3546 * state update fails. If the page is not physically enqueued then 3547 * we just update its queue index. 3548 */ 3549 if ((old->flags & PGA_ENQUEUED) != 0) { 3550 new.flags &= ~PGA_ENQUEUED; 3551 next = TAILQ_NEXT(m, plinks.q); 3552 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3553 vm_pagequeue_cnt_dec(pq); 3554 if (!vm_page_pqstate_fcmpset(m, old, new)) { 3555 if (next == NULL) 3556 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3557 else 3558 TAILQ_INSERT_BEFORE(next, m, plinks.q); 3559 vm_pagequeue_cnt_inc(pq); 3560 return (false); 3561 } else { 3562 return (true); 3563 } 3564 } else { 3565 return (vm_page_pqstate_fcmpset(m, old, new)); 3566 } 3567 } 3568 3569 static bool 3570 vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old, 3571 vm_page_astate_t new) 3572 { 3573 struct vm_pagequeue *pq; 3574 vm_page_astate_t as; 3575 bool ret; 3576 3577 pq = _vm_page_pagequeue(m, old->queue); 3578 3579 /* 3580 * The queue field and PGA_ENQUEUED flag are stable only so long as the 3581 * corresponding page queue lock is held. 3582 */ 3583 vm_pagequeue_lock(pq); 3584 as = vm_page_astate_load(m); 3585 if (__predict_false(as._bits != old->_bits)) { 3586 *old = as; 3587 ret = false; 3588 } else { 3589 ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new); 3590 } 3591 vm_pagequeue_unlock(pq); 3592 return (ret); 3593 } 3594 3595 /* 3596 * Commit a queue state update that enqueues or requeues a page. 
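 *
 * On success the page is marked PGA_ENQUEUED and linked at the tail of
 * its queue, or, when PGA_REQUEUE_HEAD was requested, immediately before
 * the inactive queue's vmd_inacthead marker so that the page daemon
 * revisits it sooner.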
3597 */ 3598 static bool 3599 _vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m, 3600 vm_page_astate_t *old, vm_page_astate_t new) 3601 { 3602 struct vm_domain *vmd; 3603 3604 vm_pagequeue_assert_locked(pq); 3605 KASSERT(old->queue != PQ_NONE && new.queue == old->queue, 3606 ("%s: invalid queue indices %d %d", 3607 __func__, old->queue, new.queue)); 3608 3609 new.flags |= PGA_ENQUEUED; 3610 if (!vm_page_pqstate_fcmpset(m, old, new)) 3611 return (false); 3612 3613 if ((old->flags & PGA_ENQUEUED) != 0) 3614 TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); 3615 else 3616 vm_pagequeue_cnt_inc(pq); 3617 3618 /* 3619 * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In particular, if 3620 * both flags are set in close succession, only PGA_REQUEUE_HEAD will be 3621 * applied, even if it was set first. 3622 */ 3623 if ((old->flags & PGA_REQUEUE_HEAD) != 0) { 3624 vmd = vm_pagequeue_domain(m); 3625 KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE], 3626 ("%s: invalid page queue for page %p", __func__, m)); 3627 TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); 3628 } else { 3629 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 3630 } 3631 return (true); 3632 } 3633 3634 /* 3635 * Commit a queue state update that encodes a request for a deferred queue 3636 * operation. 3637 */ 3638 static bool 3639 vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old, 3640 vm_page_astate_t new) 3641 { 3642 3643 KASSERT(old->queue == new.queue || new.queue != PQ_NONE, 3644 ("%s: invalid state, queue %d flags %x", 3645 __func__, new.queue, new.flags)); 3646 3647 if (old->_bits != new._bits && 3648 !vm_page_pqstate_fcmpset(m, old, new)) 3649 return (false); 3650 vm_page_pqbatch_submit(m, new.queue); 3651 return (true); 3652 } 3653 3654 /* 3655 * A generic queue state update function. This handles more cases than the 3656 * specialized functions above. 3657 */ 3658 bool 3659 vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) 3660 { 3661 3662 if (old->_bits == new._bits) 3663 return (true); 3664 3665 if (old->queue != PQ_NONE && new.queue != old->queue) { 3666 if (!vm_page_pqstate_commit_dequeue(m, old, new)) 3667 return (false); 3668 if (new.queue != PQ_NONE) 3669 vm_page_pqbatch_submit(m, new.queue); 3670 } else { 3671 if (!vm_page_pqstate_fcmpset(m, old, new)) 3672 return (false); 3673 if (new.queue != PQ_NONE && 3674 ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0) 3675 vm_page_pqbatch_submit(m, new.queue); 3676 } 3677 return (true); 3678 } 3679 3680 /* 3681 * Apply deferred queue state updates to a page. 
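 *
 * The loop below retries until either the page's atomic queue state no
 * longer requests an operation for this queue (another thread already
 * applied or superseded it, which is counted as a nop) or the commit
 * succeeds: a pending PGA_DEQUEUE removes the page from the queue, and
 * any other pending request enqueues or requeues it.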
3682 */ 3683 static inline void 3684 vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) 3685 { 3686 vm_page_astate_t new, old; 3687 3688 CRITICAL_ASSERT(curthread); 3689 vm_pagequeue_assert_locked(pq); 3690 KASSERT(queue < PQ_COUNT, 3691 ("%s: invalid queue index %d", __func__, queue)); 3692 KASSERT(pq == _vm_page_pagequeue(m, queue), 3693 ("%s: page %p does not belong to queue %p", __func__, m, pq)); 3694 3695 for (old = vm_page_astate_load(m);;) { 3696 if (__predict_false(old.queue != queue || 3697 (old.flags & PGA_QUEUE_OP_MASK) == 0)) { 3698 counter_u64_add(queue_nops, 1); 3699 break; 3700 } 3701 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3702 ("%s: page %p is unmanaged", __func__, m)); 3703 3704 new = old; 3705 if ((old.flags & PGA_DEQUEUE) != 0) { 3706 new.flags &= ~PGA_QUEUE_OP_MASK; 3707 new.queue = PQ_NONE; 3708 if (__predict_true(_vm_page_pqstate_commit_dequeue(pq, 3709 m, &old, new))) { 3710 counter_u64_add(queue_ops, 1); 3711 break; 3712 } 3713 } else { 3714 new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); 3715 if (__predict_true(_vm_page_pqstate_commit_requeue(pq, 3716 m, &old, new))) { 3717 counter_u64_add(queue_ops, 1); 3718 break; 3719 } 3720 } 3721 } 3722 } 3723 3724 static void 3725 vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, 3726 uint8_t queue) 3727 { 3728 int i; 3729 3730 for (i = 0; i < bq->bq_cnt; i++) 3731 vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); 3732 vm_batchqueue_init(bq); 3733 } 3734 3735 /* 3736 * vm_page_pqbatch_submit: [ internal use only ] 3737 * 3738 * Enqueue a page in the specified page queue's batched work queue. 3739 * The caller must have encoded the requested operation in the page 3740 * structure's a.flags field. 3741 */ 3742 void 3743 vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) 3744 { 3745 struct vm_batchqueue *bq; 3746 struct vm_pagequeue *pq; 3747 int domain, slots_remaining; 3748 3749 KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); 3750 3751 domain = vm_page_domain(m); 3752 critical_enter(); 3753 bq = DPCPU_PTR(pqbatch[domain][queue]); 3754 slots_remaining = vm_batchqueue_insert(bq, m); 3755 if (slots_remaining > (VM_BATCHQUEUE_SIZE >> 1)) { 3756 /* keep building the bq */ 3757 critical_exit(); 3758 return; 3759 } else if (slots_remaining > 0 ) { 3760 /* Try to process the bq if we can get the lock */ 3761 pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue]; 3762 if (vm_pagequeue_trylock(pq)) { 3763 vm_pqbatch_process(pq, bq, queue); 3764 vm_pagequeue_unlock(pq); 3765 } 3766 critical_exit(); 3767 return; 3768 } 3769 critical_exit(); 3770 3771 /* if we make it here, the bq is full so wait for the lock */ 3772 3773 pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue]; 3774 vm_pagequeue_lock(pq); 3775 critical_enter(); 3776 bq = DPCPU_PTR(pqbatch[domain][queue]); 3777 vm_pqbatch_process(pq, bq, queue); 3778 vm_pqbatch_process_page(pq, m, queue); 3779 vm_pagequeue_unlock(pq); 3780 critical_exit(); 3781 } 3782 3783 /* 3784 * vm_page_pqbatch_drain: [ internal use only ] 3785 * 3786 * Force all per-CPU page queue batch queues to be drained. This is 3787 * intended for use in severe memory shortages, to ensure that pages 3788 * do not remain stuck in the batch queues. 
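 *
 * The drain binds the calling thread to each CPU in turn and flushes that
 * CPU's batch queue for every domain and page queue while holding the
 * corresponding page queue lock, so it is comparatively expensive and not
 * intended for frequent use.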
3789 */ 3790 void 3791 vm_page_pqbatch_drain(void) 3792 { 3793 struct thread *td; 3794 struct vm_domain *vmd; 3795 struct vm_pagequeue *pq; 3796 int cpu, domain, queue; 3797 3798 td = curthread; 3799 CPU_FOREACH(cpu) { 3800 thread_lock(td); 3801 sched_bind(td, cpu); 3802 thread_unlock(td); 3803 3804 for (domain = 0; domain < vm_ndomains; domain++) { 3805 vmd = VM_DOMAIN(domain); 3806 for (queue = 0; queue < PQ_COUNT; queue++) { 3807 pq = &vmd->vmd_pagequeues[queue]; 3808 vm_pagequeue_lock(pq); 3809 critical_enter(); 3810 vm_pqbatch_process(pq, 3811 DPCPU_PTR(pqbatch[domain][queue]), queue); 3812 critical_exit(); 3813 vm_pagequeue_unlock(pq); 3814 } 3815 } 3816 } 3817 thread_lock(td); 3818 sched_unbind(td); 3819 thread_unlock(td); 3820 } 3821 3822 /* 3823 * vm_page_dequeue_deferred: [ internal use only ] 3824 * 3825 * Request removal of the given page from its current page 3826 * queue. Physical removal from the queue may be deferred 3827 * indefinitely. 3828 */ 3829 void 3830 vm_page_dequeue_deferred(vm_page_t m) 3831 { 3832 vm_page_astate_t new, old; 3833 3834 old = vm_page_astate_load(m); 3835 do { 3836 if (old.queue == PQ_NONE) { 3837 KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, 3838 ("%s: page %p has unexpected queue state", 3839 __func__, m)); 3840 break; 3841 } 3842 new = old; 3843 new.flags |= PGA_DEQUEUE; 3844 } while (!vm_page_pqstate_commit_request(m, &old, new)); 3845 } 3846 3847 /* 3848 * vm_page_dequeue: 3849 * 3850 * Remove the page from whichever page queue it's in, if any, before 3851 * returning. 3852 */ 3853 void 3854 vm_page_dequeue(vm_page_t m) 3855 { 3856 vm_page_astate_t new, old; 3857 3858 old = vm_page_astate_load(m); 3859 do { 3860 if (old.queue == PQ_NONE) { 3861 KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, 3862 ("%s: page %p has unexpected queue state", 3863 __func__, m)); 3864 break; 3865 } 3866 new = old; 3867 new.flags &= ~PGA_QUEUE_OP_MASK; 3868 new.queue = PQ_NONE; 3869 } while (!vm_page_pqstate_commit_dequeue(m, &old, new)); 3870 3871 } 3872 3873 /* 3874 * Schedule the given page for insertion into the specified page queue. 3875 * Physical insertion of the page may be deferred indefinitely. 3876 */ 3877 static void 3878 vm_page_enqueue(vm_page_t m, uint8_t queue) 3879 { 3880 3881 KASSERT(m->a.queue == PQ_NONE && 3882 (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, 3883 ("%s: page %p is already enqueued", __func__, m)); 3884 KASSERT(m->ref_count > 0, 3885 ("%s: page %p does not carry any references", __func__, m)); 3886 3887 m->a.queue = queue; 3888 if ((m->a.flags & PGA_REQUEUE) == 0) 3889 vm_page_aflag_set(m, PGA_REQUEUE); 3890 vm_page_pqbatch_submit(m, queue); 3891 } 3892 3893 /* 3894 * vm_page_free_prep: 3895 * 3896 * Prepares the given page to be put on the free list, 3897 * disassociating it from any VM object. The caller may return 3898 * the page to the free list only if this function returns true. 3899 * 3900 * The object, if it exists, must be locked, and then the page must 3901 * be xbusy. Otherwise the page must be not busied. A managed 3902 * page must be unmapped. 3903 */ 3904 static bool 3905 vm_page_free_prep(vm_page_t m) 3906 { 3907 3908 /* 3909 * Synchronize with threads that have dropped a reference to this 3910 * page. 
3911 */ 3912 atomic_thread_fence_acq(); 3913 3914 #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) 3915 if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { 3916 uint64_t *p; 3917 int i; 3918 p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3919 for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) 3920 KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", 3921 m, i, (uintmax_t)*p)); 3922 } 3923 #endif 3924 if ((m->oflags & VPO_UNMANAGED) == 0) { 3925 KASSERT(!pmap_page_is_mapped(m), 3926 ("vm_page_free_prep: freeing mapped page %p", m)); 3927 KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, 3928 ("vm_page_free_prep: mapping flags set in page %p", m)); 3929 } else { 3930 KASSERT(m->a.queue == PQ_NONE, 3931 ("vm_page_free_prep: unmanaged page %p is queued", m)); 3932 } 3933 VM_CNT_INC(v_tfree); 3934 3935 if (m->object != NULL) { 3936 KASSERT(((m->oflags & VPO_UNMANAGED) != 0) == 3937 ((m->object->flags & OBJ_UNMANAGED) != 0), 3938 ("vm_page_free_prep: managed flag mismatch for page %p", 3939 m)); 3940 vm_page_assert_xbusied(m); 3941 3942 /* 3943 * The object reference can be released without an atomic 3944 * operation. 3945 */ 3946 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3947 m->ref_count == VPRC_OBJREF, 3948 ("vm_page_free_prep: page %p has unexpected ref_count %u", 3949 m, m->ref_count)); 3950 vm_page_object_remove(m); 3951 m->ref_count -= VPRC_OBJREF; 3952 } else 3953 vm_page_assert_unbusied(m); 3954 3955 vm_page_busy_free(m); 3956 3957 /* 3958 * If fictitious remove object association and 3959 * return. 3960 */ 3961 if ((m->flags & PG_FICTITIOUS) != 0) { 3962 KASSERT(m->ref_count == 1, 3963 ("fictitious page %p is referenced", m)); 3964 KASSERT(m->a.queue == PQ_NONE, 3965 ("fictitious page %p is queued", m)); 3966 return (false); 3967 } 3968 3969 /* 3970 * Pages need not be dequeued before they are returned to the physical 3971 * memory allocator, but they must at least be marked for a deferred 3972 * dequeue. 3973 */ 3974 if ((m->oflags & VPO_UNMANAGED) == 0) 3975 vm_page_dequeue_deferred(m); 3976 3977 m->valid = 0; 3978 vm_page_undirty(m); 3979 3980 if (m->ref_count != 0) 3981 panic("vm_page_free_prep: page %p has references", m); 3982 3983 /* 3984 * Restore the default memory attribute to the page. 3985 */ 3986 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 3987 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 3988 3989 #if VM_NRESERVLEVEL > 0 3990 /* 3991 * Determine whether the page belongs to a reservation. If the page was 3992 * allocated from a per-CPU cache, it cannot belong to a reservation, so 3993 * as an optimization, we avoid the check in that case. 3994 */ 3995 if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m)) 3996 return (false); 3997 #endif 3998 3999 return (true); 4000 } 4001 4002 /* 4003 * vm_page_free_toq: 4004 * 4005 * Returns the given page to the free list, disassociating it 4006 * from any VM object. 4007 * 4008 * The object must be locked. The page must be exclusively busied if it 4009 * belongs to an object. 
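 *
 * Pages allocated from a per-CPU cache (PG_PCPU_CACHE) are returned to
 * the corresponding UMA zone; all other pages are handed back to the
 * physical allocator with the domain free lock held.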
4010 */ 4011 static void 4012 vm_page_free_toq(vm_page_t m) 4013 { 4014 struct vm_domain *vmd; 4015 uma_zone_t zone; 4016 4017 if (!vm_page_free_prep(m)) 4018 return; 4019 4020 vmd = vm_pagequeue_domain(m); 4021 zone = vmd->vmd_pgcache[m->pool].zone; 4022 if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) { 4023 uma_zfree(zone, m); 4024 return; 4025 } 4026 vm_domain_free_lock(vmd); 4027 vm_phys_free_pages(m, 0); 4028 vm_domain_free_unlock(vmd); 4029 vm_domain_freecnt_inc(vmd, 1); 4030 } 4031 4032 /* 4033 * vm_page_free_pages_toq: 4034 * 4035 * Returns a list of pages to the free list, disassociating it 4036 * from any VM object. In other words, this is equivalent to 4037 * calling vm_page_free_toq() for each page of a list of VM objects. 4038 */ 4039 void 4040 vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) 4041 { 4042 vm_page_t m; 4043 int count; 4044 4045 if (SLIST_EMPTY(free)) 4046 return; 4047 4048 count = 0; 4049 while ((m = SLIST_FIRST(free)) != NULL) { 4050 count++; 4051 SLIST_REMOVE_HEAD(free, plinks.s.ss); 4052 vm_page_free_toq(m); 4053 } 4054 4055 if (update_wire_count) 4056 vm_wire_sub(count); 4057 } 4058 4059 /* 4060 * Mark this page as wired down. For managed pages, this prevents reclamation 4061 * by the page daemon, or when the containing object, if any, is destroyed. 4062 */ 4063 void 4064 vm_page_wire(vm_page_t m) 4065 { 4066 u_int old; 4067 4068 #ifdef INVARIANTS 4069 if (m->object != NULL && !vm_page_busied(m) && 4070 !vm_object_busied(m->object)) 4071 VM_OBJECT_ASSERT_LOCKED(m->object); 4072 #endif 4073 KASSERT((m->flags & PG_FICTITIOUS) == 0 || 4074 VPRC_WIRE_COUNT(m->ref_count) >= 1, 4075 ("vm_page_wire: fictitious page %p has zero wirings", m)); 4076 4077 old = atomic_fetchadd_int(&m->ref_count, 1); 4078 KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, 4079 ("vm_page_wire: counter overflow for page %p", m)); 4080 if (VPRC_WIRE_COUNT(old) == 0) { 4081 if ((m->oflags & VPO_UNMANAGED) == 0) 4082 vm_page_aflag_set(m, PGA_DEQUEUE); 4083 vm_wire_add(1); 4084 } 4085 } 4086 4087 /* 4088 * Attempt to wire a mapped page following a pmap lookup of that page. 4089 * This may fail if a thread is concurrently tearing down mappings of the page. 4090 * The transient failure is acceptable because it translates to the 4091 * failure of the caller pmap_extract_and_hold(), which should be then 4092 * followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages(). 4093 */ 4094 bool 4095 vm_page_wire_mapped(vm_page_t m) 4096 { 4097 u_int old; 4098 4099 old = m->ref_count; 4100 do { 4101 KASSERT(old > 0, 4102 ("vm_page_wire_mapped: wiring unreferenced page %p", m)); 4103 if ((old & VPRC_BLOCKED) != 0) 4104 return (false); 4105 } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); 4106 4107 if (VPRC_WIRE_COUNT(old) == 0) { 4108 if ((m->oflags & VPO_UNMANAGED) == 0) 4109 vm_page_aflag_set(m, PGA_DEQUEUE); 4110 vm_wire_add(1); 4111 } 4112 return (true); 4113 } 4114 4115 /* 4116 * Release a wiring reference to a managed page. If the page still belongs to 4117 * an object, update its position in the page queues to reflect the reference. 4118 * If the wiring was the last reference to the page, free the page. 4119 */ 4120 static void 4121 vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse) 4122 { 4123 u_int old; 4124 4125 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4126 ("%s: page %p is unmanaged", __func__, m)); 4127 4128 /* 4129 * Update LRU state before releasing the wiring reference. 
4130 * Use a release store when updating the reference count to 4131 * synchronize with vm_page_free_prep(). 4132 */ 4133 old = m->ref_count; 4134 do { 4135 KASSERT(VPRC_WIRE_COUNT(old) > 0, 4136 ("vm_page_unwire: wire count underflow for page %p", m)); 4137 4138 if (old > VPRC_OBJREF + 1) { 4139 /* 4140 * The page has at least one other wiring reference. An 4141 * earlier iteration of this loop may have called 4142 * vm_page_release_toq() and cleared PGA_DEQUEUE, so 4143 * re-set it if necessary. 4144 */ 4145 if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0) 4146 vm_page_aflag_set(m, PGA_DEQUEUE); 4147 } else if (old == VPRC_OBJREF + 1) { 4148 /* 4149 * This is the last wiring. Clear PGA_DEQUEUE and 4150 * update the page's queue state to reflect the 4151 * reference. If the page does not belong to an object 4152 * (i.e., the VPRC_OBJREF bit is clear), we only need to 4153 * clear leftover queue state. 4154 */ 4155 vm_page_release_toq(m, nqueue, noreuse); 4156 } else if (old == 1) { 4157 vm_page_aflag_clear(m, PGA_DEQUEUE); 4158 } 4159 } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); 4160 4161 if (VPRC_WIRE_COUNT(old) == 1) { 4162 vm_wire_sub(1); 4163 if (old == 1) 4164 vm_page_free(m); 4165 } 4166 } 4167 4168 /* 4169 * Release one wiring of the specified page, potentially allowing it to be 4170 * paged out. 4171 * 4172 * Only managed pages belonging to an object can be paged out. If the number 4173 * of wirings transitions to zero and the page is eligible for page out, then 4174 * the page is added to the specified paging queue. If the released wiring 4175 * represented the last reference to the page, the page is freed. 4176 */ 4177 void 4178 vm_page_unwire(vm_page_t m, uint8_t nqueue) 4179 { 4180 4181 KASSERT(nqueue < PQ_COUNT, 4182 ("vm_page_unwire: invalid queue %u request for page %p", 4183 nqueue, m)); 4184 4185 if ((m->oflags & VPO_UNMANAGED) != 0) { 4186 if (vm_page_unwire_noq(m) && m->ref_count == 0) 4187 vm_page_free(m); 4188 return; 4189 } 4190 vm_page_unwire_managed(m, nqueue, false); 4191 } 4192 4193 /* 4194 * Unwire a page without (re-)inserting it into a page queue. It is up 4195 * to the caller to enqueue, requeue, or free the page as appropriate. 4196 * In most cases involving managed pages, vm_page_unwire() should be used 4197 * instead. 4198 */ 4199 bool 4200 vm_page_unwire_noq(vm_page_t m) 4201 { 4202 u_int old; 4203 4204 old = vm_page_drop(m, 1); 4205 KASSERT(VPRC_WIRE_COUNT(old) != 0, 4206 ("%s: counter underflow for page %p", __func__, m)); 4207 KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1, 4208 ("%s: missing ref on fictitious page %p", __func__, m)); 4209 4210 if (VPRC_WIRE_COUNT(old) > 1) 4211 return (false); 4212 if ((m->oflags & VPO_UNMANAGED) == 0) 4213 vm_page_aflag_clear(m, PGA_DEQUEUE); 4214 vm_wire_sub(1); 4215 return (true); 4216 } 4217 4218 /* 4219 * Ensure that the page ends up in the specified page queue. If the page is 4220 * active or being moved to the active queue, ensure that its act_count is 4221 * at least ACT_INIT but do not otherwise mess with it. 
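 *
 * The update is performed with a lockless compare-and-set loop on the
 * page's atomic queue state; unmanaged and wired pages are left alone,
 * as are pages with a pending dequeue.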
4222 */ 4223 static __always_inline void 4224 vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) 4225 { 4226 vm_page_astate_t old, new; 4227 4228 KASSERT(m->ref_count > 0, 4229 ("%s: page %p does not carry any references", __func__, m)); 4230 KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, 4231 ("%s: invalid flags %x", __func__, nflag)); 4232 4233 if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) 4234 return; 4235 4236 old = vm_page_astate_load(m); 4237 do { 4238 if ((old.flags & PGA_DEQUEUE) != 0) 4239 break; 4240 new = old; 4241 new.flags &= ~PGA_QUEUE_OP_MASK; 4242 if (nqueue == PQ_ACTIVE) 4243 new.act_count = max(old.act_count, ACT_INIT); 4244 if (old.queue == nqueue) { 4245 /* 4246 * There is no need to requeue pages already in the 4247 * active queue. 4248 */ 4249 if (nqueue != PQ_ACTIVE || 4250 (old.flags & PGA_ENQUEUED) == 0) 4251 new.flags |= nflag; 4252 } else { 4253 new.flags |= nflag; 4254 new.queue = nqueue; 4255 } 4256 } while (!vm_page_pqstate_commit(m, &old, new)); 4257 } 4258 4259 /* 4260 * Put the specified page on the active list (if appropriate). 4261 */ 4262 void 4263 vm_page_activate(vm_page_t m) 4264 { 4265 4266 vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); 4267 } 4268 4269 /* 4270 * Move the specified page to the tail of the inactive queue, or requeue 4271 * the page if it is already in the inactive queue. 4272 */ 4273 void 4274 vm_page_deactivate(vm_page_t m) 4275 { 4276 4277 vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); 4278 } 4279 4280 void 4281 vm_page_deactivate_noreuse(vm_page_t m) 4282 { 4283 4284 vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); 4285 } 4286 4287 /* 4288 * Put a page in the laundry, or requeue it if it is already there. 4289 */ 4290 void 4291 vm_page_launder(vm_page_t m) 4292 { 4293 4294 vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); 4295 } 4296 4297 /* 4298 * Put a page in the PQ_UNSWAPPABLE holding queue. 4299 */ 4300 void 4301 vm_page_unswappable(vm_page_t m) 4302 { 4303 4304 VM_OBJECT_ASSERT_LOCKED(m->object); 4305 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4306 ("page %p already unswappable", m)); 4307 4308 vm_page_dequeue(m); 4309 vm_page_enqueue(m, PQ_UNSWAPPABLE); 4310 } 4311 4312 /* 4313 * Release a page back to the page queues in preparation for unwiring. 4314 */ 4315 static void 4316 vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse) 4317 { 4318 vm_page_astate_t old, new; 4319 uint16_t nflag; 4320 4321 /* 4322 * Use a check of the valid bits to determine whether we should 4323 * accelerate reclamation of the page. The object lock might not be 4324 * held here, in which case the check is racy. At worst we will either 4325 * accelerate reclamation of a valid page and violate LRU, or 4326 * unnecessarily defer reclamation of an invalid page. 4327 * 4328 * If we were asked to not cache the page, place it near the head of the 4329 * inactive queue so that is reclaimed sooner. 4330 */ 4331 if (noreuse || vm_page_none_valid(m)) { 4332 nqueue = PQ_INACTIVE; 4333 nflag = PGA_REQUEUE_HEAD; 4334 } else { 4335 nflag = PGA_REQUEUE; 4336 } 4337 4338 old = vm_page_astate_load(m); 4339 do { 4340 new = old; 4341 4342 /* 4343 * If the page is already in the active queue and we are not 4344 * trying to accelerate reclamation, simply mark it as 4345 * referenced and avoid any queue operations. 
4346 */ 4347 new.flags &= ~PGA_QUEUE_OP_MASK; 4348 if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE && 4349 (old.flags & PGA_ENQUEUED) != 0) 4350 new.flags |= PGA_REFERENCED; 4351 else { 4352 new.flags |= nflag; 4353 new.queue = nqueue; 4354 } 4355 } while (!vm_page_pqstate_commit(m, &old, new)); 4356 } 4357 4358 /* 4359 * Unwire a page and either attempt to free it or re-add it to the page queues. 4360 */ 4361 void 4362 vm_page_release(vm_page_t m, int flags) 4363 { 4364 vm_object_t object; 4365 4366 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4367 ("vm_page_release: page %p is unmanaged", m)); 4368 4369 if ((flags & VPR_TRYFREE) != 0) { 4370 for (;;) { 4371 object = atomic_load_ptr(&m->object); 4372 if (object == NULL) 4373 break; 4374 /* Depends on type-stability. */ 4375 if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object)) 4376 break; 4377 if (object == m->object) { 4378 vm_page_release_locked(m, flags); 4379 VM_OBJECT_WUNLOCK(object); 4380 return; 4381 } 4382 VM_OBJECT_WUNLOCK(object); 4383 } 4384 } 4385 vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); 4386 } 4387 4388 /* See vm_page_release(). */ 4389 void 4390 vm_page_release_locked(vm_page_t m, int flags) 4391 { 4392 4393 VM_OBJECT_ASSERT_WLOCKED(m->object); 4394 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4395 ("vm_page_release_locked: page %p is unmanaged", m)); 4396 4397 if (vm_page_unwire_noq(m)) { 4398 if ((flags & VPR_TRYFREE) != 0 && 4399 (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) && 4400 m->dirty == 0 && vm_page_tryxbusy(m)) { 4401 /* 4402 * An unlocked lookup may have wired the page before the 4403 * busy lock was acquired, in which case the page must 4404 * not be freed. 4405 */ 4406 if (__predict_true(!vm_page_wired(m))) { 4407 vm_page_free(m); 4408 return; 4409 } 4410 vm_page_xunbusy(m); 4411 } else { 4412 vm_page_release_toq(m, PQ_INACTIVE, flags != 0); 4413 } 4414 } 4415 } 4416 4417 static bool 4418 vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t)) 4419 { 4420 u_int old; 4421 4422 KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0, 4423 ("vm_page_try_blocked_op: page %p has no object", m)); 4424 KASSERT(vm_page_busied(m), 4425 ("vm_page_try_blocked_op: page %p is not busy", m)); 4426 VM_OBJECT_ASSERT_LOCKED(m->object); 4427 4428 old = m->ref_count; 4429 do { 4430 KASSERT(old != 0, 4431 ("vm_page_try_blocked_op: page %p has no references", m)); 4432 if (VPRC_WIRE_COUNT(old) != 0) 4433 return (false); 4434 } while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED)); 4435 4436 (op)(m); 4437 4438 /* 4439 * If the object is read-locked, new wirings may be created via an 4440 * object lookup. 4441 */ 4442 old = vm_page_drop(m, VPRC_BLOCKED); 4443 KASSERT(!VM_OBJECT_WOWNED(m->object) || 4444 old == (VPRC_BLOCKED | VPRC_OBJREF), 4445 ("vm_page_try_blocked_op: unexpected refcount value %u for %p", 4446 old, m)); 4447 return (true); 4448 } 4449 4450 /* 4451 * Atomically check for wirings and remove all mappings of the page. 4452 */ 4453 bool 4454 vm_page_try_remove_all(vm_page_t m) 4455 { 4456 4457 return (vm_page_try_blocked_op(m, pmap_remove_all)); 4458 } 4459 4460 /* 4461 * Atomically check for wirings and remove all writeable mappings of the page. 4462 */ 4463 bool 4464 vm_page_try_remove_write(vm_page_t m) 4465 { 4466 4467 return (vm_page_try_blocked_op(m, pmap_remove_write)); 4468 } 4469 4470 /* 4471 * vm_page_advise 4472 * 4473 * Apply the specified advice to the given page. 
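 *
 * MADV_FREE marks the page clean so that it can be reclaimed without
 * first being paged out, MADV_WILLNEED activates the page, and
 * MADV_DONTNEED pushes clean pages toward the head of the inactive queue
 * and dirty pages into the laundry; other advice values have no effect
 * here.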
4474 */ 4475 void 4476 vm_page_advise(vm_page_t m, int advice) 4477 { 4478 4479 VM_OBJECT_ASSERT_WLOCKED(m->object); 4480 vm_page_assert_xbusied(m); 4481 4482 if (advice == MADV_FREE) 4483 /* 4484 * Mark the page clean. This will allow the page to be freed 4485 * without first paging it out. MADV_FREE pages are often 4486 * quickly reused by malloc(3), so we do not do anything that 4487 * would result in a page fault on a later access. 4488 */ 4489 vm_page_undirty(m); 4490 else if (advice != MADV_DONTNEED) { 4491 if (advice == MADV_WILLNEED) 4492 vm_page_activate(m); 4493 return; 4494 } 4495 4496 if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) 4497 vm_page_dirty(m); 4498 4499 /* 4500 * Clear any references to the page. Otherwise, the page daemon will 4501 * immediately reactivate the page. 4502 */ 4503 vm_page_aflag_clear(m, PGA_REFERENCED); 4504 4505 /* 4506 * Place clean pages near the head of the inactive queue rather than 4507 * the tail, thus defeating the queue's LRU operation and ensuring that 4508 * the page will be reused quickly. Dirty pages not already in the 4509 * laundry are moved there. 4510 */ 4511 if (m->dirty == 0) 4512 vm_page_deactivate_noreuse(m); 4513 else if (!vm_page_in_laundry(m)) 4514 vm_page_launder(m); 4515 } 4516 4517 /* 4518 * vm_page_grab_release 4519 * 4520 * Helper routine for grab functions to release busy on return. 4521 */ 4522 static inline void 4523 vm_page_grab_release(vm_page_t m, int allocflags) 4524 { 4525 4526 if ((allocflags & VM_ALLOC_NOBUSY) != 0) { 4527 if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) 4528 vm_page_sunbusy(m); 4529 else 4530 vm_page_xunbusy(m); 4531 } 4532 } 4533 4534 /* 4535 * vm_page_grab_sleep 4536 * 4537 * Sleep for busy according to VM_ALLOC_ parameters. Returns true 4538 * if the caller should retry and false otherwise. 4539 * 4540 * If the object is locked on entry the object will be unlocked with 4541 * false returns and still locked but possibly having been dropped 4542 * with true returns. 4543 */ 4544 static bool 4545 vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex, 4546 const char *wmesg, int allocflags, bool locked) 4547 { 4548 4549 if ((allocflags & VM_ALLOC_NOWAIT) != 0) 4550 return (false); 4551 4552 /* 4553 * Reference the page before unlocking and sleeping so that 4554 * the page daemon is less likely to reclaim it. 4555 */ 4556 if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0) 4557 vm_page_reference(m); 4558 4559 if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) && 4560 locked) 4561 VM_OBJECT_WLOCK(object); 4562 if ((allocflags & VM_ALLOC_WAITFAIL) != 0) 4563 return (false); 4564 4565 return (true); 4566 } 4567 4568 /* 4569 * Assert that the grab flags are valid. 4570 */ 4571 static inline void 4572 vm_page_grab_check(int allocflags) 4573 { 4574 4575 KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || 4576 (allocflags & VM_ALLOC_WIRED) != 0, 4577 ("vm_page_grab*: the pages must be busied or wired")); 4578 4579 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 4580 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 4581 ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); 4582 } 4583 4584 /* 4585 * Calculate the page allocation flags for grab. 
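 * Grab-specific sleep and busy control flags are stripped and the rest
 * are translated into flags understood by vm_page_alloc(); unless
 * VM_ALLOC_NOWAIT was specified, VM_ALLOC_WAITFAIL is added so that the
 * grab loop regains control after sleeping and can retry the lookup.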
4586 */ 4587 static inline int 4588 vm_page_grab_pflags(int allocflags) 4589 { 4590 int pflags; 4591 4592 pflags = allocflags & 4593 ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | 4594 VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY); 4595 if ((allocflags & VM_ALLOC_NOWAIT) == 0) 4596 pflags |= VM_ALLOC_WAITFAIL; 4597 if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) 4598 pflags |= VM_ALLOC_SBUSY; 4599 4600 return (pflags); 4601 } 4602 4603 /* 4604 * Grab a page, waiting until we are waken up due to the page 4605 * changing state. We keep on waiting, if the page continues 4606 * to be in the object. If the page doesn't exist, first allocate it 4607 * and then conditionally zero it. 4608 * 4609 * This routine may sleep. 4610 * 4611 * The object must be locked on entry. The lock will, however, be released 4612 * and reacquired if the routine sleeps. 4613 */ 4614 vm_page_t 4615 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) 4616 { 4617 vm_page_t m; 4618 4619 VM_OBJECT_ASSERT_WLOCKED(object); 4620 vm_page_grab_check(allocflags); 4621 4622 retrylookup: 4623 if ((m = vm_page_lookup(object, pindex)) != NULL) { 4624 if (!vm_page_tryacquire(m, allocflags)) { 4625 if (vm_page_grab_sleep(object, m, pindex, "pgrbwt", 4626 allocflags, true)) 4627 goto retrylookup; 4628 return (NULL); 4629 } 4630 goto out; 4631 } 4632 if ((allocflags & VM_ALLOC_NOCREAT) != 0) 4633 return (NULL); 4634 m = vm_page_alloc(object, pindex, vm_page_grab_pflags(allocflags)); 4635 if (m == NULL) { 4636 if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) 4637 return (NULL); 4638 goto retrylookup; 4639 } 4640 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) 4641 pmap_zero_page(m); 4642 4643 out: 4644 vm_page_grab_release(m, allocflags); 4645 4646 return (m); 4647 } 4648 4649 /* 4650 * Locklessly attempt to acquire a page given a (object, pindex) tuple 4651 * and an optional previous page to avoid the radix lookup. The resulting 4652 * page will be validated against the identity tuple and busied or wired 4653 * as requested. A NULL *mp return guarantees that the page was not in 4654 * radix at the time of the call but callers must perform higher level 4655 * synchronization or retry the operation under a lock if they require 4656 * an atomic answer. This is the only lock free validation routine, 4657 * other routines can depend on the resulting page state. 4658 * 4659 * The return value indicates whether the operation failed due to caller 4660 * flags. The return is tri-state with mp: 4661 * 4662 * (true, *mp != NULL) - The operation was successful. 4663 * (true, *mp == NULL) - The page was not found in tree. 4664 * (false, *mp == NULL) - WAITFAIL or NOWAIT prevented acquisition. 4665 */ 4666 static bool 4667 vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex, 4668 vm_page_t prev, vm_page_t *mp, int allocflags) 4669 { 4670 vm_page_t m; 4671 4672 vm_page_grab_check(allocflags); 4673 MPASS(prev == NULL || vm_page_busied(prev) || vm_page_wired(prev)); 4674 4675 *mp = NULL; 4676 for (;;) { 4677 /* 4678 * We may see a false NULL here because the previous page 4679 * has been removed or just inserted and the list is loaded 4680 * without barriers. Switch to radix to verify. 4681 */ 4682 if (prev == NULL || (m = TAILQ_NEXT(prev, listq)) == NULL || 4683 QMD_IS_TRASHED(m) || m->pindex != pindex || 4684 atomic_load_ptr(&m->object) != object) { 4685 prev = NULL; 4686 /* 4687 * This guarantees the result is instantaneously 4688 * correct. 
4689 */ 4690 m = vm_radix_lookup_unlocked(&object->rtree, pindex); 4691 } 4692 if (m == NULL) 4693 return (true); 4694 if (vm_page_trybusy(m, allocflags)) { 4695 if (m->object == object && m->pindex == pindex) 4696 break; 4697 /* relookup. */ 4698 vm_page_busy_release(m); 4699 cpu_spinwait(); 4700 continue; 4701 } 4702 if (!vm_page_grab_sleep(object, m, pindex, "pgnslp", 4703 allocflags, false)) 4704 return (false); 4705 } 4706 if ((allocflags & VM_ALLOC_WIRED) != 0) 4707 vm_page_wire(m); 4708 vm_page_grab_release(m, allocflags); 4709 *mp = m; 4710 return (true); 4711 } 4712 4713 /* 4714 * Try to locklessly grab a page and fall back to the object lock if NOCREAT 4715 * is not set. 4716 */ 4717 vm_page_t 4718 vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags) 4719 { 4720 vm_page_t m; 4721 4722 vm_page_grab_check(allocflags); 4723 4724 if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags)) 4725 return (NULL); 4726 if (m != NULL) 4727 return (m); 4728 4729 /* 4730 * The radix lockless lookup should never return a false negative 4731 * errors. If the user specifies NOCREAT they are guaranteed there 4732 * was no page present at the instant of the call. A NOCREAT caller 4733 * must handle create races gracefully. 4734 */ 4735 if ((allocflags & VM_ALLOC_NOCREAT) != 0) 4736 return (NULL); 4737 4738 VM_OBJECT_WLOCK(object); 4739 m = vm_page_grab(object, pindex, allocflags); 4740 VM_OBJECT_WUNLOCK(object); 4741 4742 return (m); 4743 } 4744 4745 /* 4746 * Grab a page and make it valid, paging in if necessary. Pages missing from 4747 * their pager are zero filled and validated. If a VM_ALLOC_COUNT is supplied 4748 * and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought 4749 * in simultaneously. Additional pages will be left on a paging queue but 4750 * will neither be wired nor busy regardless of allocflags. 4751 */ 4752 int 4753 vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) 4754 { 4755 vm_page_t m; 4756 vm_page_t ma[VM_INITIAL_PAGEIN]; 4757 int after, i, pflags, rv; 4758 4759 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 4760 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 4761 ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); 4762 KASSERT((allocflags & 4763 (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, 4764 ("vm_page_grab_valid: Invalid flags 0x%X", allocflags)); 4765 VM_OBJECT_ASSERT_WLOCKED(object); 4766 pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY | 4767 VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY); 4768 pflags |= VM_ALLOC_WAITFAIL; 4769 4770 retrylookup: 4771 if ((m = vm_page_lookup(object, pindex)) != NULL) { 4772 /* 4773 * If the page is fully valid it can only become invalid 4774 * with the object lock held. If it is not valid it can 4775 * become valid with the busy lock held. Therefore, we 4776 * may unnecessarily lock the exclusive busy here if we 4777 * race with I/O completion not using the object lock. 4778 * However, we will not end up with an invalid page and a 4779 * shared lock. 4780 */ 4781 if (!vm_page_trybusy(m, 4782 vm_page_all_valid(m) ? 
allocflags : 0)) { 4783 (void)vm_page_grab_sleep(object, m, pindex, "pgrbwt", 4784 allocflags, true); 4785 goto retrylookup; 4786 } 4787 if (vm_page_all_valid(m)) 4788 goto out; 4789 if ((allocflags & VM_ALLOC_NOCREAT) != 0) { 4790 vm_page_busy_release(m); 4791 *mp = NULL; 4792 return (VM_PAGER_FAIL); 4793 } 4794 } else if ((allocflags & VM_ALLOC_NOCREAT) != 0) { 4795 *mp = NULL; 4796 return (VM_PAGER_FAIL); 4797 } else if ((m = vm_page_alloc(object, pindex, pflags)) == NULL) { 4798 if (!vm_pager_can_alloc_page(object, pindex)) { 4799 *mp = NULL; 4800 return (VM_PAGER_AGAIN); 4801 } 4802 goto retrylookup; 4803 } 4804 4805 vm_page_assert_xbusied(m); 4806 if (vm_pager_has_page(object, pindex, NULL, &after)) { 4807 after = MIN(after, VM_INITIAL_PAGEIN); 4808 after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT); 4809 after = MAX(after, 1); 4810 ma[0] = m; 4811 for (i = 1; i < after; i++) { 4812 if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { 4813 if (vm_page_any_valid(ma[i]) || 4814 !vm_page_tryxbusy(ma[i])) 4815 break; 4816 } else { 4817 ma[i] = vm_page_alloc(object, m->pindex + i, 4818 VM_ALLOC_NORMAL); 4819 if (ma[i] == NULL) 4820 break; 4821 } 4822 } 4823 after = i; 4824 vm_object_pip_add(object, after); 4825 VM_OBJECT_WUNLOCK(object); 4826 rv = vm_pager_get_pages(object, ma, after, NULL, NULL); 4827 VM_OBJECT_WLOCK(object); 4828 vm_object_pip_wakeupn(object, after); 4829 /* Pager may have replaced a page. */ 4830 m = ma[0]; 4831 if (rv != VM_PAGER_OK) { 4832 for (i = 0; i < after; i++) { 4833 if (!vm_page_wired(ma[i])) 4834 vm_page_free(ma[i]); 4835 else 4836 vm_page_xunbusy(ma[i]); 4837 } 4838 *mp = NULL; 4839 return (rv); 4840 } 4841 for (i = 1; i < after; i++) 4842 vm_page_readahead_finish(ma[i]); 4843 MPASS(vm_page_all_valid(m)); 4844 } else { 4845 vm_page_zero_invalid(m, TRUE); 4846 } 4847 out: 4848 if ((allocflags & VM_ALLOC_WIRED) != 0) 4849 vm_page_wire(m); 4850 if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m)) 4851 vm_page_busy_downgrade(m); 4852 else if ((allocflags & VM_ALLOC_NOBUSY) != 0) 4853 vm_page_busy_release(m); 4854 *mp = m; 4855 return (VM_PAGER_OK); 4856 } 4857 4858 /* 4859 * Locklessly grab a valid page. If the page is not valid or not yet 4860 * allocated this will fall back to the object lock method. 4861 */ 4862 int 4863 vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object, 4864 vm_pindex_t pindex, int allocflags) 4865 { 4866 vm_page_t m; 4867 int flags; 4868 int error; 4869 4870 KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || 4871 (allocflags & VM_ALLOC_IGN_SBUSY) != 0, 4872 ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY " 4873 "mismatch")); 4874 KASSERT((allocflags & 4875 (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, 4876 ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags)); 4877 4878 /* 4879 * Attempt a lockless lookup and busy. We need at least an sbusy 4880 * before we can inspect the valid field and return a wired page. 
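 * VM_ALLOC_NOBUSY and VM_ALLOC_WIRED are therefore masked off for the
 * lockless attempt; wiring and busy release are applied afterwards
 * according to the caller's original flags.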
4881 */ 4882 flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); 4883 if (!vm_page_acquire_unlocked(object, pindex, NULL, mp, flags)) 4884 return (VM_PAGER_FAIL); 4885 if ((m = *mp) != NULL) { 4886 if (vm_page_all_valid(m)) { 4887 if ((allocflags & VM_ALLOC_WIRED) != 0) 4888 vm_page_wire(m); 4889 vm_page_grab_release(m, allocflags); 4890 return (VM_PAGER_OK); 4891 } 4892 vm_page_busy_release(m); 4893 } 4894 if ((allocflags & VM_ALLOC_NOCREAT) != 0) { 4895 *mp = NULL; 4896 return (VM_PAGER_FAIL); 4897 } 4898 VM_OBJECT_WLOCK(object); 4899 error = vm_page_grab_valid(mp, object, pindex, allocflags); 4900 VM_OBJECT_WUNLOCK(object); 4901 4902 return (error); 4903 } 4904 4905 /* 4906 * Return the specified range of pages from the given object. For each 4907 * page offset within the range, if a page already exists within the object 4908 * at that offset and it is busy, then wait for it to change state. If, 4909 * instead, the page doesn't exist, then allocate it. 4910 * 4911 * The caller must always specify an allocation class. 4912 * 4913 * allocation classes: 4914 * VM_ALLOC_NORMAL normal process request 4915 * VM_ALLOC_SYSTEM system *really* needs the pages 4916 * 4917 * The caller must always specify that the pages are to be busied and/or 4918 * wired. 4919 * 4920 * optional allocation flags: 4921 * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages 4922 * VM_ALLOC_NOBUSY do not exclusive busy the page 4923 * VM_ALLOC_NOWAIT do not sleep 4924 * VM_ALLOC_SBUSY set page to sbusy state 4925 * VM_ALLOC_WIRED wire the pages 4926 * VM_ALLOC_ZERO zero and validate any invalid pages 4927 * 4928 * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it 4929 * may return a partial prefix of the requested range. 4930 */ 4931 int 4932 vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, 4933 vm_page_t *ma, int count) 4934 { 4935 vm_page_t m, mpred; 4936 int pflags; 4937 int i; 4938 4939 VM_OBJECT_ASSERT_WLOCKED(object); 4940 KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, 4941 ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); 4942 KASSERT(count > 0, 4943 ("vm_page_grab_pages: invalid page count %d", count)); 4944 vm_page_grab_check(allocflags); 4945 4946 pflags = vm_page_grab_pflags(allocflags); 4947 i = 0; 4948 retrylookup: 4949 m = vm_radix_lookup_le(&object->rtree, pindex + i); 4950 if (m == NULL || m->pindex != pindex + i) { 4951 mpred = m; 4952 m = NULL; 4953 } else 4954 mpred = TAILQ_PREV(m, pglist, listq); 4955 for (; i < count; i++) { 4956 if (m != NULL) { 4957 if (!vm_page_tryacquire(m, allocflags)) { 4958 if (vm_page_grab_sleep(object, m, pindex + i, 4959 "grbmaw", allocflags, true)) 4960 goto retrylookup; 4961 break; 4962 } 4963 } else { 4964 if ((allocflags & VM_ALLOC_NOCREAT) != 0) 4965 break; 4966 m = vm_page_alloc_after(object, pindex + i, 4967 pflags | VM_ALLOC_COUNT(count - i), mpred); 4968 if (m == NULL) { 4969 if ((allocflags & (VM_ALLOC_NOWAIT | 4970 VM_ALLOC_WAITFAIL)) != 0) 4971 break; 4972 goto retrylookup; 4973 } 4974 } 4975 if (vm_page_none_valid(m) && 4976 (allocflags & VM_ALLOC_ZERO) != 0) { 4977 if ((m->flags & PG_ZERO) == 0) 4978 pmap_zero_page(m); 4979 vm_page_valid(m); 4980 } 4981 vm_page_grab_release(m, allocflags); 4982 ma[i] = mpred = m; 4983 m = vm_page_next(m); 4984 } 4985 return (i); 4986 } 4987 4988 /* 4989 * Unlocked variant of vm_page_grab_pages(). This accepts the same flags 4990 * and will fall back to the locked variant to handle allocation. 
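 * Resident pages are acquired locklessly; if a page is missing and
 * VM_ALLOC_NOCREAT was not specified, the remainder of the range is
 * handled by vm_page_grab_pages() with the object lock held.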
4991 */ 4992 int 4993 vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex, 4994 int allocflags, vm_page_t *ma, int count) 4995 { 4996 vm_page_t m, pred; 4997 int flags; 4998 int i; 4999 5000 KASSERT(count > 0, 5001 ("vm_page_grab_pages_unlocked: invalid page count %d", count)); 5002 vm_page_grab_check(allocflags); 5003 5004 /* 5005 * Modify flags for lockless acquire to hold the page until we 5006 * set it valid if necessary. 5007 */ 5008 flags = allocflags & ~VM_ALLOC_NOBUSY; 5009 pred = NULL; 5010 for (i = 0; i < count; i++, pindex++) { 5011 if (!vm_page_acquire_unlocked(object, pindex, pred, &m, flags)) 5012 return (i); 5013 if (m == NULL) 5014 break; 5015 if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) { 5016 if ((m->flags & PG_ZERO) == 0) 5017 pmap_zero_page(m); 5018 vm_page_valid(m); 5019 } 5020 /* m will still be wired or busy according to flags. */ 5021 vm_page_grab_release(m, allocflags); 5022 pred = ma[i] = m; 5023 } 5024 if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0) 5025 return (i); 5026 count -= i; 5027 VM_OBJECT_WLOCK(object); 5028 i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count); 5029 VM_OBJECT_WUNLOCK(object); 5030 5031 return (i); 5032 } 5033 5034 /* 5035 * Mapping function for valid or dirty bits in a page. 5036 * 5037 * Inputs are required to range within a page. 5038 */ 5039 vm_page_bits_t 5040 vm_page_bits(int base, int size) 5041 { 5042 int first_bit; 5043 int last_bit; 5044 5045 KASSERT( 5046 base + size <= PAGE_SIZE, 5047 ("vm_page_bits: illegal base/size %d/%d", base, size) 5048 ); 5049 5050 if (size == 0) /* handle degenerate case */ 5051 return (0); 5052 5053 first_bit = base >> DEV_BSHIFT; 5054 last_bit = (base + size - 1) >> DEV_BSHIFT; 5055 5056 return (((vm_page_bits_t)2 << last_bit) - 5057 ((vm_page_bits_t)1 << first_bit)); 5058 } 5059 5060 void 5061 vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set) 5062 { 5063 5064 #if PAGE_SIZE == 32768 5065 atomic_set_64((uint64_t *)bits, set); 5066 #elif PAGE_SIZE == 16384 5067 atomic_set_32((uint32_t *)bits, set); 5068 #elif (PAGE_SIZE == 8192) && defined(atomic_set_16) 5069 atomic_set_16((uint16_t *)bits, set); 5070 #elif (PAGE_SIZE == 4096) && defined(atomic_set_8) 5071 atomic_set_8((uint8_t *)bits, set); 5072 #else /* PAGE_SIZE <= 8192 */ 5073 uintptr_t addr; 5074 int shift; 5075 5076 addr = (uintptr_t)bits; 5077 /* 5078 * Use a trick to perform a 32-bit atomic on the 5079 * containing aligned word, to not depend on the existence 5080 * of atomic_{set, clear}_{8, 16}. 
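 * For example, with 4 KB pages vm_page_bits_t is a single byte; if that
 * byte lies at offset 1 within its aligned 32-bit word, the bits are
 * shifted left by 8 on a little-endian machine before the atomic_set_32()
 * below, so only the intended byte is modified.  On big-endian machines
 * the shift is computed from the opposite end of the word.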
5081 */ 5082 shift = addr & (sizeof(uint32_t) - 1); 5083 #if BYTE_ORDER == BIG_ENDIAN 5084 shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; 5085 #else 5086 shift *= NBBY; 5087 #endif 5088 addr &= ~(sizeof(uint32_t) - 1); 5089 atomic_set_32((uint32_t *)addr, set << shift); 5090 #endif /* PAGE_SIZE */ 5091 } 5092 5093 static inline void 5094 vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear) 5095 { 5096 5097 #if PAGE_SIZE == 32768 5098 atomic_clear_64((uint64_t *)bits, clear); 5099 #elif PAGE_SIZE == 16384 5100 atomic_clear_32((uint32_t *)bits, clear); 5101 #elif (PAGE_SIZE == 8192) && defined(atomic_clear_16) 5102 atomic_clear_16((uint16_t *)bits, clear); 5103 #elif (PAGE_SIZE == 4096) && defined(atomic_clear_8) 5104 atomic_clear_8((uint8_t *)bits, clear); 5105 #else /* PAGE_SIZE <= 8192 */ 5106 uintptr_t addr; 5107 int shift; 5108 5109 addr = (uintptr_t)bits; 5110 /* 5111 * Use a trick to perform a 32-bit atomic on the 5112 * containing aligned word, to not depend on the existence 5113 * of atomic_{set, clear}_{8, 16}. 5114 */ 5115 shift = addr & (sizeof(uint32_t) - 1); 5116 #if BYTE_ORDER == BIG_ENDIAN 5117 shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; 5118 #else 5119 shift *= NBBY; 5120 #endif 5121 addr &= ~(sizeof(uint32_t) - 1); 5122 atomic_clear_32((uint32_t *)addr, clear << shift); 5123 #endif /* PAGE_SIZE */ 5124 } 5125 5126 static inline vm_page_bits_t 5127 vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits) 5128 { 5129 #if PAGE_SIZE == 32768 5130 uint64_t old; 5131 5132 old = *bits; 5133 while (atomic_fcmpset_64(bits, &old, newbits) == 0); 5134 return (old); 5135 #elif PAGE_SIZE == 16384 5136 uint32_t old; 5137 5138 old = *bits; 5139 while (atomic_fcmpset_32(bits, &old, newbits) == 0); 5140 return (old); 5141 #elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16) 5142 uint16_t old; 5143 5144 old = *bits; 5145 while (atomic_fcmpset_16(bits, &old, newbits) == 0); 5146 return (old); 5147 #elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8) 5148 uint8_t old; 5149 5150 old = *bits; 5151 while (atomic_fcmpset_8(bits, &old, newbits) == 0); 5152 return (old); 5153 #else /* PAGE_SIZE <= 4096*/ 5154 uintptr_t addr; 5155 uint32_t old, new, mask; 5156 int shift; 5157 5158 addr = (uintptr_t)bits; 5159 /* 5160 * Use a trick to perform a 32-bit atomic on the 5161 * containing aligned word, to not depend on the existence 5162 * of atomic_{set, swap, clear}_{8, 16}. 5163 */ 5164 shift = addr & (sizeof(uint32_t) - 1); 5165 #if BYTE_ORDER == BIG_ENDIAN 5166 shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; 5167 #else 5168 shift *= NBBY; 5169 #endif 5170 addr &= ~(sizeof(uint32_t) - 1); 5171 mask = VM_PAGE_BITS_ALL << shift; 5172 5173 old = *bits; 5174 do { 5175 new = old & ~mask; 5176 new |= newbits << shift; 5177 } while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0); 5178 return (old >> shift); 5179 #endif /* PAGE_SIZE */ 5180 } 5181 5182 /* 5183 * vm_page_set_valid_range: 5184 * 5185 * Sets portions of a page valid. The arguments are expected 5186 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 5187 * of any partial chunks touched by the range. The invalid portion of 5188 * such chunks will be zeroed. 5189 * 5190 * (base + size) must be less then or equal to PAGE_SIZE. 
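 *
 * For example, with a DEV_BSIZE of 512, a call with base 0 and size 700
 * zeroes bytes 700 through 1023 (the invalid tail of the second block,
 * provided that block was previously invalid) and then marks blocks 0
 * and 1 valid.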
5191 */ 5192 void 5193 vm_page_set_valid_range(vm_page_t m, int base, int size) 5194 { 5195 int endoff, frag; 5196 vm_page_bits_t pagebits; 5197 5198 vm_page_assert_busied(m); 5199 if (size == 0) /* handle degenerate case */ 5200 return; 5201 5202 /* 5203 * If the base is not DEV_BSIZE aligned and the valid 5204 * bit is clear, we have to zero out a portion of the 5205 * first block. 5206 */ 5207 if ((frag = rounddown2(base, DEV_BSIZE)) != base && 5208 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) 5209 pmap_zero_page_area(m, frag, base - frag); 5210 5211 /* 5212 * If the ending offset is not DEV_BSIZE aligned and the 5213 * valid bit is clear, we have to zero out a portion of 5214 * the last block. 5215 */ 5216 endoff = base + size; 5217 if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && 5218 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) 5219 pmap_zero_page_area(m, endoff, 5220 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 5221 5222 /* 5223 * Assert that no previously invalid block that is now being validated 5224 * is already dirty. 5225 */ 5226 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, 5227 ("vm_page_set_valid_range: page %p is dirty", m)); 5228 5229 /* 5230 * Set valid bits inclusive of any overlap. 5231 */ 5232 pagebits = vm_page_bits(base, size); 5233 if (vm_page_xbusied(m)) 5234 m->valid |= pagebits; 5235 else 5236 vm_page_bits_set(m, &m->valid, pagebits); 5237 } 5238 5239 /* 5240 * Set the page dirty bits and free the invalid swap space if 5241 * present. Returns the previous dirty bits. 5242 */ 5243 vm_page_bits_t 5244 vm_page_set_dirty(vm_page_t m) 5245 { 5246 vm_page_bits_t old; 5247 5248 VM_PAGE_OBJECT_BUSY_ASSERT(m); 5249 5250 if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) { 5251 old = m->dirty; 5252 m->dirty = VM_PAGE_BITS_ALL; 5253 } else 5254 old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL); 5255 if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0) 5256 vm_pager_page_unswapped(m); 5257 5258 return (old); 5259 } 5260 5261 /* 5262 * Clear the given bits from the specified page's dirty field. 5263 */ 5264 static __inline void 5265 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) 5266 { 5267 5268 vm_page_assert_busied(m); 5269 5270 /* 5271 * If the page is xbusied and not write mapped we are the 5272 * only thread that can modify dirty bits. Otherwise, The pmap 5273 * layer can call vm_page_dirty() without holding a distinguished 5274 * lock. The combination of page busy and atomic operations 5275 * suffice to guarantee consistency of the page dirty field. 5276 */ 5277 if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) 5278 m->dirty &= ~pagebits; 5279 else 5280 vm_page_bits_clear(m, &m->dirty, pagebits); 5281 } 5282 5283 /* 5284 * vm_page_set_validclean: 5285 * 5286 * Sets portions of a page valid and clean. The arguments are expected 5287 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 5288 * of any partial chunks touched by the range. The invalid portion of 5289 * such chunks will be zero'd. 5290 * 5291 * (base + size) must be less then or equal to PAGE_SIZE. 5292 */ 5293 void 5294 vm_page_set_validclean(vm_page_t m, int base, int size) 5295 { 5296 vm_page_bits_t oldvalid, pagebits; 5297 int endoff, frag; 5298 5299 vm_page_assert_busied(m); 5300 if (size == 0) /* handle degenerate case */ 5301 return; 5302 5303 /* 5304 * If the base is not DEV_BSIZE aligned and the valid 5305 * bit is clear, we have to zero out a portion of the 5306 * first block. 
5307 */ 5308 if ((frag = rounddown2(base, DEV_BSIZE)) != base && 5309 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) 5310 pmap_zero_page_area(m, frag, base - frag); 5311 5312 /* 5313 * If the ending offset is not DEV_BSIZE aligned and the 5314 * valid bit is clear, we have to zero out a portion of 5315 * the last block. 5316 */ 5317 endoff = base + size; 5318 if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && 5319 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) 5320 pmap_zero_page_area(m, endoff, 5321 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 5322 5323 /* 5324 * Set valid, clear dirty bits. If validating the entire 5325 * page we can safely clear the pmap modify bit. We also 5326 * use this opportunity to clear the PGA_NOSYNC flag. If a process 5327 * takes a write fault on a MAP_NOSYNC memory area the flag will 5328 * be set again. 5329 * 5330 * We set valid bits inclusive of any overlap, but we can only 5331 * clear dirty bits for DEV_BSIZE chunks that are fully within 5332 * the range. 5333 */ 5334 oldvalid = m->valid; 5335 pagebits = vm_page_bits(base, size); 5336 if (vm_page_xbusied(m)) 5337 m->valid |= pagebits; 5338 else 5339 vm_page_bits_set(m, &m->valid, pagebits); 5340 #if 0 /* NOT YET */ 5341 if ((frag = base & (DEV_BSIZE - 1)) != 0) { 5342 frag = DEV_BSIZE - frag; 5343 base += frag; 5344 size -= frag; 5345 if (size < 0) 5346 size = 0; 5347 } 5348 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); 5349 #endif 5350 if (base == 0 && size == PAGE_SIZE) { 5351 /* 5352 * The page can only be modified within the pmap if it is 5353 * mapped, and it can only be mapped if it was previously 5354 * fully valid. 5355 */ 5356 if (oldvalid == VM_PAGE_BITS_ALL) 5357 /* 5358 * Perform the pmap_clear_modify() first. Otherwise, 5359 * a concurrent pmap operation, such as 5360 * pmap_protect(), could clear a modification in the 5361 * pmap and set the dirty field on the page before 5362 * pmap_clear_modify() had begun and after the dirty 5363 * field was cleared here. 5364 */ 5365 pmap_clear_modify(m); 5366 m->dirty = 0; 5367 vm_page_aflag_clear(m, PGA_NOSYNC); 5368 } else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m)) 5369 m->dirty &= ~pagebits; 5370 else 5371 vm_page_clear_dirty_mask(m, pagebits); 5372 } 5373 5374 void 5375 vm_page_clear_dirty(vm_page_t m, int base, int size) 5376 { 5377 5378 vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); 5379 } 5380 5381 /* 5382 * vm_page_set_invalid: 5383 * 5384 * Invalidates DEV_BSIZE'd chunks within a page. Both the 5385 * valid and dirty bits for the effected areas are cleared. 5386 */ 5387 void 5388 vm_page_set_invalid(vm_page_t m, int base, int size) 5389 { 5390 vm_page_bits_t bits; 5391 vm_object_t object; 5392 5393 /* 5394 * The object lock is required so that pages can't be mapped 5395 * read-only while we're in the process of invalidating them. 
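 * If the page is currently fully valid and its object is still
 * referenced, all existing mappings are removed below before the valid
 * and dirty bits for the affected range are cleared.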
5396 */ 5397 object = m->object; 5398 VM_OBJECT_ASSERT_WLOCKED(object); 5399 vm_page_assert_busied(m); 5400 5401 if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + 5402 size >= object->un_pager.vnp.vnp_size) 5403 bits = VM_PAGE_BITS_ALL; 5404 else 5405 bits = vm_page_bits(base, size); 5406 if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0) 5407 pmap_remove_all(m); 5408 KASSERT((bits == 0 && vm_page_all_valid(m)) || 5409 !pmap_page_is_mapped(m), 5410 ("vm_page_set_invalid: page %p is mapped", m)); 5411 if (vm_page_xbusied(m)) { 5412 m->valid &= ~bits; 5413 m->dirty &= ~bits; 5414 } else { 5415 vm_page_bits_clear(m, &m->valid, bits); 5416 vm_page_bits_clear(m, &m->dirty, bits); 5417 } 5418 } 5419 5420 /* 5421 * vm_page_invalid: 5422 * 5423 * Invalidates the entire page. The page must be busy, unmapped, and 5424 * the enclosing object must be locked. The object locks protects 5425 * against concurrent read-only pmap enter which is done without 5426 * busy. 5427 */ 5428 void 5429 vm_page_invalid(vm_page_t m) 5430 { 5431 5432 vm_page_assert_busied(m); 5433 VM_OBJECT_ASSERT_WLOCKED(m->object); 5434 MPASS(!pmap_page_is_mapped(m)); 5435 5436 if (vm_page_xbusied(m)) 5437 m->valid = 0; 5438 else 5439 vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL); 5440 } 5441 5442 /* 5443 * vm_page_zero_invalid() 5444 * 5445 * The kernel assumes that the invalid portions of a page contain 5446 * garbage, but such pages can be mapped into memory by user code. 5447 * When this occurs, we must zero out the non-valid portions of the 5448 * page so user code sees what it expects. 5449 * 5450 * Pages are most often semi-valid when the end of a file is mapped 5451 * into memory and the file's size is not page aligned. 5452 */ 5453 void 5454 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) 5455 { 5456 int b; 5457 int i; 5458 5459 /* 5460 * Scan the valid bits looking for invalid sections that 5461 * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the 5462 * valid bit may be set ) have already been zeroed by 5463 * vm_page_set_validclean(). 5464 */ 5465 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { 5466 if (i == (PAGE_SIZE / DEV_BSIZE) || 5467 (m->valid & ((vm_page_bits_t)1 << i))) { 5468 if (i > b) { 5469 pmap_zero_page_area(m, 5470 b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); 5471 } 5472 b = i + 1; 5473 } 5474 } 5475 5476 /* 5477 * setvalid is TRUE when we can safely set the zero'd areas 5478 * as being valid. We can do this if there are no cache consistency 5479 * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. 5480 */ 5481 if (setvalid) 5482 vm_page_valid(m); 5483 } 5484 5485 /* 5486 * vm_page_is_valid: 5487 * 5488 * Is (partial) page valid? Note that the case where size == 0 5489 * will return FALSE in the degenerate case where the page is 5490 * entirely invalid, and TRUE otherwise. 5491 * 5492 * Some callers envoke this routine without the busy lock held and 5493 * handle races via higher level locks. Typical callers should 5494 * hold a busy lock to prevent invalidation. 5495 */ 5496 int 5497 vm_page_is_valid(vm_page_t m, int base, int size) 5498 { 5499 vm_page_bits_t bits; 5500 5501 bits = vm_page_bits(base, size); 5502 return (vm_page_any_valid(m) && (m->valid & bits) == bits); 5503 } 5504 5505 /* 5506 * Returns true if all of the specified predicates are true for the entire 5507 * (super)page and false otherwise. 
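 *
 * The "flags" argument selects the predicates: PS_ALL_VALID, PS_ALL_DIRTY
 * and PS_NONE_BUSY.  If "skip_m" is non-NULL, that constituent page is
 * exempted from the busy, dirty and valid checks, but it must still
 * belong to the same object.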
5508 */ 5509 bool 5510 vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) 5511 { 5512 vm_object_t object; 5513 int i, npages; 5514 5515 object = m->object; 5516 if (skip_m != NULL && skip_m->object != object) 5517 return (false); 5518 VM_OBJECT_ASSERT_LOCKED(object); 5519 npages = atop(pagesizes[m->psind]); 5520 5521 /* 5522 * The physically contiguous pages that make up a superpage, i.e., a 5523 * page with a page size index ("psind") greater than zero, will 5524 * occupy adjacent entries in vm_page_array[]. 5525 */ 5526 for (i = 0; i < npages; i++) { 5527 /* Always test object consistency, including "skip_m". */ 5528 if (m[i].object != object) 5529 return (false); 5530 if (&m[i] == skip_m) 5531 continue; 5532 if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) 5533 return (false); 5534 if ((flags & PS_ALL_DIRTY) != 0) { 5535 /* 5536 * Calling vm_page_test_dirty() or pmap_is_modified() 5537 * might stop this case from spuriously returning 5538 * "false". However, that would require a write lock 5539 * on the object containing "m[i]". 5540 */ 5541 if (m[i].dirty != VM_PAGE_BITS_ALL) 5542 return (false); 5543 } 5544 if ((flags & PS_ALL_VALID) != 0 && 5545 m[i].valid != VM_PAGE_BITS_ALL) 5546 return (false); 5547 } 5548 return (true); 5549 } 5550 5551 /* 5552 * Set the page's dirty bits if the page is modified. 5553 */ 5554 void 5555 vm_page_test_dirty(vm_page_t m) 5556 { 5557 5558 vm_page_assert_busied(m); 5559 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) 5560 vm_page_dirty(m); 5561 } 5562 5563 void 5564 vm_page_valid(vm_page_t m) 5565 { 5566 5567 vm_page_assert_busied(m); 5568 if (vm_page_xbusied(m)) 5569 m->valid = VM_PAGE_BITS_ALL; 5570 else 5571 vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL); 5572 } 5573 5574 void 5575 vm_page_lock_KBI(vm_page_t m, const char *file, int line) 5576 { 5577 5578 mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); 5579 } 5580 5581 void 5582 vm_page_unlock_KBI(vm_page_t m, const char *file, int line) 5583 { 5584 5585 mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); 5586 } 5587 5588 int 5589 vm_page_trylock_KBI(vm_page_t m, const char *file, int line) 5590 { 5591 5592 return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); 5593 } 5594 5595 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) 5596 void 5597 vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) 5598 { 5599 5600 vm_page_lock_assert_KBI(m, MA_OWNED, file, line); 5601 } 5602 5603 void 5604 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) 5605 { 5606 5607 mtx_assert_(vm_page_lockptr(m), a, file, line); 5608 } 5609 #endif 5610 5611 #ifdef INVARIANTS 5612 void 5613 vm_page_object_busy_assert(vm_page_t m) 5614 { 5615 5616 /* 5617 * Certain of the page's fields may only be modified by the 5618 * holder of a page or object busy. 5619 */ 5620 if (m->object != NULL && !vm_page_busied(m)) 5621 VM_OBJECT_ASSERT_BUSY(m->object); 5622 } 5623 5624 void 5625 vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits) 5626 { 5627 5628 if ((bits & PGA_WRITEABLE) == 0) 5629 return; 5630 5631 /* 5632 * The PGA_WRITEABLE flag can only be set if the page is 5633 * managed, is exclusively busied or the object is locked. 5634 * Currently, this flag is only set by pmap_enter(). 
5635 */ 5636 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5637 ("PGA_WRITEABLE on unmanaged page")); 5638 if (!vm_page_xbusied(m)) 5639 VM_OBJECT_ASSERT_BUSY(m->object); 5640 } 5641 #endif 5642 5643 #include "opt_ddb.h" 5644 #ifdef DDB 5645 #include <sys/kernel.h> 5646 5647 #include <ddb/ddb.h> 5648 5649 DB_SHOW_COMMAND_FLAGS(page, vm_page_print_page_info, DB_CMD_MEMSAFE) 5650 { 5651 5652 db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); 5653 db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); 5654 db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); 5655 db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); 5656 db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); 5657 db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); 5658 db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); 5659 db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); 5660 db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); 5661 } 5662 5663 DB_SHOW_COMMAND_FLAGS(pageq, vm_page_print_pageq_info, DB_CMD_MEMSAFE) 5664 { 5665 int dom; 5666 5667 db_printf("pq_free %d\n", vm_free_count()); 5668 for (dom = 0; dom < vm_ndomains; dom++) { 5669 db_printf( 5670 "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", 5671 dom, 5672 vm_dom[dom].vmd_page_count, 5673 vm_dom[dom].vmd_free_count, 5674 vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, 5675 vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, 5676 vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 5677 vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); 5678 } 5679 } 5680 5681 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) 5682 { 5683 vm_page_t m; 5684 boolean_t phys, virt; 5685 5686 if (!have_addr) { 5687 db_printf("show pginfo addr\n"); 5688 return; 5689 } 5690 5691 phys = strchr(modif, 'p') != NULL; 5692 virt = strchr(modif, 'v') != NULL; 5693 if (virt) 5694 m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); 5695 else if (phys) 5696 m = PHYS_TO_VM_PAGE(addr); 5697 else 5698 m = (vm_page_t)addr; 5699 db_printf( 5700 "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n" 5701 " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", 5702 m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, 5703 m->a.queue, m->ref_count, m->a.flags, m->oflags, 5704 m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty); 5705 } 5706 #endif /* DDB */ 5707
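
/*
 * Illustrative sketch only, not part of the original file: one possible
 * consumer of the grab/unwire interfaces above.  The object "obj", the
 * index "idx" and the function name are hypothetical, and error handling
 * is reduced to a minimum.  The block is disabled so that it does not
 * affect the build.
 */
#if 0
static int
example_touch_page(vm_object_t obj, vm_pindex_t idx)
{
	vm_page_t m;
	int rv;

	/*
	 * Grab a wired, fully valid page, paging it in from the backing
	 * store if necessary.  The unlocked variant falls back to taking
	 * the object lock when the page is absent or not yet valid.
	 */
	rv = vm_page_grab_valid_unlocked(&m, obj, idx,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY);
	if (rv != VM_PAGER_OK)
		return (EIO);

	/* ... access the page contents, e.g. through the direct map ... */

	/*
	 * Drop the wiring; the page is placed back on the inactive queue
	 * and becomes reclaimable again.
	 */
	vm_page_unwire(m, PQ_INACTIVE);
	return (0);
}
#endif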