/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 */

/*-
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *			GENERAL RULES ON VM_PAGE MANIPULATION
 *
 *	- a pageq mutex is required when adding or removing a page from a
 *	  page queue (vm_page_queue[]), regardless of other mutexes or the
 *	  busy state of a page.
 *
 *	- The object mutex is held when inserting or removing
 *	  pages from an object (vm_page_insert() or vm_page_remove()).
 *
 */

/*
 *	Resident memory management module.
77 */ 78 79 #include <sys/cdefs.h> 80 __FBSDID("$FreeBSD$"); 81 82 #include "opt_vm.h" 83 84 #include <sys/param.h> 85 #include <sys/systm.h> 86 #include <sys/lock.h> 87 #include <sys/kernel.h> 88 #include <sys/limits.h> 89 #include <sys/malloc.h> 90 #include <sys/msgbuf.h> 91 #include <sys/mutex.h> 92 #include <sys/proc.h> 93 #include <sys/sysctl.h> 94 #include <sys/vmmeter.h> 95 #include <sys/vnode.h> 96 97 #include <vm/vm.h> 98 #include <vm/pmap.h> 99 #include <vm/vm_param.h> 100 #include <vm/vm_kern.h> 101 #include <vm/vm_object.h> 102 #include <vm/vm_page.h> 103 #include <vm/vm_pageout.h> 104 #include <vm/vm_pager.h> 105 #include <vm/vm_phys.h> 106 #include <vm/vm_reserv.h> 107 #include <vm/vm_extern.h> 108 #include <vm/uma.h> 109 #include <vm/uma_int.h> 110 111 #include <machine/md_var.h> 112 113 /* 114 * Associated with page of user-allocatable memory is a 115 * page structure. 116 */ 117 118 struct vpgqueues vm_page_queues[PQ_COUNT]; 119 struct vpglocks vm_page_queue_lock; 120 struct vpglocks vm_page_queue_free_lock; 121 122 struct vpglocks pa_lock[PA_LOCK_COUNT]; 123 124 vm_page_t vm_page_array; 125 long vm_page_array_size; 126 long first_page; 127 int vm_page_zero_count; 128 129 static int boot_pages = UMA_BOOT_PAGES; 130 TUNABLE_INT("vm.boot_pages", &boot_pages); 131 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0, 132 "number of pages allocated for bootstrapping the VM system"); 133 134 static int pa_tryrelock_restart; 135 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, 136 &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); 137 138 static uma_zone_t fakepg_zone; 139 140 static struct vnode *vm_page_alloc_init(vm_page_t m); 141 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); 142 static void vm_page_queue_remove(int queue, vm_page_t m); 143 static void vm_page_enqueue(int queue, vm_page_t m); 144 static void vm_page_init_fakepg(void *dummy); 145 146 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL); 147 148 static void 149 vm_page_init_fakepg(void *dummy) 150 { 151 152 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, 153 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 154 } 155 156 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ 157 #if PAGE_SIZE == 32768 158 #ifdef CTASSERT 159 CTASSERT(sizeof(u_long) >= 8); 160 #endif 161 #endif 162 163 /* 164 * Try to acquire a physical address lock while a pmap is locked. If we 165 * fail to trylock we unlock and lock the pmap directly and cache the 166 * locked pa in *locked. The caller should then restart their loop in case 167 * the virtual to physical mapping has changed. 168 */ 169 int 170 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) 171 { 172 vm_paddr_t lockpa; 173 174 lockpa = *locked; 175 *locked = pa; 176 if (lockpa) { 177 PA_LOCK_ASSERT(lockpa, MA_OWNED); 178 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) 179 return (0); 180 PA_UNLOCK(lockpa); 181 } 182 if (PA_TRYLOCK(pa)) 183 return (0); 184 PMAP_UNLOCK(pmap); 185 atomic_add_int(&pa_tryrelock_restart, 1); 186 PA_LOCK(pa); 187 PMAP_LOCK(pmap); 188 return (EAGAIN); 189 } 190 191 /* 192 * vm_set_page_size: 193 * 194 * Sets the page size, perhaps based upon the memory 195 * size. Must be called before any use of page-size 196 * dependent functions. 
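 *
 *	For instance, the power-of-two check below relies on the usual
 *	(x & (x - 1)) == 0 idiom: a v_page_size of 0x1000 passes because
 *	0x1000 & 0x0fff == 0, while 0x3000 would panic because
 *	0x3000 & 0x2fff != 0.  (Illustrative values only.)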
197 */ 198 void 199 vm_set_page_size(void) 200 { 201 if (cnt.v_page_size == 0) 202 cnt.v_page_size = PAGE_SIZE; 203 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0) 204 panic("vm_set_page_size: page size not a power of two"); 205 } 206 207 /* 208 * vm_page_blacklist_lookup: 209 * 210 * See if a physical address in this page has been listed 211 * in the blacklist tunable. Entries in the tunable are 212 * separated by spaces or commas. If an invalid integer is 213 * encountered then the rest of the string is skipped. 214 */ 215 static int 216 vm_page_blacklist_lookup(char *list, vm_paddr_t pa) 217 { 218 vm_paddr_t bad; 219 char *cp, *pos; 220 221 for (pos = list; *pos != '\0'; pos = cp) { 222 bad = strtoq(pos, &cp, 0); 223 if (*cp != '\0') { 224 if (*cp == ' ' || *cp == ',') { 225 cp++; 226 if (cp == pos) 227 continue; 228 } else 229 break; 230 } 231 if (pa == trunc_page(bad)) 232 return (1); 233 } 234 return (0); 235 } 236 237 /* 238 * vm_page_startup: 239 * 240 * Initializes the resident memory module. 241 * 242 * Allocates memory for the page cells, and 243 * for the object/offset-to-page hash table headers. 244 * Each page cell is initialized and placed on the free list. 245 */ 246 vm_offset_t 247 vm_page_startup(vm_offset_t vaddr) 248 { 249 vm_offset_t mapped; 250 vm_paddr_t page_range; 251 vm_paddr_t new_end; 252 int i; 253 vm_paddr_t pa; 254 vm_paddr_t last_pa; 255 char *list; 256 257 /* the biggest memory array is the second group of pages */ 258 vm_paddr_t end; 259 vm_paddr_t biggestsize; 260 vm_paddr_t low_water, high_water; 261 int biggestone; 262 263 biggestsize = 0; 264 biggestone = 0; 265 vaddr = round_page(vaddr); 266 267 for (i = 0; phys_avail[i + 1]; i += 2) { 268 phys_avail[i] = round_page(phys_avail[i]); 269 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 270 } 271 272 low_water = phys_avail[0]; 273 high_water = phys_avail[1]; 274 275 for (i = 0; phys_avail[i + 1]; i += 2) { 276 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; 277 278 if (size > biggestsize) { 279 biggestone = i; 280 biggestsize = size; 281 } 282 if (phys_avail[i] < low_water) 283 low_water = phys_avail[i]; 284 if (phys_avail[i + 1] > high_water) 285 high_water = phys_avail[i + 1]; 286 } 287 288 #ifdef XEN 289 low_water = 0; 290 #endif 291 292 end = phys_avail[biggestone+1]; 293 294 /* 295 * Initialize the locks. 296 */ 297 mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF | 298 MTX_RECURSE); 299 mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL, 300 MTX_DEF); 301 302 /* Setup page locks. */ 303 for (i = 0; i < PA_LOCK_COUNT; i++) 304 mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF); 305 306 /* 307 * Initialize the queue headers for the hold queue, the active queue, 308 * and the inactive queue. 309 */ 310 for (i = 0; i < PQ_COUNT; i++) 311 TAILQ_INIT(&vm_page_queues[i].pl); 312 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count; 313 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count; 314 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count; 315 316 /* 317 * Allocate memory for use when boot strapping the kernel memory 318 * allocator. 
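	 *
	 * As a rough illustration (the figures are not fixed), with
	 * boot_pages set to 64 and a 4 KiB UMA_SLAB_SIZE, 256 KiB is
	 * carved off the top of the largest physical segment here.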
319 */ 320 new_end = end - (boot_pages * UMA_SLAB_SIZE); 321 new_end = trunc_page(new_end); 322 mapped = pmap_map(&vaddr, new_end, end, 323 VM_PROT_READ | VM_PROT_WRITE); 324 bzero((void *)mapped, end - new_end); 325 uma_startup((void *)mapped, boot_pages); 326 327 #if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \ 328 defined(__mips__) 329 /* 330 * Allocate a bitmap to indicate that a random physical page 331 * needs to be included in a minidump. 332 * 333 * The amd64 port needs this to indicate which direct map pages 334 * need to be dumped, via calls to dump_add_page()/dump_drop_page(). 335 * 336 * However, i386 still needs this workspace internally within the 337 * minidump code. In theory, they are not needed on i386, but are 338 * included should the sf_buf code decide to use them. 339 */ 340 last_pa = 0; 341 for (i = 0; dump_avail[i + 1] != 0; i += 2) 342 if (dump_avail[i + 1] > last_pa) 343 last_pa = dump_avail[i + 1]; 344 page_range = last_pa / PAGE_SIZE; 345 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); 346 new_end -= vm_page_dump_size; 347 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, 348 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); 349 bzero((void *)vm_page_dump, vm_page_dump_size); 350 #endif 351 #ifdef __amd64__ 352 /* 353 * Request that the physical pages underlying the message buffer be 354 * included in a crash dump. Since the message buffer is accessed 355 * through the direct map, they are not automatically included. 356 */ 357 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); 358 last_pa = pa + round_page(msgbufsize); 359 while (pa < last_pa) { 360 dump_add_page(pa); 361 pa += PAGE_SIZE; 362 } 363 #endif 364 /* 365 * Compute the number of pages of memory that will be available for 366 * use (taking into account the overhead of a page structure per 367 * page). 368 */ 369 first_page = low_water / PAGE_SIZE; 370 #ifdef VM_PHYSSEG_SPARSE 371 page_range = 0; 372 for (i = 0; phys_avail[i + 1] != 0; i += 2) 373 page_range += atop(phys_avail[i + 1] - phys_avail[i]); 374 #elif defined(VM_PHYSSEG_DENSE) 375 page_range = high_water / PAGE_SIZE - first_page; 376 #else 377 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." 378 #endif 379 end = new_end; 380 381 /* 382 * Reserve an unmapped guard page to trap access to vm_page_array[-1]. 383 */ 384 vaddr += PAGE_SIZE; 385 386 /* 387 * Initialize the mem entry structures now, and put them in the free 388 * queue. 389 */ 390 new_end = trunc_page(end - page_range * sizeof(struct vm_page)); 391 mapped = pmap_map(&vaddr, new_end, end, 392 VM_PROT_READ | VM_PROT_WRITE); 393 vm_page_array = (vm_page_t) mapped; 394 #if VM_NRESERVLEVEL > 0 395 /* 396 * Allocate memory for the reservation management system's data 397 * structures. 398 */ 399 new_end = vm_reserv_startup(&vaddr, new_end, high_water); 400 #endif 401 #if defined(__amd64__) || defined(__mips__) 402 /* 403 * pmap_map on amd64 and mips can come out of the direct-map, not kvm 404 * like i386, so the pages must be tracked for a crashdump to include 405 * this data. This includes the vm_page_array and the early UMA 406 * bootstrap pages. 
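	 *
	 * For example, if the vm_page_array and the bootstrap area
	 * together span 16 MiB, the loop below marks 4096 pages for
	 * dumping (assuming 4 KiB pages; figures are illustrative only).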
407 */ 408 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE) 409 dump_add_page(pa); 410 #endif 411 phys_avail[biggestone + 1] = new_end; 412 413 /* 414 * Clear all of the page structures 415 */ 416 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); 417 for (i = 0; i < page_range; i++) 418 vm_page_array[i].order = VM_NFREEORDER; 419 vm_page_array_size = page_range; 420 421 /* 422 * Initialize the physical memory allocator. 423 */ 424 vm_phys_init(); 425 426 /* 427 * Add every available physical page that is not blacklisted to 428 * the free lists. 429 */ 430 cnt.v_page_count = 0; 431 cnt.v_free_count = 0; 432 list = getenv("vm.blacklist"); 433 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 434 pa = phys_avail[i]; 435 last_pa = phys_avail[i + 1]; 436 while (pa < last_pa) { 437 if (list != NULL && 438 vm_page_blacklist_lookup(list, pa)) 439 printf("Skipping page with pa 0x%jx\n", 440 (uintmax_t)pa); 441 else 442 vm_phys_add_page(pa); 443 pa += PAGE_SIZE; 444 } 445 } 446 freeenv(list); 447 #if VM_NRESERVLEVEL > 0 448 /* 449 * Initialize the reservation management system. 450 */ 451 vm_reserv_init(); 452 #endif 453 return (vaddr); 454 } 455 456 457 CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0); 458 459 void 460 vm_page_aflag_set(vm_page_t m, uint8_t bits) 461 { 462 uint32_t *addr, val; 463 464 /* 465 * The PGA_WRITEABLE flag can only be set if the page is managed and 466 * VPO_BUSY. Currently, this flag is only set by pmap_enter(). 467 */ 468 KASSERT((bits & PGA_WRITEABLE) == 0 || 469 (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY, 470 ("PGA_WRITEABLE and !VPO_BUSY")); 471 472 /* 473 * We want to use atomic updates for m->aflags, which is a 474 * byte wide. Not all architectures provide atomic operations 475 * on the single-byte destination. Punt and access the whole 476 * 4-byte word with an atomic update. Parallel non-atomic 477 * updates to the fields included in the update by proximity 478 * are handled properly by atomics. 479 */ 480 addr = (void *)&m->aflags; 481 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0); 482 val = bits; 483 #if BYTE_ORDER == BIG_ENDIAN 484 val <<= 24; 485 #endif 486 atomic_set_32(addr, val); 487 } 488 489 void 490 vm_page_aflag_clear(vm_page_t m, uint8_t bits) 491 { 492 uint32_t *addr, val; 493 494 /* 495 * The PGA_REFERENCED flag can only be cleared if the object 496 * containing the page is locked. 497 */ 498 KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object), 499 ("PGA_REFERENCED and !VM_OBJECT_LOCKED")); 500 501 /* 502 * See the comment in vm_page_aflag_set(). 503 */ 504 addr = (void *)&m->aflags; 505 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0); 506 val = bits; 507 #if BYTE_ORDER == BIG_ENDIAN 508 val <<= 24; 509 #endif 510 atomic_clear_32(addr, val); 511 } 512 513 void 514 vm_page_reference(vm_page_t m) 515 { 516 517 vm_page_aflag_set(m, PGA_REFERENCED); 518 } 519 520 void 521 vm_page_busy(vm_page_t m) 522 { 523 524 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 525 KASSERT((m->oflags & VPO_BUSY) == 0, 526 ("vm_page_busy: page already busy!!!")); 527 m->oflags |= VPO_BUSY; 528 } 529 530 /* 531 * vm_page_flash: 532 * 533 * wakeup anyone waiting for the page. 534 */ 535 void 536 vm_page_flash(vm_page_t m) 537 { 538 539 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 540 if (m->oflags & VPO_WANTED) { 541 m->oflags &= ~VPO_WANTED; 542 wakeup(m); 543 } 544 } 545 546 /* 547 * vm_page_wakeup: 548 * 549 * clear the VPO_BUSY flag and wakeup anyone waiting for the 550 * page. 
551 * 552 */ 553 void 554 vm_page_wakeup(vm_page_t m) 555 { 556 557 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 558 KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!")); 559 m->oflags &= ~VPO_BUSY; 560 vm_page_flash(m); 561 } 562 563 void 564 vm_page_io_start(vm_page_t m) 565 { 566 567 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 568 m->busy++; 569 } 570 571 void 572 vm_page_io_finish(vm_page_t m) 573 { 574 575 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 576 KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m)); 577 m->busy--; 578 if (m->busy == 0) 579 vm_page_flash(m); 580 } 581 582 /* 583 * Keep page from being freed by the page daemon 584 * much of the same effect as wiring, except much lower 585 * overhead and should be used only for *very* temporary 586 * holding ("wiring"). 587 */ 588 void 589 vm_page_hold(vm_page_t mem) 590 { 591 592 vm_page_lock_assert(mem, MA_OWNED); 593 mem->hold_count++; 594 } 595 596 void 597 vm_page_unhold(vm_page_t mem) 598 { 599 600 vm_page_lock_assert(mem, MA_OWNED); 601 --mem->hold_count; 602 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!")); 603 if (mem->hold_count == 0 && mem->queue == PQ_HOLD) 604 vm_page_free_toq(mem); 605 } 606 607 /* 608 * vm_page_unhold_pages: 609 * 610 * Unhold each of the pages that is referenced by the given array. 611 */ 612 void 613 vm_page_unhold_pages(vm_page_t *ma, int count) 614 { 615 struct mtx *mtx, *new_mtx; 616 617 mtx = NULL; 618 for (; count != 0; count--) { 619 /* 620 * Avoid releasing and reacquiring the same page lock. 621 */ 622 new_mtx = vm_page_lockptr(*ma); 623 if (mtx != new_mtx) { 624 if (mtx != NULL) 625 mtx_unlock(mtx); 626 mtx = new_mtx; 627 mtx_lock(mtx); 628 } 629 vm_page_unhold(*ma); 630 ma++; 631 } 632 if (mtx != NULL) 633 mtx_unlock(mtx); 634 } 635 636 vm_page_t 637 PHYS_TO_VM_PAGE(vm_paddr_t pa) 638 { 639 vm_page_t m; 640 641 #ifdef VM_PHYSSEG_SPARSE 642 m = vm_phys_paddr_to_vm_page(pa); 643 if (m == NULL) 644 m = vm_phys_fictitious_to_vm_page(pa); 645 return (m); 646 #elif defined(VM_PHYSSEG_DENSE) 647 long pi; 648 649 pi = atop(pa); 650 if (pi >= first_page && (pi - first_page) < vm_page_array_size) { 651 m = &vm_page_array[pi - first_page]; 652 return (m); 653 } 654 return (vm_phys_fictitious_to_vm_page(pa)); 655 #else 656 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." 657 #endif 658 } 659 660 /* 661 * vm_page_getfake: 662 * 663 * Create a fictitious page with the specified physical address and 664 * memory attribute. The memory attribute is the only the machine- 665 * dependent aspect of a fictitious page that must be initialized. 666 */ 667 vm_page_t 668 vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) 669 { 670 vm_page_t m; 671 672 m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); 673 vm_page_initfake(m, paddr, memattr); 674 return (m); 675 } 676 677 void 678 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) 679 { 680 681 if ((m->flags & PG_FICTITIOUS) != 0) { 682 /* 683 * The page's memattr might have changed since the 684 * previous initialization. Update the pmap to the 685 * new memattr. 686 */ 687 goto memattr; 688 } 689 m->phys_addr = paddr; 690 m->queue = PQ_NONE; 691 /* Fictitious pages don't use "segind". */ 692 m->flags = PG_FICTITIOUS; 693 /* Fictitious pages don't use "order" or "pool". */ 694 m->oflags = VPO_BUSY | VPO_UNMANAGED; 695 m->wire_count = 1; 696 memattr: 697 pmap_page_set_memattr(m, memattr); 698 } 699 700 /* 701 * vm_page_putfake: 702 * 703 * Release a fictitious page. 
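 *
 *	A minimal usage sketch, paired with vm_page_getfake() above
 *	(illustrative only, not lifted from a real caller):
 *
 *		m = vm_page_getfake(paddr, VM_MEMATTR_DEFAULT);
 *		... use the fictitious page ...
 *		vm_page_putfake(m);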
704 */ 705 void 706 vm_page_putfake(vm_page_t m) 707 { 708 709 KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); 710 KASSERT((m->flags & PG_FICTITIOUS) != 0, 711 ("vm_page_putfake: bad page %p", m)); 712 uma_zfree(fakepg_zone, m); 713 } 714 715 /* 716 * vm_page_updatefake: 717 * 718 * Update the given fictitious page to the specified physical address and 719 * memory attribute. 720 */ 721 void 722 vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) 723 { 724 725 KASSERT((m->flags & PG_FICTITIOUS) != 0, 726 ("vm_page_updatefake: bad page %p", m)); 727 m->phys_addr = paddr; 728 pmap_page_set_memattr(m, memattr); 729 } 730 731 /* 732 * vm_page_free: 733 * 734 * Free a page. 735 */ 736 void 737 vm_page_free(vm_page_t m) 738 { 739 740 m->flags &= ~PG_ZERO; 741 vm_page_free_toq(m); 742 } 743 744 /* 745 * vm_page_free_zero: 746 * 747 * Free a page to the zerod-pages queue 748 */ 749 void 750 vm_page_free_zero(vm_page_t m) 751 { 752 753 m->flags |= PG_ZERO; 754 vm_page_free_toq(m); 755 } 756 757 /* 758 * vm_page_sleep: 759 * 760 * Sleep and release the page and page queues locks. 761 * 762 * The object containing the given page must be locked. 763 */ 764 void 765 vm_page_sleep(vm_page_t m, const char *msg) 766 { 767 768 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 769 if (mtx_owned(&vm_page_queue_mtx)) 770 vm_page_unlock_queues(); 771 if (mtx_owned(vm_page_lockptr(m))) 772 vm_page_unlock(m); 773 774 /* 775 * It's possible that while we sleep, the page will get 776 * unbusied and freed. If we are holding the object 777 * lock, we will assume we hold a reference to the object 778 * such that even if m->object changes, we can re-lock 779 * it. 780 */ 781 m->oflags |= VPO_WANTED; 782 msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0); 783 } 784 785 /* 786 * vm_page_dirty: 787 * 788 * Set all bits in the page's dirty field. 789 * 790 * The object containing the specified page must be locked if the 791 * call is made from the machine-independent layer. 792 * 793 * See vm_page_clear_dirty_mask(). 794 */ 795 void 796 vm_page_dirty(vm_page_t m) 797 { 798 799 KASSERT((m->flags & PG_CACHED) == 0, 800 ("vm_page_dirty: page in cache!")); 801 KASSERT(!VM_PAGE_IS_FREE(m), 802 ("vm_page_dirty: page is free!")); 803 KASSERT(m->valid == VM_PAGE_BITS_ALL, 804 ("vm_page_dirty: page is invalid!")); 805 m->dirty = VM_PAGE_BITS_ALL; 806 } 807 808 /* 809 * vm_page_splay: 810 * 811 * Implements Sleator and Tarjan's top-down splay algorithm. Returns 812 * the vm_page containing the given pindex. If, however, that 813 * pindex is not found in the vm_object, returns a vm_page that is 814 * adjacent to the pindex, coming before or after it. 815 */ 816 vm_page_t 817 vm_page_splay(vm_pindex_t pindex, vm_page_t root) 818 { 819 struct vm_page dummy; 820 vm_page_t lefttreemax, righttreemin, y; 821 822 if (root == NULL) 823 return (root); 824 lefttreemax = righttreemin = &dummy; 825 for (;; root = y) { 826 if (pindex < root->pindex) { 827 if ((y = root->left) == NULL) 828 break; 829 if (pindex < y->pindex) { 830 /* Rotate right. */ 831 root->left = y->right; 832 y->right = root; 833 root = y; 834 if ((y = root->left) == NULL) 835 break; 836 } 837 /* Link into the new root's right tree. */ 838 righttreemin->left = root; 839 righttreemin = root; 840 } else if (pindex > root->pindex) { 841 if ((y = root->right) == NULL) 842 break; 843 if (pindex > y->pindex) { 844 /* Rotate left. 
*/ 845 root->right = y->left; 846 y->left = root; 847 root = y; 848 if ((y = root->right) == NULL) 849 break; 850 } 851 /* Link into the new root's left tree. */ 852 lefttreemax->right = root; 853 lefttreemax = root; 854 } else 855 break; 856 } 857 /* Assemble the new root. */ 858 lefttreemax->right = root->left; 859 righttreemin->left = root->right; 860 root->left = dummy.right; 861 root->right = dummy.left; 862 return (root); 863 } 864 865 /* 866 * vm_page_insert: [ internal use only ] 867 * 868 * Inserts the given mem entry into the object and object list. 869 * 870 * The pagetables are not updated but will presumably fault the page 871 * in if necessary, or if a kernel page the caller will at some point 872 * enter the page into the kernel's pmap. We are not allowed to block 873 * here so we *can't* do this anyway. 874 * 875 * The object and page must be locked. 876 * This routine may not block. 877 */ 878 void 879 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) 880 { 881 vm_page_t root; 882 883 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 884 if (m->object != NULL) 885 panic("vm_page_insert: page already inserted"); 886 887 /* 888 * Record the object/offset pair in this page 889 */ 890 m->object = object; 891 m->pindex = pindex; 892 893 /* 894 * Now link into the object's ordered list of backed pages. 895 */ 896 root = object->root; 897 if (root == NULL) { 898 m->left = NULL; 899 m->right = NULL; 900 TAILQ_INSERT_TAIL(&object->memq, m, listq); 901 } else { 902 root = vm_page_splay(pindex, root); 903 if (pindex < root->pindex) { 904 m->left = root->left; 905 m->right = root; 906 root->left = NULL; 907 TAILQ_INSERT_BEFORE(root, m, listq); 908 } else if (pindex == root->pindex) 909 panic("vm_page_insert: offset already allocated"); 910 else { 911 m->right = root->right; 912 m->left = root; 913 root->right = NULL; 914 TAILQ_INSERT_AFTER(&object->memq, root, m, listq); 915 } 916 } 917 object->root = m; 918 919 /* 920 * show that the object has one more resident page. 921 */ 922 object->resident_page_count++; 923 /* 924 * Hold the vnode until the last page is released. 925 */ 926 if (object->resident_page_count == 1 && object->type == OBJT_VNODE) 927 vhold((struct vnode *)object->handle); 928 929 /* 930 * Since we are inserting a new and possibly dirty page, 931 * update the object's OBJ_MIGHTBEDIRTY flag. 932 */ 933 if (m->aflags & PGA_WRITEABLE) 934 vm_object_set_writeable_dirty(object); 935 } 936 937 /* 938 * vm_page_remove: 939 * NOTE: used by device pager as well -wfj 940 * 941 * Removes the given mem entry from the object/offset-page 942 * table and the object page list, but do not invalidate/terminate 943 * the backing store. 944 * 945 * The object and page must be locked. 946 * The underlying pmap entry (if any) is NOT removed here. 947 * This routine may not block. 948 */ 949 void 950 vm_page_remove(vm_page_t m) 951 { 952 vm_object_t object; 953 vm_page_t next, prev, root; 954 955 if ((m->oflags & VPO_UNMANAGED) == 0) 956 vm_page_lock_assert(m, MA_OWNED); 957 if ((object = m->object) == NULL) 958 return; 959 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 960 if (m->oflags & VPO_BUSY) { 961 m->oflags &= ~VPO_BUSY; 962 vm_page_flash(m); 963 } 964 965 /* 966 * Now remove from the object's list of backed pages. 967 */ 968 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { 969 /* 970 * Since the page's successor in the list is also its parent 971 * in the tree, its right subtree must be empty. 
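		 * (The KASSERT just below spells this out by requiring
		 * m->right == NULL.)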
972 */ 973 next->left = m->left; 974 KASSERT(m->right == NULL, 975 ("vm_page_remove: page %p has right child", m)); 976 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && 977 prev->right == m) { 978 /* 979 * Since the page's predecessor in the list is also its parent 980 * in the tree, its left subtree must be empty. 981 */ 982 KASSERT(m->left == NULL, 983 ("vm_page_remove: page %p has left child", m)); 984 prev->right = m->right; 985 } else { 986 if (m != object->root) 987 vm_page_splay(m->pindex, object->root); 988 if (m->left == NULL) 989 root = m->right; 990 else if (m->right == NULL) 991 root = m->left; 992 else { 993 /* 994 * Move the page's successor to the root, because 995 * pages are usually removed in ascending order. 996 */ 997 if (m->right != next) 998 vm_page_splay(m->pindex, m->right); 999 next->left = m->left; 1000 root = next; 1001 } 1002 object->root = root; 1003 } 1004 TAILQ_REMOVE(&object->memq, m, listq); 1005 1006 /* 1007 * And show that the object has one fewer resident page. 1008 */ 1009 object->resident_page_count--; 1010 /* 1011 * The vnode may now be recycled. 1012 */ 1013 if (object->resident_page_count == 0 && object->type == OBJT_VNODE) 1014 vdrop((struct vnode *)object->handle); 1015 1016 m->object = NULL; 1017 } 1018 1019 /* 1020 * vm_page_lookup: 1021 * 1022 * Returns the page associated with the object/offset 1023 * pair specified; if none is found, NULL is returned. 1024 * 1025 * The object must be locked. 1026 * This routine may not block. 1027 * This is a critical path routine 1028 */ 1029 vm_page_t 1030 vm_page_lookup(vm_object_t object, vm_pindex_t pindex) 1031 { 1032 vm_page_t m; 1033 1034 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1035 if ((m = object->root) != NULL && m->pindex != pindex) { 1036 m = vm_page_splay(pindex, m); 1037 if ((object->root = m)->pindex != pindex) 1038 m = NULL; 1039 } 1040 return (m); 1041 } 1042 1043 /* 1044 * vm_page_find_least: 1045 * 1046 * Returns the page associated with the object with least pindex 1047 * greater than or equal to the parameter pindex, or NULL. 1048 * 1049 * The object must be locked. 1050 * The routine may not block. 1051 */ 1052 vm_page_t 1053 vm_page_find_least(vm_object_t object, vm_pindex_t pindex) 1054 { 1055 vm_page_t m; 1056 1057 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1058 if ((m = TAILQ_FIRST(&object->memq)) != NULL) { 1059 if (m->pindex < pindex) { 1060 m = vm_page_splay(pindex, object->root); 1061 if ((object->root = m)->pindex < pindex) 1062 m = TAILQ_NEXT(m, listq); 1063 } 1064 } 1065 return (m); 1066 } 1067 1068 /* 1069 * Returns the given page's successor (by pindex) within the object if it is 1070 * resident; if none is found, NULL is returned. 1071 * 1072 * The object must be locked. 1073 */ 1074 vm_page_t 1075 vm_page_next(vm_page_t m) 1076 { 1077 vm_page_t next; 1078 1079 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1080 if ((next = TAILQ_NEXT(m, listq)) != NULL && 1081 next->pindex != m->pindex + 1) 1082 next = NULL; 1083 return (next); 1084 } 1085 1086 /* 1087 * Returns the given page's predecessor (by pindex) within the object if it is 1088 * resident; if none is found, NULL is returned. 1089 * 1090 * The object must be locked. 
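 *
 * A typical neighbour check might look like the following sketch
 * (illustrative only):
 *
 *	VM_OBJECT_LOCK(object);
 *	if ((prev = vm_page_prev(m)) != NULL)
 *		... prev is the resident page at m->pindex - 1 ...
 *	VM_OBJECT_UNLOCK(object);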
1091 */ 1092 vm_page_t 1093 vm_page_prev(vm_page_t m) 1094 { 1095 vm_page_t prev; 1096 1097 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1098 if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && 1099 prev->pindex != m->pindex - 1) 1100 prev = NULL; 1101 return (prev); 1102 } 1103 1104 /* 1105 * vm_page_rename: 1106 * 1107 * Move the given memory entry from its 1108 * current object to the specified target object/offset. 1109 * 1110 * The object must be locked. 1111 * This routine may not block. 1112 * 1113 * Note: swap associated with the page must be invalidated by the move. We 1114 * have to do this for several reasons: (1) we aren't freeing the 1115 * page, (2) we are dirtying the page, (3) the VM system is probably 1116 * moving the page from object A to B, and will then later move 1117 * the backing store from A to B and we can't have a conflict. 1118 * 1119 * Note: we *always* dirty the page. It is necessary both for the 1120 * fact that we moved it, and because we may be invalidating 1121 * swap. If the page is on the cache, we have to deactivate it 1122 * or vm_page_dirty() will panic. Dirty pages are not allowed 1123 * on the cache. 1124 */ 1125 void 1126 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) 1127 { 1128 1129 vm_page_remove(m); 1130 vm_page_insert(m, new_object, new_pindex); 1131 vm_page_dirty(m); 1132 } 1133 1134 /* 1135 * Convert all of the given object's cached pages that have a 1136 * pindex within the given range into free pages. If the value 1137 * zero is given for "end", then the range's upper bound is 1138 * infinity. If the given object is backed by a vnode and it 1139 * transitions from having one or more cached pages to none, the 1140 * vnode's hold count is reduced. 1141 */ 1142 void 1143 vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) 1144 { 1145 vm_page_t m, m_next; 1146 boolean_t empty; 1147 1148 mtx_lock(&vm_page_queue_free_mtx); 1149 if (__predict_false(object->cache == NULL)) { 1150 mtx_unlock(&vm_page_queue_free_mtx); 1151 return; 1152 } 1153 m = object->cache = vm_page_splay(start, object->cache); 1154 if (m->pindex < start) { 1155 if (m->right == NULL) 1156 m = NULL; 1157 else { 1158 m_next = vm_page_splay(start, m->right); 1159 m_next->left = m; 1160 m->right = NULL; 1161 m = object->cache = m_next; 1162 } 1163 } 1164 1165 /* 1166 * At this point, "m" is either (1) a reference to the page 1167 * with the least pindex that is greater than or equal to 1168 * "start" or (2) NULL. 1169 */ 1170 for (; m != NULL && (m->pindex < end || end == 0); m = m_next) { 1171 /* 1172 * Find "m"'s successor and remove "m" from the 1173 * object's cache. 1174 */ 1175 if (m->right == NULL) { 1176 object->cache = m->left; 1177 m_next = NULL; 1178 } else { 1179 m_next = vm_page_splay(start, m->right); 1180 m_next->left = m->left; 1181 object->cache = m_next; 1182 } 1183 /* Convert "m" to a free page. */ 1184 m->object = NULL; 1185 m->valid = 0; 1186 /* Clear PG_CACHED and set PG_FREE. */ 1187 m->flags ^= PG_CACHED | PG_FREE; 1188 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, 1189 ("vm_page_cache_free: page %p has inconsistent flags", m)); 1190 cnt.v_cache_count--; 1191 cnt.v_free_count++; 1192 } 1193 empty = object->cache == NULL; 1194 mtx_unlock(&vm_page_queue_free_mtx); 1195 if (object->type == OBJT_VNODE && empty) 1196 vdrop(object->handle); 1197 } 1198 1199 /* 1200 * Returns the cached page that is associated with the given 1201 * object and offset. If, however, none exists, returns NULL. 
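 *
 * Callers bracket the lookup with the free queue lock, as
 * vm_page_is_cached() further below does:
 *
 *	mtx_lock(&vm_page_queue_free_mtx);
 *	m = vm_page_cache_lookup(object, pindex);
 *	mtx_unlock(&vm_page_queue_free_mtx);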
1202 * 1203 * The free page queue must be locked. 1204 */ 1205 static inline vm_page_t 1206 vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) 1207 { 1208 vm_page_t m; 1209 1210 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1211 if ((m = object->cache) != NULL && m->pindex != pindex) { 1212 m = vm_page_splay(pindex, m); 1213 if ((object->cache = m)->pindex != pindex) 1214 m = NULL; 1215 } 1216 return (m); 1217 } 1218 1219 /* 1220 * Remove the given cached page from its containing object's 1221 * collection of cached pages. 1222 * 1223 * The free page queue must be locked. 1224 */ 1225 static void 1226 vm_page_cache_remove(vm_page_t m) 1227 { 1228 vm_object_t object; 1229 vm_page_t root; 1230 1231 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1232 KASSERT((m->flags & PG_CACHED) != 0, 1233 ("vm_page_cache_remove: page %p is not cached", m)); 1234 object = m->object; 1235 if (m != object->cache) { 1236 root = vm_page_splay(m->pindex, object->cache); 1237 KASSERT(root == m, 1238 ("vm_page_cache_remove: page %p is not cached in object %p", 1239 m, object)); 1240 } 1241 if (m->left == NULL) 1242 root = m->right; 1243 else if (m->right == NULL) 1244 root = m->left; 1245 else { 1246 root = vm_page_splay(m->pindex, m->left); 1247 root->right = m->right; 1248 } 1249 object->cache = root; 1250 m->object = NULL; 1251 cnt.v_cache_count--; 1252 } 1253 1254 /* 1255 * Transfer all of the cached pages with offset greater than or 1256 * equal to 'offidxstart' from the original object's cache to the 1257 * new object's cache. However, any cached pages with offset 1258 * greater than or equal to the new object's size are kept in the 1259 * original object. Initially, the new object's cache must be 1260 * empty. Offset 'offidxstart' in the original object must 1261 * correspond to offset zero in the new object. 1262 * 1263 * The new object must be locked. 1264 */ 1265 void 1266 vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, 1267 vm_object_t new_object) 1268 { 1269 vm_page_t m, m_next; 1270 1271 /* 1272 * Insertion into an object's collection of cached pages 1273 * requires the object to be locked. In contrast, removal does 1274 * not. 1275 */ 1276 VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED); 1277 KASSERT(new_object->cache == NULL, 1278 ("vm_page_cache_transfer: object %p has cached pages", 1279 new_object)); 1280 mtx_lock(&vm_page_queue_free_mtx); 1281 if ((m = orig_object->cache) != NULL) { 1282 /* 1283 * Transfer all of the pages with offset greater than or 1284 * equal to 'offidxstart' from the original object's 1285 * cache to the new object's cache. 1286 */ 1287 m = vm_page_splay(offidxstart, m); 1288 if (m->pindex < offidxstart) { 1289 orig_object->cache = m; 1290 new_object->cache = m->right; 1291 m->right = NULL; 1292 } else { 1293 orig_object->cache = m->left; 1294 new_object->cache = m; 1295 m->left = NULL; 1296 } 1297 while ((m = new_object->cache) != NULL) { 1298 if ((m->pindex - offidxstart) >= new_object->size) { 1299 /* 1300 * Return all of the cached pages with 1301 * offset greater than or equal to the 1302 * new object's size to the original 1303 * object's cache. 1304 */ 1305 new_object->cache = m->left; 1306 m->left = orig_object->cache; 1307 orig_object->cache = m; 1308 break; 1309 } 1310 m_next = vm_page_splay(m->pindex, m->right); 1311 /* Update the page's object and offset. 
*/ 1312 m->object = new_object; 1313 m->pindex -= offidxstart; 1314 if (m_next == NULL) 1315 break; 1316 m->right = NULL; 1317 m_next->left = m; 1318 new_object->cache = m_next; 1319 } 1320 KASSERT(new_object->cache == NULL || 1321 new_object->type == OBJT_SWAP, 1322 ("vm_page_cache_transfer: object %p's type is incompatible" 1323 " with cached pages", new_object)); 1324 } 1325 mtx_unlock(&vm_page_queue_free_mtx); 1326 } 1327 1328 /* 1329 * Returns TRUE if a cached page is associated with the given object and 1330 * offset, and FALSE otherwise. 1331 * 1332 * The object must be locked. 1333 */ 1334 boolean_t 1335 vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) 1336 { 1337 vm_page_t m; 1338 1339 /* 1340 * Insertion into an object's collection of cached pages requires the 1341 * object to be locked. Therefore, if the object is locked and the 1342 * object's collection is empty, there is no need to acquire the free 1343 * page queues lock in order to prove that the specified page doesn't 1344 * exist. 1345 */ 1346 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1347 if (__predict_true(object->cache == NULL)) 1348 return (FALSE); 1349 mtx_lock(&vm_page_queue_free_mtx); 1350 m = vm_page_cache_lookup(object, pindex); 1351 mtx_unlock(&vm_page_queue_free_mtx); 1352 return (m != NULL); 1353 } 1354 1355 /* 1356 * vm_page_alloc: 1357 * 1358 * Allocate and return a page that is associated with the specified 1359 * object and offset pair. By default, this page has the flag VPO_BUSY 1360 * set. 1361 * 1362 * The caller must always specify an allocation class. 1363 * 1364 * allocation classes: 1365 * VM_ALLOC_NORMAL normal process request 1366 * VM_ALLOC_SYSTEM system *really* needs a page 1367 * VM_ALLOC_INTERRUPT interrupt time request 1368 * 1369 * optional allocation flags: 1370 * VM_ALLOC_COUNT(number) the number of additional pages that the caller 1371 * intends to allocate 1372 * VM_ALLOC_IFCACHED return page only if it is cached 1373 * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page 1374 * is cached 1375 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page 1376 * VM_ALLOC_NODUMP do not include the page in a kernel core dump 1377 * VM_ALLOC_NOOBJ page is not associated with an object and 1378 * should not have the flag VPO_BUSY set 1379 * VM_ALLOC_WIRED wire the allocated page 1380 * VM_ALLOC_ZERO prefer a zeroed page 1381 * 1382 * This routine may not sleep. 1383 */ 1384 vm_page_t 1385 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) 1386 { 1387 struct vnode *vp = NULL; 1388 vm_object_t m_object; 1389 vm_page_t m; 1390 int flags, req_class; 1391 1392 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0), 1393 ("vm_page_alloc: inconsistent object/req")); 1394 if (object != NULL) 1395 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1396 1397 req_class = req & VM_ALLOC_CLASS_MASK; 1398 1399 /* 1400 * The page daemon is allowed to dig deeper into the free page list. 1401 */ 1402 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 1403 req_class = VM_ALLOC_SYSTEM; 1404 1405 mtx_lock(&vm_page_queue_free_mtx); 1406 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || 1407 (req_class == VM_ALLOC_SYSTEM && 1408 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || 1409 (req_class == VM_ALLOC_INTERRUPT && 1410 cnt.v_free_count + cnt.v_cache_count > 0)) { 1411 /* 1412 * Allocate from the free queue if the number of free pages 1413 * exceeds the minimum for the request class. 
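		 * (VM_ALLOC_NORMAL must leave at least v_free_reserved pages
		 * behind, VM_ALLOC_SYSTEM may dig down to
		 * v_interrupt_free_min, and VM_ALLOC_INTERRUPT may take the
		 * last free or cached page.)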
		 */
		if (object != NULL &&
		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
				mtx_unlock(&vm_page_queue_free_mtx);
				return (NULL);
			}
			if (vm_phys_unfree_page(m))
				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
#if VM_NRESERVLEVEL > 0
			else if (!vm_reserv_reactivate_page(m))
#else
			else
#endif
				panic("vm_page_alloc: cache page %p is missing"
				    " from the free queue", m);
		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
			mtx_unlock(&vm_page_queue_free_mtx);
			return (NULL);
#if VM_NRESERVLEVEL > 0
		} else if (object == NULL || object->type == OBJT_DEVICE ||
		    object->type == OBJT_SG ||
		    (object->flags & OBJ_COLORED) == 0 ||
		    (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
#else
		} else {
#endif
			m = vm_phys_alloc_pages(object != NULL ?
			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
#if VM_NRESERVLEVEL > 0
			if (m == NULL && vm_reserv_reclaim_inactive()) {
				m = vm_phys_alloc_pages(object != NULL ?
				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
				    0);
			}
#endif
		}
	} else {
		/*
		 * Not allocatable, give up.
		 */
		mtx_unlock(&vm_page_queue_free_mtx);
		atomic_add_int(&vm_pageout_deficit,
		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
		pagedaemon_wakeup();
		return (NULL);
	}

	/*
	 * At this point we had better have found a good page.
	 */
	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
	KASSERT(m->queue == PQ_NONE,
	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
	KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
	    pmap_page_get_memattr(m)));
	if ((m->flags & PG_CACHED) != 0) {
		KASSERT((m->flags & PG_ZERO) == 0,
		    ("vm_page_alloc: cached page %p is PG_ZERO", m));
		KASSERT(m->valid != 0,
		    ("vm_page_alloc: cached page %p is invalid", m));
		if (m->object == object && m->pindex == pindex)
			cnt.v_reactivated++;
		else
			m->valid = 0;
		m_object = m->object;
		vm_page_cache_remove(m);
		if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
			vp = m_object->handle;
	} else {
		KASSERT(VM_PAGE_IS_FREE(m),
		    ("vm_page_alloc: page %p is not free", m));
		KASSERT(m->valid == 0,
		    ("vm_page_alloc: free page %p is valid", m));
		cnt.v_free_count--;
	}

	/*
	 * Only the PG_ZERO flag is inherited.  The PG_CACHED or PG_FREE flag
	 * must be cleared before the free page queues lock is released.
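	 * (Both are cleared here simply by overwriting m->flags with the
	 * freshly computed "flags" value below.)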
	 */
	flags = 0;
	if (req & VM_ALLOC_NODUMP)
		flags |= PG_NODUMP;
	if (m->flags & PG_ZERO) {
		vm_page_zero_count--;
		if (req & VM_ALLOC_ZERO)
			flags |= PG_ZERO;
	}
	m->flags = flags;
	mtx_unlock(&vm_page_queue_free_mtx);
	m->aflags = 0;
	if (object == NULL || object->type == OBJT_PHYS)
		m->oflags = VPO_UNMANAGED;
	else
		m->oflags = 0;
	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
		m->oflags |= VPO_BUSY;
	if (req & VM_ALLOC_WIRED) {
		/*
		 * The page lock is not required for wiring a page until that
		 * page is inserted into the object.
		 */
		atomic_add_int(&cnt.v_wire_count, 1);
		m->wire_count = 1;
	}
	m->act_count = 0;

	if (object != NULL) {
		/* Ignore device objects; the pager sets "memattr" for them. */
		if (object->memattr != VM_MEMATTR_DEFAULT &&
		    object->type != OBJT_DEVICE && object->type != OBJT_SG)
			pmap_page_set_memattr(m, object->memattr);
		vm_page_insert(m, object, pindex);
	} else
		m->pindex = pindex;

	/*
	 * The following call to vdrop() must come after the above call
	 * to vm_page_insert() in case both affect the same object and
	 * vnode.  Otherwise, the affected vnode's hold count could
	 * temporarily become zero.
	 */
	if (vp != NULL)
		vdrop(vp);

	/*
	 * Don't wakeup too often - wakeup the pageout daemon when
	 * we would be nearly out of memory.
	 */
	if (vm_paging_needed())
		pagedaemon_wakeup();

	return (m);
}

/*
 *	vm_page_alloc_contig:
 *
 *	Allocate a contiguous set of physical pages of the given size "npages"
 *	from the free lists.  All of the physical pages must be at or above
 *	the given physical address "low" and below the given physical address
 *	"high".  The given value "alignment" determines the alignment of the
 *	first physical page in the set.  If the given value "boundary" is
 *	non-zero, then the set of physical pages cannot cross any physical
 *	address boundary that is a multiple of that value.  Both "alignment"
 *	and "boundary" must be a power of two.
 *
 *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
 *	then the memory attribute setting for the physical pages is configured
 *	to the object's memory attribute setting.  Otherwise, the memory
 *	attribute setting for the physical pages is configured to "memattr",
 *	overriding the object's memory attribute setting.  However, if the
 *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
 *	memory attribute setting for the physical pages cannot be configured
 *	to VM_MEMATTR_DEFAULT.
 *
 *	The caller must always specify an allocation class.
 *
 *	allocation classes:
 *	VM_ALLOC_NORMAL		normal process request
 *	VM_ALLOC_SYSTEM		system *really* needs a page
 *	VM_ALLOC_INTERRUPT	interrupt time request
 *
 *	optional allocation flags:
 *	VM_ALLOC_NOBUSY		do not set the flag VPO_BUSY on the page
 *	VM_ALLOC_NOOBJ		page is not associated with an object and
 *				should not have the flag VPO_BUSY set
 *	VM_ALLOC_WIRED		wire the allocated page
 *	VM_ALLOC_ZERO		prefer a zeroed page
 *
 *	This routine may not sleep.
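 *
 *	A sketch of a typical contiguous allocation (illustrative only;
 *	the figures assume 4 KiB pages and are not from a real caller):
 *
 *		m = vm_page_alloc_contig(NULL, 0,
 *		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED,
 *		    256, 0, 0xffffffff, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 *
 *	asks for 256 contiguous pages (1 MiB) below 4 GiB with no extra
 *	alignment or boundary restriction.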
1591 */ 1592 vm_page_t 1593 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, 1594 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 1595 vm_paddr_t boundary, vm_memattr_t memattr) 1596 { 1597 struct vnode *drop; 1598 vm_page_t deferred_vdrop_list, m, m_ret; 1599 u_int flags, oflags; 1600 int req_class; 1601 1602 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0), 1603 ("vm_page_alloc_contig: inconsistent object/req")); 1604 if (object != NULL) { 1605 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1606 KASSERT(object->type == OBJT_PHYS, 1607 ("vm_page_alloc_contig: object %p isn't OBJT_PHYS", 1608 object)); 1609 } 1610 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); 1611 req_class = req & VM_ALLOC_CLASS_MASK; 1612 1613 /* 1614 * The page daemon is allowed to dig deeper into the free page list. 1615 */ 1616 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 1617 req_class = VM_ALLOC_SYSTEM; 1618 1619 deferred_vdrop_list = NULL; 1620 mtx_lock(&vm_page_queue_free_mtx); 1621 if (cnt.v_free_count + cnt.v_cache_count >= npages + 1622 cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && 1623 cnt.v_free_count + cnt.v_cache_count >= npages + 1624 cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && 1625 cnt.v_free_count + cnt.v_cache_count >= npages)) { 1626 #if VM_NRESERVLEVEL > 0 1627 retry: 1628 if (object == NULL || (object->flags & OBJ_COLORED) == 0 || 1629 (m_ret = vm_reserv_alloc_contig(object, pindex, npages, 1630 low, high, alignment, boundary)) == NULL) 1631 #endif 1632 m_ret = vm_phys_alloc_contig(npages, low, high, 1633 alignment, boundary); 1634 } else { 1635 mtx_unlock(&vm_page_queue_free_mtx); 1636 atomic_add_int(&vm_pageout_deficit, npages); 1637 pagedaemon_wakeup(); 1638 return (NULL); 1639 } 1640 if (m_ret != NULL) 1641 for (m = m_ret; m < &m_ret[npages]; m++) { 1642 drop = vm_page_alloc_init(m); 1643 if (drop != NULL) { 1644 /* 1645 * Enqueue the vnode for deferred vdrop(). 1646 * 1647 * Once the pages are removed from the free 1648 * page list, "pageq" can be safely abused to 1649 * construct a short-lived list of vnodes. 1650 */ 1651 m->pageq.tqe_prev = (void *)drop; 1652 m->pageq.tqe_next = deferred_vdrop_list; 1653 deferred_vdrop_list = m; 1654 } 1655 } 1656 else { 1657 #if VM_NRESERVLEVEL > 0 1658 if (vm_reserv_reclaim_contig(npages, low, high, alignment, 1659 boundary)) 1660 goto retry; 1661 #endif 1662 } 1663 mtx_unlock(&vm_page_queue_free_mtx); 1664 if (m_ret == NULL) 1665 return (NULL); 1666 1667 /* 1668 * Initialize the pages. Only the PG_ZERO flag is inherited. 1669 */ 1670 flags = 0; 1671 if ((req & VM_ALLOC_ZERO) != 0) 1672 flags = PG_ZERO; 1673 if ((req & VM_ALLOC_NODUMP) != 0) 1674 flags |= PG_NODUMP; 1675 if ((req & VM_ALLOC_WIRED) != 0) 1676 atomic_add_int(&cnt.v_wire_count, npages); 1677 oflags = VPO_UNMANAGED; 1678 if (object != NULL) { 1679 if ((req & VM_ALLOC_NOBUSY) == 0) 1680 oflags |= VPO_BUSY; 1681 if (object->memattr != VM_MEMATTR_DEFAULT && 1682 memattr == VM_MEMATTR_DEFAULT) 1683 memattr = object->memattr; 1684 } 1685 for (m = m_ret; m < &m_ret[npages]; m++) { 1686 m->aflags = 0; 1687 m->flags &= flags; 1688 if ((req & VM_ALLOC_WIRED) != 0) 1689 m->wire_count = 1; 1690 /* Unmanaged pages don't use "act_count". 
*/ 1691 m->oflags = oflags; 1692 if (memattr != VM_MEMATTR_DEFAULT) 1693 pmap_page_set_memattr(m, memattr); 1694 if (object != NULL) 1695 vm_page_insert(m, object, pindex); 1696 else 1697 m->pindex = pindex; 1698 pindex++; 1699 } 1700 while (deferred_vdrop_list != NULL) { 1701 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev); 1702 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next; 1703 } 1704 if (vm_paging_needed()) 1705 pagedaemon_wakeup(); 1706 return (m_ret); 1707 } 1708 1709 /* 1710 * Initialize a page that has been freshly dequeued from a freelist. 1711 * The caller has to drop the vnode returned, if it is not NULL. 1712 * 1713 * This function may only be used to initialize unmanaged pages. 1714 * 1715 * To be called with vm_page_queue_free_mtx held. 1716 */ 1717 static struct vnode * 1718 vm_page_alloc_init(vm_page_t m) 1719 { 1720 struct vnode *drop; 1721 vm_object_t m_object; 1722 1723 KASSERT(m->queue == PQ_NONE, 1724 ("vm_page_alloc_init: page %p has unexpected queue %d", 1725 m, m->queue)); 1726 KASSERT(m->wire_count == 0, 1727 ("vm_page_alloc_init: page %p is wired", m)); 1728 KASSERT(m->hold_count == 0, 1729 ("vm_page_alloc_init: page %p is held", m)); 1730 KASSERT(m->busy == 0, 1731 ("vm_page_alloc_init: page %p is busy", m)); 1732 KASSERT(m->dirty == 0, 1733 ("vm_page_alloc_init: page %p is dirty", m)); 1734 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 1735 ("vm_page_alloc_init: page %p has unexpected memattr %d", 1736 m, pmap_page_get_memattr(m))); 1737 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1738 drop = NULL; 1739 if ((m->flags & PG_CACHED) != 0) { 1740 KASSERT((m->flags & PG_ZERO) == 0, 1741 ("vm_page_alloc_init: cached page %p is PG_ZERO", m)); 1742 m->valid = 0; 1743 m_object = m->object; 1744 vm_page_cache_remove(m); 1745 if (m_object->type == OBJT_VNODE && m_object->cache == NULL) 1746 drop = m_object->handle; 1747 } else { 1748 KASSERT(VM_PAGE_IS_FREE(m), 1749 ("vm_page_alloc_init: page %p is not free", m)); 1750 KASSERT(m->valid == 0, 1751 ("vm_page_alloc_init: free page %p is valid", m)); 1752 cnt.v_free_count--; 1753 if ((m->flags & PG_ZERO) != 0) 1754 vm_page_zero_count--; 1755 } 1756 /* Don't clear the PG_ZERO flag; we'll need it later. */ 1757 m->flags &= PG_ZERO; 1758 return (drop); 1759 } 1760 1761 /* 1762 * vm_page_alloc_freelist: 1763 * 1764 * Allocate a physical page from the specified free page list. 1765 * 1766 * The caller must always specify an allocation class. 1767 * 1768 * allocation classes: 1769 * VM_ALLOC_NORMAL normal process request 1770 * VM_ALLOC_SYSTEM system *really* needs a page 1771 * VM_ALLOC_INTERRUPT interrupt time request 1772 * 1773 * optional allocation flags: 1774 * VM_ALLOC_COUNT(number) the number of additional pages that the caller 1775 * intends to allocate 1776 * VM_ALLOC_WIRED wire the allocated page 1777 * VM_ALLOC_ZERO prefer a zeroed page 1778 * 1779 * This routine may not sleep. 1780 */ 1781 vm_page_t 1782 vm_page_alloc_freelist(int flind, int req) 1783 { 1784 struct vnode *drop; 1785 vm_page_t m; 1786 u_int flags; 1787 int req_class; 1788 1789 req_class = req & VM_ALLOC_CLASS_MASK; 1790 1791 /* 1792 * The page daemon is allowed to dig deeper into the free page list. 1793 */ 1794 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 1795 req_class = VM_ALLOC_SYSTEM; 1796 1797 /* 1798 * Do not allocate reserved pages unless the req has asked for it. 
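	 * (The depth rules are the same as in vm_page_alloc() above: only
	 * VM_ALLOC_INTERRUPT may take the last free or cached page.)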
1799 */ 1800 mtx_lock(&vm_page_queue_free_mtx); 1801 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || 1802 (req_class == VM_ALLOC_SYSTEM && 1803 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || 1804 (req_class == VM_ALLOC_INTERRUPT && 1805 cnt.v_free_count + cnt.v_cache_count > 0)) 1806 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); 1807 else { 1808 mtx_unlock(&vm_page_queue_free_mtx); 1809 atomic_add_int(&vm_pageout_deficit, 1810 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); 1811 pagedaemon_wakeup(); 1812 return (NULL); 1813 } 1814 if (m == NULL) { 1815 mtx_unlock(&vm_page_queue_free_mtx); 1816 return (NULL); 1817 } 1818 drop = vm_page_alloc_init(m); 1819 mtx_unlock(&vm_page_queue_free_mtx); 1820 1821 /* 1822 * Initialize the page. Only the PG_ZERO flag is inherited. 1823 */ 1824 m->aflags = 0; 1825 flags = 0; 1826 if ((req & VM_ALLOC_ZERO) != 0) 1827 flags = PG_ZERO; 1828 m->flags &= flags; 1829 if ((req & VM_ALLOC_WIRED) != 0) { 1830 /* 1831 * The page lock is not required for wiring a page that does 1832 * not belong to an object. 1833 */ 1834 atomic_add_int(&cnt.v_wire_count, 1); 1835 m->wire_count = 1; 1836 } 1837 /* Unmanaged pages don't use "act_count". */ 1838 m->oflags = VPO_UNMANAGED; 1839 if (drop != NULL) 1840 vdrop(drop); 1841 if (vm_paging_needed()) 1842 pagedaemon_wakeup(); 1843 return (m); 1844 } 1845 1846 /* 1847 * vm_wait: (also see VM_WAIT macro) 1848 * 1849 * Block until free pages are available for allocation 1850 * - Called in various places before memory allocations. 1851 */ 1852 void 1853 vm_wait(void) 1854 { 1855 1856 mtx_lock(&vm_page_queue_free_mtx); 1857 if (curproc == pageproc) { 1858 vm_pageout_pages_needed = 1; 1859 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, 1860 PDROP | PSWP, "VMWait", 0); 1861 } else { 1862 if (!vm_pages_needed) { 1863 vm_pages_needed = 1; 1864 wakeup(&vm_pages_needed); 1865 } 1866 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, 1867 "vmwait", 0); 1868 } 1869 } 1870 1871 /* 1872 * vm_waitpfault: (also see VM_WAITPFAULT macro) 1873 * 1874 * Block until free pages are available for allocation 1875 * - Called only in vm_fault so that processes page faulting 1876 * can be easily tracked. 1877 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing 1878 * processes will be able to grab memory first. Do not change 1879 * this balance without careful testing first. 1880 */ 1881 void 1882 vm_waitpfault(void) 1883 { 1884 1885 mtx_lock(&vm_page_queue_free_mtx); 1886 if (!vm_pages_needed) { 1887 vm_pages_needed = 1; 1888 wakeup(&vm_pages_needed); 1889 } 1890 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, 1891 "pfault", 0); 1892 } 1893 1894 /* 1895 * vm_page_requeue: 1896 * 1897 * Move the given page to the tail of its present page queue. 1898 * 1899 * The page queues must be locked. 1900 */ 1901 void 1902 vm_page_requeue(vm_page_t m) 1903 { 1904 struct vpgqueues *vpq; 1905 int queue; 1906 1907 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1908 queue = m->queue; 1909 KASSERT(queue != PQ_NONE, 1910 ("vm_page_requeue: page %p is not queued", m)); 1911 vpq = &vm_page_queues[queue]; 1912 TAILQ_REMOVE(&vpq->pl, m, pageq); 1913 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 1914 } 1915 1916 /* 1917 * vm_page_queue_remove: 1918 * 1919 * Remove the given page from the specified queue. 1920 * 1921 * The page and page queues must be locked. 
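 *
 *	(vm_pageq_remove() below is the usual entry point: it takes the
 *	page queues lock, clears m->queue and then calls this helper.)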
1922 */ 1923 static __inline void 1924 vm_page_queue_remove(int queue, vm_page_t m) 1925 { 1926 struct vpgqueues *pq; 1927 1928 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1929 vm_page_lock_assert(m, MA_OWNED); 1930 pq = &vm_page_queues[queue]; 1931 TAILQ_REMOVE(&pq->pl, m, pageq); 1932 (*pq->cnt)--; 1933 } 1934 1935 /* 1936 * vm_pageq_remove: 1937 * 1938 * Remove a page from its queue. 1939 * 1940 * The given page must be locked. 1941 * This routine may not block. 1942 */ 1943 void 1944 vm_pageq_remove(vm_page_t m) 1945 { 1946 int queue; 1947 1948 vm_page_lock_assert(m, MA_OWNED); 1949 if ((queue = m->queue) != PQ_NONE) { 1950 vm_page_lock_queues(); 1951 m->queue = PQ_NONE; 1952 vm_page_queue_remove(queue, m); 1953 vm_page_unlock_queues(); 1954 } 1955 } 1956 1957 /* 1958 * vm_page_enqueue: 1959 * 1960 * Add the given page to the specified queue. 1961 * 1962 * The page queues must be locked. 1963 */ 1964 static void 1965 vm_page_enqueue(int queue, vm_page_t m) 1966 { 1967 struct vpgqueues *vpq; 1968 1969 vpq = &vm_page_queues[queue]; 1970 m->queue = queue; 1971 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 1972 ++*vpq->cnt; 1973 } 1974 1975 /* 1976 * vm_page_activate: 1977 * 1978 * Put the specified page on the active list (if appropriate). 1979 * Ensure that act_count is at least ACT_INIT but do not otherwise 1980 * mess with it. 1981 * 1982 * The page must be locked. 1983 * This routine may not block. 1984 */ 1985 void 1986 vm_page_activate(vm_page_t m) 1987 { 1988 int queue; 1989 1990 vm_page_lock_assert(m, MA_OWNED); 1991 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1992 if ((queue = m->queue) != PQ_ACTIVE) { 1993 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 1994 if (m->act_count < ACT_INIT) 1995 m->act_count = ACT_INIT; 1996 vm_page_lock_queues(); 1997 if (queue != PQ_NONE) 1998 vm_page_queue_remove(queue, m); 1999 vm_page_enqueue(PQ_ACTIVE, m); 2000 vm_page_unlock_queues(); 2001 } else 2002 KASSERT(queue == PQ_NONE, 2003 ("vm_page_activate: wired page %p is queued", m)); 2004 } else { 2005 if (m->act_count < ACT_INIT) 2006 m->act_count = ACT_INIT; 2007 } 2008 } 2009 2010 /* 2011 * vm_page_free_wakeup: 2012 * 2013 * Helper routine for vm_page_free_toq() and vm_page_cache(). This 2014 * routine is called when a page has been added to the cache or free 2015 * queues. 2016 * 2017 * The page queues must be locked. 2018 * This routine may not block. 2019 */ 2020 static inline void 2021 vm_page_free_wakeup(void) 2022 { 2023 2024 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 2025 /* 2026 * if pageout daemon needs pages, then tell it that there are 2027 * some free. 2028 */ 2029 if (vm_pageout_pages_needed && 2030 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { 2031 wakeup(&vm_pageout_pages_needed); 2032 vm_pageout_pages_needed = 0; 2033 } 2034 /* 2035 * wakeup processes that are waiting on memory if we hit a 2036 * high water mark. And wakeup scheduler process if we have 2037 * lots of memory. this process will swapin processes. 2038 */ 2039 if (vm_pages_needed && !vm_page_count_min()) { 2040 vm_pages_needed = 0; 2041 wakeup(&cnt.v_free_count); 2042 } 2043 } 2044 2045 /* 2046 * vm_page_free_toq: 2047 * 2048 * Returns the given page to the free list, 2049 * disassociating it with any VM object. 2050 * 2051 * Object and page must be locked prior to entry. 2052 * This routine may not block. 
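 *
 * Callers normally reach this function through the vm_page_free()
 * wrapper used elsewhere in this file (e.g. by vm_page_try_to_free()).
 * A minimal sketch for a managed page:
 *
 *	vm_page_lock(m);
 *	vm_page_free(m);
 *	vm_page_unlock(m);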
2053 */ 2054 2055 void 2056 vm_page_free_toq(vm_page_t m) 2057 { 2058 2059 if ((m->oflags & VPO_UNMANAGED) == 0) { 2060 vm_page_lock_assert(m, MA_OWNED); 2061 KASSERT(!pmap_page_is_mapped(m), 2062 ("vm_page_free_toq: freeing mapped page %p", m)); 2063 } 2064 PCPU_INC(cnt.v_tfree); 2065 2066 if (VM_PAGE_IS_FREE(m)) 2067 panic("vm_page_free: freeing free page %p", m); 2068 else if (m->busy != 0) 2069 panic("vm_page_free: freeing busy page %p", m); 2070 2071 /* 2072 * unqueue, then remove page. Note that we cannot destroy 2073 * the page here because we do not want to call the pager's 2074 * callback routine until after we've put the page on the 2075 * appropriate free queue. 2076 */ 2077 if ((m->oflags & VPO_UNMANAGED) == 0) 2078 vm_pageq_remove(m); 2079 vm_page_remove(m); 2080 2081 /* 2082 * If fictitious remove object association and 2083 * return, otherwise delay object association removal. 2084 */ 2085 if ((m->flags & PG_FICTITIOUS) != 0) { 2086 return; 2087 } 2088 2089 m->valid = 0; 2090 vm_page_undirty(m); 2091 2092 if (m->wire_count != 0) 2093 panic("vm_page_free: freeing wired page %p", m); 2094 if (m->hold_count != 0) { 2095 m->flags &= ~PG_ZERO; 2096 vm_page_lock_queues(); 2097 vm_page_enqueue(PQ_HOLD, m); 2098 vm_page_unlock_queues(); 2099 } else { 2100 /* 2101 * Restore the default memory attribute to the page. 2102 */ 2103 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 2104 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 2105 2106 /* 2107 * Insert the page into the physical memory allocator's 2108 * cache/free page queues. 2109 */ 2110 mtx_lock(&vm_page_queue_free_mtx); 2111 m->flags |= PG_FREE; 2112 cnt.v_free_count++; 2113 #if VM_NRESERVLEVEL > 0 2114 if (!vm_reserv_free_page(m)) 2115 #else 2116 if (TRUE) 2117 #endif 2118 vm_phys_free_pages(m, 0); 2119 if ((m->flags & PG_ZERO) != 0) 2120 ++vm_page_zero_count; 2121 else 2122 vm_page_zero_idle_wakeup(); 2123 vm_page_free_wakeup(); 2124 mtx_unlock(&vm_page_queue_free_mtx); 2125 } 2126 } 2127 2128 /* 2129 * vm_page_wire: 2130 * 2131 * Mark this page as wired down by yet 2132 * another map, removing it from paging queues 2133 * as necessary. 2134 * 2135 * If the page is fictitious, then its wire count must remain one. 2136 * 2137 * The page must be locked. 2138 * This routine may not block. 2139 */ 2140 void 2141 vm_page_wire(vm_page_t m) 2142 { 2143 2144 /* 2145 * Only bump the wire statistics if the page is not already wired, 2146 * and only unqueue the page if it is on some queue (if it is unmanaged 2147 * it is already off the queues). 2148 */ 2149 vm_page_lock_assert(m, MA_OWNED); 2150 if ((m->flags & PG_FICTITIOUS) != 0) { 2151 KASSERT(m->wire_count == 1, 2152 ("vm_page_wire: fictitious page %p's wire count isn't one", 2153 m)); 2154 return; 2155 } 2156 if (m->wire_count == 0) { 2157 if ((m->oflags & VPO_UNMANAGED) == 0) 2158 vm_pageq_remove(m); 2159 atomic_add_int(&cnt.v_wire_count, 1); 2160 } 2161 m->wire_count++; 2162 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); 2163 } 2164 2165 /* 2166 * vm_page_unwire: 2167 * 2168 * Release one wiring of the specified page, potentially enabling it to be 2169 * paged again. If paging is enabled, then the value of the parameter 2170 * "activate" determines to which queue the page is added. If "activate" is 2171 * non-zero, then the page is added to the active queue. Otherwise, it is 2172 * added to the inactive queue. 2173 * 2174 * However, unless the page belongs to an object, it is not enqueued because 2175 * it cannot be paged out. 
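 *
 * Illustrative sketch of dropping a wiring of a managed page that
 * belongs to an object; passing 0 as "activate" places the page on the
 * inactive queue once its last wiring is released:
 *
 *	vm_page_lock(m);
 *	vm_page_unwire(m, 0);
 *	vm_page_unlock(m);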
2176 * 2177 * If a page is fictitious, then its wire count must alway be one. 2178 * 2179 * A managed page must be locked. 2180 */ 2181 void 2182 vm_page_unwire(vm_page_t m, int activate) 2183 { 2184 2185 if ((m->oflags & VPO_UNMANAGED) == 0) 2186 vm_page_lock_assert(m, MA_OWNED); 2187 if ((m->flags & PG_FICTITIOUS) != 0) { 2188 KASSERT(m->wire_count == 1, 2189 ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); 2190 return; 2191 } 2192 if (m->wire_count > 0) { 2193 m->wire_count--; 2194 if (m->wire_count == 0) { 2195 atomic_subtract_int(&cnt.v_wire_count, 1); 2196 if ((m->oflags & VPO_UNMANAGED) != 0 || 2197 m->object == NULL) 2198 return; 2199 if (!activate) 2200 m->flags &= ~PG_WINATCFLS; 2201 vm_page_lock_queues(); 2202 vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m); 2203 vm_page_unlock_queues(); 2204 } 2205 } else 2206 panic("vm_page_unwire: page %p's wire count is zero", m); 2207 } 2208 2209 /* 2210 * Move the specified page to the inactive queue. 2211 * 2212 * Many pages placed on the inactive queue should actually go 2213 * into the cache, but it is difficult to figure out which. What 2214 * we do instead, if the inactive target is well met, is to put 2215 * clean pages at the head of the inactive queue instead of the tail. 2216 * This will cause them to be moved to the cache more quickly and 2217 * if not actively re-referenced, reclaimed more quickly. If we just 2218 * stick these pages at the end of the inactive queue, heavy filesystem 2219 * meta-data accesses can cause an unnecessary paging load on memory bound 2220 * processes. This optimization causes one-time-use metadata to be 2221 * reused more quickly. 2222 * 2223 * Normally athead is 0 resulting in LRU operation. athead is set 2224 * to 1 if we want this page to be 'as if it were placed in the cache', 2225 * except without unmapping it from the process address space. 2226 * 2227 * This routine may not block. 2228 */ 2229 static inline void 2230 _vm_page_deactivate(vm_page_t m, int athead) 2231 { 2232 int queue; 2233 2234 vm_page_lock_assert(m, MA_OWNED); 2235 2236 /* 2237 * Ignore if already inactive. 2238 */ 2239 if ((queue = m->queue) == PQ_INACTIVE) 2240 return; 2241 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 2242 m->flags &= ~PG_WINATCFLS; 2243 vm_page_lock_queues(); 2244 if (queue != PQ_NONE) 2245 vm_page_queue_remove(queue, m); 2246 if (athead) 2247 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, 2248 pageq); 2249 else 2250 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, 2251 pageq); 2252 m->queue = PQ_INACTIVE; 2253 cnt.v_inactive_count++; 2254 vm_page_unlock_queues(); 2255 } 2256 } 2257 2258 /* 2259 * Move the specified page to the inactive queue. 2260 * 2261 * The page must be locked. 2262 */ 2263 void 2264 vm_page_deactivate(vm_page_t m) 2265 { 2266 2267 _vm_page_deactivate(m, 0); 2268 } 2269 2270 /* 2271 * vm_page_try_to_cache: 2272 * 2273 * Returns 0 on failure, 1 on success 2274 */ 2275 int 2276 vm_page_try_to_cache(vm_page_t m) 2277 { 2278 2279 vm_page_lock_assert(m, MA_OWNED); 2280 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2281 if (m->dirty || m->hold_count || m->busy || m->wire_count || 2282 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0) 2283 return (0); 2284 pmap_remove_all(m); 2285 if (m->dirty) 2286 return (0); 2287 vm_page_cache(m); 2288 return (1); 2289 } 2290 2291 /* 2292 * vm_page_try_to_free() 2293 * 2294 * Attempt to free the page. If we cannot free it, we do nothing. 2295 * 1 is returned on success, 0 on failure. 
2296 */ 2297 int 2298 vm_page_try_to_free(vm_page_t m) 2299 { 2300 2301 vm_page_lock_assert(m, MA_OWNED); 2302 if (m->object != NULL) 2303 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2304 if (m->dirty || m->hold_count || m->busy || m->wire_count || 2305 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0) 2306 return (0); 2307 pmap_remove_all(m); 2308 if (m->dirty) 2309 return (0); 2310 vm_page_free(m); 2311 return (1); 2312 } 2313 2314 /* 2315 * vm_page_cache 2316 * 2317 * Put the specified page onto the page cache queue (if appropriate). 2318 * 2319 * This routine may not block. 2320 */ 2321 void 2322 vm_page_cache(vm_page_t m) 2323 { 2324 vm_object_t object; 2325 vm_page_t next, prev, root; 2326 2327 vm_page_lock_assert(m, MA_OWNED); 2328 object = m->object; 2329 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2330 if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy || 2331 m->hold_count || m->wire_count) 2332 panic("vm_page_cache: attempting to cache busy page"); 2333 pmap_remove_all(m); 2334 if (m->dirty != 0) 2335 panic("vm_page_cache: page %p is dirty", m); 2336 if (m->valid == 0 || object->type == OBJT_DEFAULT || 2337 (object->type == OBJT_SWAP && 2338 !vm_pager_has_page(object, m->pindex, NULL, NULL))) { 2339 /* 2340 * Hypothesis: A cache-elgible page belonging to a 2341 * default object or swap object but without a backing 2342 * store must be zero filled. 2343 */ 2344 vm_page_free(m); 2345 return; 2346 } 2347 KASSERT((m->flags & PG_CACHED) == 0, 2348 ("vm_page_cache: page %p is already cached", m)); 2349 PCPU_INC(cnt.v_tcached); 2350 2351 /* 2352 * Remove the page from the paging queues. 2353 */ 2354 vm_pageq_remove(m); 2355 2356 /* 2357 * Remove the page from the object's collection of resident 2358 * pages. 2359 */ 2360 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { 2361 /* 2362 * Since the page's successor in the list is also its parent 2363 * in the tree, its right subtree must be empty. 2364 */ 2365 next->left = m->left; 2366 KASSERT(m->right == NULL, 2367 ("vm_page_cache: page %p has right child", m)); 2368 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && 2369 prev->right == m) { 2370 /* 2371 * Since the page's predecessor in the list is also its parent 2372 * in the tree, its left subtree must be empty. 2373 */ 2374 KASSERT(m->left == NULL, 2375 ("vm_page_cache: page %p has left child", m)); 2376 prev->right = m->right; 2377 } else { 2378 if (m != object->root) 2379 vm_page_splay(m->pindex, object->root); 2380 if (m->left == NULL) 2381 root = m->right; 2382 else if (m->right == NULL) 2383 root = m->left; 2384 else { 2385 /* 2386 * Move the page's successor to the root, because 2387 * pages are usually removed in ascending order. 2388 */ 2389 if (m->right != next) 2390 vm_page_splay(m->pindex, m->right); 2391 next->left = m->left; 2392 root = next; 2393 } 2394 object->root = root; 2395 } 2396 TAILQ_REMOVE(&object->memq, m, listq); 2397 object->resident_page_count--; 2398 2399 /* 2400 * Restore the default memory attribute to the page. 2401 */ 2402 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 2403 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 2404 2405 /* 2406 * Insert the page into the object's collection of cached pages 2407 * and the physical memory allocator's cache/free page queues. 
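 *
 * The cached pages of an object form a splay tree rooted at
 * object->cache.  The code below splays that tree around m->pindex and
 * installs m as the new root, attaching the old root as m's left or
 * right child according to the pindex comparison; a duplicate pindex
 * is a bug and panics.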
2408 */ 2409 m->flags &= ~PG_ZERO; 2410 mtx_lock(&vm_page_queue_free_mtx); 2411 m->flags |= PG_CACHED; 2412 cnt.v_cache_count++; 2413 root = object->cache; 2414 if (root == NULL) { 2415 m->left = NULL; 2416 m->right = NULL; 2417 } else { 2418 root = vm_page_splay(m->pindex, root); 2419 if (m->pindex < root->pindex) { 2420 m->left = root->left; 2421 m->right = root; 2422 root->left = NULL; 2423 } else if (__predict_false(m->pindex == root->pindex)) 2424 panic("vm_page_cache: offset already cached"); 2425 else { 2426 m->right = root->right; 2427 m->left = root; 2428 root->right = NULL; 2429 } 2430 } 2431 object->cache = m; 2432 #if VM_NRESERVLEVEL > 0 2433 if (!vm_reserv_free_page(m)) { 2434 #else 2435 if (TRUE) { 2436 #endif 2437 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0); 2438 vm_phys_free_pages(m, 0); 2439 } 2440 vm_page_free_wakeup(); 2441 mtx_unlock(&vm_page_queue_free_mtx); 2442 2443 /* 2444 * Increment the vnode's hold count if this is the object's only 2445 * cached page. Decrement the vnode's hold count if this was 2446 * the object's only resident page. 2447 */ 2448 if (object->type == OBJT_VNODE) { 2449 if (root == NULL && object->resident_page_count != 0) 2450 vhold(object->handle); 2451 else if (root != NULL && object->resident_page_count == 0) 2452 vdrop(object->handle); 2453 } 2454 } 2455 2456 /* 2457 * vm_page_dontneed 2458 * 2459 * Cache, deactivate, or do nothing as appropriate. This routine 2460 * is typically used by madvise() MADV_DONTNEED. 2461 * 2462 * Generally speaking we want to move the page into the cache so 2463 * it gets reused quickly. However, this can result in a silly syndrome 2464 * due to the page recycling too quickly. Small objects will not be 2465 * fully cached. On the otherhand, if we move the page to the inactive 2466 * queue we wind up with a problem whereby very large objects 2467 * unnecessarily blow away our inactive and cache queues. 2468 * 2469 * The solution is to move the pages based on a fixed weighting. We 2470 * either leave them alone, deactivate them, or move them to the cache, 2471 * where moving them to the cache has the highest weighting. 2472 * By forcing some pages into other queues we eventually force the 2473 * system to balance the queues, potentially recovering other unrelated 2474 * space from active. The idea is to not force this to happen too 2475 * often. 2476 */ 2477 void 2478 vm_page_dontneed(vm_page_t m) 2479 { 2480 int dnw; 2481 int head; 2482 2483 vm_page_lock_assert(m, MA_OWNED); 2484 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2485 dnw = PCPU_GET(dnweight); 2486 PCPU_INC(dnweight); 2487 2488 /* 2489 * Occasionally leave the page alone. 2490 */ 2491 if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) { 2492 if (m->act_count >= ACT_INIT) 2493 --m->act_count; 2494 return; 2495 } 2496 2497 /* 2498 * Clear any references to the page. Otherwise, the page daemon will 2499 * immediately reactivate the page. 2500 * 2501 * Perform the pmap_clear_reference() first. Otherwise, a concurrent 2502 * pmap operation, such as pmap_remove(), could clear a reference in 2503 * the pmap and set PGA_REFERENCED on the page before the 2504 * pmap_clear_reference() had completed. Consequently, the page would 2505 * appear referenced based upon an old reference that occurred before 2506 * this function ran. 
2507 */ 2508 pmap_clear_reference(m); 2509 vm_page_aflag_clear(m, PGA_REFERENCED); 2510 2511 if (m->dirty == 0 && pmap_is_modified(m)) 2512 vm_page_dirty(m); 2513 2514 if (m->dirty || (dnw & 0x0070) == 0) { 2515 /* 2516 * Deactivate the page 3 times out of 32. 2517 */ 2518 head = 0; 2519 } else { 2520 /* 2521 * Cache the page 28 times out of every 32. Note that 2522 * the page is deactivated instead of cached, but placed 2523 * at the head of the queue instead of the tail. 2524 */ 2525 head = 1; 2526 } 2527 _vm_page_deactivate(m, head); 2528 } 2529 2530 /* 2531 * Grab a page, waiting until we are waken up due to the page 2532 * changing state. We keep on waiting, if the page continues 2533 * to be in the object. If the page doesn't exist, first allocate it 2534 * and then conditionally zero it. 2535 * 2536 * The caller must always specify the VM_ALLOC_RETRY flag. This is intended 2537 * to facilitate its eventual removal. 2538 * 2539 * This routine may block. 2540 */ 2541 vm_page_t 2542 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) 2543 { 2544 vm_page_t m; 2545 2546 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2547 KASSERT((allocflags & VM_ALLOC_RETRY) != 0, 2548 ("vm_page_grab: VM_ALLOC_RETRY is required")); 2549 retrylookup: 2550 if ((m = vm_page_lookup(object, pindex)) != NULL) { 2551 if ((m->oflags & VPO_BUSY) != 0 || 2552 ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) { 2553 /* 2554 * Reference the page before unlocking and 2555 * sleeping so that the page daemon is less 2556 * likely to reclaim it. 2557 */ 2558 vm_page_aflag_set(m, PGA_REFERENCED); 2559 vm_page_sleep(m, "pgrbwt"); 2560 goto retrylookup; 2561 } else { 2562 if ((allocflags & VM_ALLOC_WIRED) != 0) { 2563 vm_page_lock(m); 2564 vm_page_wire(m); 2565 vm_page_unlock(m); 2566 } 2567 if ((allocflags & VM_ALLOC_NOBUSY) == 0) 2568 vm_page_busy(m); 2569 return (m); 2570 } 2571 } 2572 m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY | 2573 VM_ALLOC_IGN_SBUSY)); 2574 if (m == NULL) { 2575 VM_OBJECT_UNLOCK(object); 2576 VM_WAIT; 2577 VM_OBJECT_LOCK(object); 2578 goto retrylookup; 2579 } else if (m->valid != 0) 2580 return (m); 2581 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) 2582 pmap_zero_page(m); 2583 return (m); 2584 } 2585 2586 /* 2587 * Mapping function for valid bits or for dirty bits in 2588 * a page. May not block. 2589 * 2590 * Inputs are required to range within a page. 2591 */ 2592 vm_page_bits_t 2593 vm_page_bits(int base, int size) 2594 { 2595 int first_bit; 2596 int last_bit; 2597 2598 KASSERT( 2599 base + size <= PAGE_SIZE, 2600 ("vm_page_bits: illegal base/size %d/%d", base, size) 2601 ); 2602 2603 if (size == 0) /* handle degenerate case */ 2604 return (0); 2605 2606 first_bit = base >> DEV_BSHIFT; 2607 last_bit = (base + size - 1) >> DEV_BSHIFT; 2608 2609 return (((vm_page_bits_t)2 << last_bit) - 2610 ((vm_page_bits_t)1 << first_bit)); 2611 } 2612 2613 /* 2614 * vm_page_set_valid_range: 2615 * 2616 * Sets portions of a page valid. The arguments are expected 2617 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 2618 * of any partial chunks touched by the range. The invalid portion of 2619 * such chunks will be zeroed. 2620 * 2621 * (base + size) must be less then or equal to PAGE_SIZE. 
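 *
 * Worked example, assuming a DEV_BSIZE of 512: a call with base 100 and
 * size 900 touches byte offsets 100-999, i.e. blocks 0 and 1, so
 * vm_page_bits(100, 900) is 0x3 and both valid bits get set; the
 * partially covered head (offsets 0-99) and tail (offsets 1000-1023)
 * are zeroed first if their blocks were not already valid.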
2622 */ 2623 void 2624 vm_page_set_valid_range(vm_page_t m, int base, int size) 2625 { 2626 int endoff, frag; 2627 2628 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2629 if (size == 0) /* handle degenerate case */ 2630 return; 2631 2632 /* 2633 * If the base is not DEV_BSIZE aligned and the valid 2634 * bit is clear, we have to zero out a portion of the 2635 * first block. 2636 */ 2637 if ((frag = base & ~(DEV_BSIZE - 1)) != base && 2638 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) 2639 pmap_zero_page_area(m, frag, base - frag); 2640 2641 /* 2642 * If the ending offset is not DEV_BSIZE aligned and the 2643 * valid bit is clear, we have to zero out a portion of 2644 * the last block. 2645 */ 2646 endoff = base + size; 2647 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && 2648 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) 2649 pmap_zero_page_area(m, endoff, 2650 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 2651 2652 /* 2653 * Assert that no previously invalid block that is now being validated 2654 * is already dirty. 2655 */ 2656 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, 2657 ("vm_page_set_valid_range: page %p is dirty", m)); 2658 2659 /* 2660 * Set valid bits inclusive of any overlap. 2661 */ 2662 m->valid |= vm_page_bits(base, size); 2663 } 2664 2665 /* 2666 * Clear the given bits from the specified page's dirty field. 2667 */ 2668 static __inline void 2669 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) 2670 { 2671 uintptr_t addr; 2672 #if PAGE_SIZE < 16384 2673 int shift; 2674 #endif 2675 2676 /* 2677 * If the object is locked and the page is neither VPO_BUSY nor 2678 * PGA_WRITEABLE, then the page's dirty field cannot possibly be 2679 * set by a concurrent pmap operation. 2680 */ 2681 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2682 if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0) 2683 m->dirty &= ~pagebits; 2684 else { 2685 /* 2686 * The pmap layer can call vm_page_dirty() without 2687 * holding a distinguished lock. The combination of 2688 * the object's lock and an atomic operation suffice 2689 * to guarantee consistency of the page dirty field. 2690 * 2691 * For PAGE_SIZE == 32768 case, compiler already 2692 * properly aligns the dirty field, so no forcible 2693 * alignment is needed. Only require existence of 2694 * atomic_clear_64 when page size is 32768. 2695 */ 2696 addr = (uintptr_t)&m->dirty; 2697 #if PAGE_SIZE == 32768 2698 atomic_clear_64((uint64_t *)addr, pagebits); 2699 #elif PAGE_SIZE == 16384 2700 atomic_clear_32((uint32_t *)addr, pagebits); 2701 #else /* PAGE_SIZE <= 8192 */ 2702 /* 2703 * Use a trick to perform a 32-bit atomic on the 2704 * containing aligned word, to not depend on the existence 2705 * of atomic_clear_{8, 16}. 2706 */ 2707 shift = addr & (sizeof(uint32_t) - 1); 2708 #if BYTE_ORDER == BIG_ENDIAN 2709 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; 2710 #else 2711 shift *= NBBY; 2712 #endif 2713 addr &= ~(sizeof(uint32_t) - 1); 2714 atomic_clear_32((uint32_t *)addr, pagebits << shift); 2715 #endif /* PAGE_SIZE */ 2716 } 2717 } 2718 2719 /* 2720 * vm_page_set_validclean: 2721 * 2722 * Sets portions of a page valid and clean. The arguments are expected 2723 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 2724 * of any partial chunks touched by the range. The invalid portion of 2725 * such chunks will be zero'd. 2726 * 2727 * This routine may not block. 2728 * 2729 * (base + size) must be less then or equal to PAGE_SIZE. 
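 *
 * Sketch of the common whole-page case: vm_page_set_validclean(m, 0,
 * PAGE_SIZE) on a page that was already fully valid clears the pmap
 * modify bit first and then clears the entire dirty mask together with
 * the VPO_NOSYNC flag.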
2730 */ 2731 void 2732 vm_page_set_validclean(vm_page_t m, int base, int size) 2733 { 2734 vm_page_bits_t oldvalid, pagebits; 2735 int endoff, frag; 2736 2737 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2738 if (size == 0) /* handle degenerate case */ 2739 return; 2740 2741 /* 2742 * If the base is not DEV_BSIZE aligned and the valid 2743 * bit is clear, we have to zero out a portion of the 2744 * first block. 2745 */ 2746 if ((frag = base & ~(DEV_BSIZE - 1)) != base && 2747 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) 2748 pmap_zero_page_area(m, frag, base - frag); 2749 2750 /* 2751 * If the ending offset is not DEV_BSIZE aligned and the 2752 * valid bit is clear, we have to zero out a portion of 2753 * the last block. 2754 */ 2755 endoff = base + size; 2756 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && 2757 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) 2758 pmap_zero_page_area(m, endoff, 2759 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 2760 2761 /* 2762 * Set valid, clear dirty bits. If validating the entire 2763 * page we can safely clear the pmap modify bit. We also 2764 * use this opportunity to clear the VPO_NOSYNC flag. If a process 2765 * takes a write fault on a MAP_NOSYNC memory area the flag will 2766 * be set again. 2767 * 2768 * We set valid bits inclusive of any overlap, but we can only 2769 * clear dirty bits for DEV_BSIZE chunks that are fully within 2770 * the range. 2771 */ 2772 oldvalid = m->valid; 2773 pagebits = vm_page_bits(base, size); 2774 m->valid |= pagebits; 2775 #if 0 /* NOT YET */ 2776 if ((frag = base & (DEV_BSIZE - 1)) != 0) { 2777 frag = DEV_BSIZE - frag; 2778 base += frag; 2779 size -= frag; 2780 if (size < 0) 2781 size = 0; 2782 } 2783 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); 2784 #endif 2785 if (base == 0 && size == PAGE_SIZE) { 2786 /* 2787 * The page can only be modified within the pmap if it is 2788 * mapped, and it can only be mapped if it was previously 2789 * fully valid. 2790 */ 2791 if (oldvalid == VM_PAGE_BITS_ALL) 2792 /* 2793 * Perform the pmap_clear_modify() first. Otherwise, 2794 * a concurrent pmap operation, such as 2795 * pmap_protect(), could clear a modification in the 2796 * pmap and set the dirty field on the page before 2797 * pmap_clear_modify() had begun and after the dirty 2798 * field was cleared here. 2799 */ 2800 pmap_clear_modify(m); 2801 m->dirty = 0; 2802 m->oflags &= ~VPO_NOSYNC; 2803 } else if (oldvalid != VM_PAGE_BITS_ALL) 2804 m->dirty &= ~pagebits; 2805 else 2806 vm_page_clear_dirty_mask(m, pagebits); 2807 } 2808 2809 void 2810 vm_page_clear_dirty(vm_page_t m, int base, int size) 2811 { 2812 2813 vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); 2814 } 2815 2816 /* 2817 * vm_page_set_invalid: 2818 * 2819 * Invalidates DEV_BSIZE'd chunks within a page. Both the 2820 * valid and dirty bits for the effected areas are cleared. 2821 * 2822 * May not block. 
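 *
 * For example, vm_page_set_invalid(m, 0, DEV_BSIZE) clears the valid
 * and dirty bits of the first block only; if the page was entirely
 * valid beforehand it is first removed from all pmaps.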
2823 */ 2824 void 2825 vm_page_set_invalid(vm_page_t m, int base, int size) 2826 { 2827 vm_page_bits_t bits; 2828 2829 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2830 KASSERT((m->oflags & VPO_BUSY) == 0, 2831 ("vm_page_set_invalid: page %p is busy", m)); 2832 bits = vm_page_bits(base, size); 2833 if (m->valid == VM_PAGE_BITS_ALL && bits != 0) 2834 pmap_remove_all(m); 2835 KASSERT(!pmap_page_is_mapped(m), 2836 ("vm_page_set_invalid: page %p is mapped", m)); 2837 m->valid &= ~bits; 2838 m->dirty &= ~bits; 2839 } 2840 2841 /* 2842 * vm_page_zero_invalid() 2843 * 2844 * The kernel assumes that the invalid portions of a page contain 2845 * garbage, but such pages can be mapped into memory by user code. 2846 * When this occurs, we must zero out the non-valid portions of the 2847 * page so user code sees what it expects. 2848 * 2849 * Pages are most often semi-valid when the end of a file is mapped 2850 * into memory and the file's size is not page aligned. 2851 */ 2852 void 2853 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) 2854 { 2855 int b; 2856 int i; 2857 2858 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2859 /* 2860 * Scan the valid bits looking for invalid sections that 2861 * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the 2862 * valid bit may be set ) have already been zerod by 2863 * vm_page_set_validclean(). 2864 */ 2865 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { 2866 if (i == (PAGE_SIZE / DEV_BSIZE) || 2867 (m->valid & ((vm_page_bits_t)1 << i))) { 2868 if (i > b) { 2869 pmap_zero_page_area(m, 2870 b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); 2871 } 2872 b = i + 1; 2873 } 2874 } 2875 2876 /* 2877 * setvalid is TRUE when we can safely set the zero'd areas 2878 * as being valid. We can do this if there are no cache consistancy 2879 * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. 2880 */ 2881 if (setvalid) 2882 m->valid = VM_PAGE_BITS_ALL; 2883 } 2884 2885 /* 2886 * vm_page_is_valid: 2887 * 2888 * Is (partial) page valid? Note that the case where size == 0 2889 * will return FALSE in the degenerate case where the page is 2890 * entirely invalid, and TRUE otherwise. 2891 * 2892 * May not block. 2893 */ 2894 int 2895 vm_page_is_valid(vm_page_t m, int base, int size) 2896 { 2897 vm_page_bits_t bits; 2898 2899 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2900 bits = vm_page_bits(base, size); 2901 if (m->valid && ((m->valid & bits) == bits)) 2902 return 1; 2903 else 2904 return 0; 2905 } 2906 2907 /* 2908 * update dirty bits from pmap/mmu. May not block. 2909 */ 2910 void 2911 vm_page_test_dirty(vm_page_t m) 2912 { 2913 2914 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2915 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) 2916 vm_page_dirty(m); 2917 } 2918 2919 void 2920 vm_page_lock_KBI(vm_page_t m, const char *file, int line) 2921 { 2922 2923 mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); 2924 } 2925 2926 void 2927 vm_page_unlock_KBI(vm_page_t m, const char *file, int line) 2928 { 2929 2930 mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); 2931 } 2932 2933 int 2934 vm_page_trylock_KBI(vm_page_t m, const char *file, int line) 2935 { 2936 2937 return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); 2938 } 2939 2940 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) 2941 void 2942 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) 2943 { 2944 2945 mtx_assert_(vm_page_lockptr(m), a, file, line); 2946 } 2947 #endif 2948 2949 int so_zerocp_fullpage = 0; 2950 2951 /* 2952 * Replace the given page with a copy. 
The copied page assumes 2953 * the portion of the given page's "wire_count" that is not the 2954 * responsibility of this copy-on-write mechanism. 2955 * 2956 * The object containing the given page must have a non-zero 2957 * paging-in-progress count and be locked. 2958 */ 2959 void 2960 vm_page_cowfault(vm_page_t m) 2961 { 2962 vm_page_t mnew; 2963 vm_object_t object; 2964 vm_pindex_t pindex; 2965 2966 mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED); 2967 vm_page_lock_assert(m, MA_OWNED); 2968 object = m->object; 2969 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2970 KASSERT(object->paging_in_progress != 0, 2971 ("vm_page_cowfault: object %p's paging-in-progress count is zero.", 2972 object)); 2973 pindex = m->pindex; 2974 2975 retry_alloc: 2976 pmap_remove_all(m); 2977 vm_page_remove(m); 2978 mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); 2979 if (mnew == NULL) { 2980 vm_page_insert(m, object, pindex); 2981 vm_page_unlock(m); 2982 VM_OBJECT_UNLOCK(object); 2983 VM_WAIT; 2984 VM_OBJECT_LOCK(object); 2985 if (m == vm_page_lookup(object, pindex)) { 2986 vm_page_lock(m); 2987 goto retry_alloc; 2988 } else { 2989 /* 2990 * Page disappeared during the wait. 2991 */ 2992 return; 2993 } 2994 } 2995 2996 if (m->cow == 0) { 2997 /* 2998 * check to see if we raced with an xmit complete when 2999 * waiting to allocate a page. If so, put things back 3000 * the way they were 3001 */ 3002 vm_page_unlock(m); 3003 vm_page_lock(mnew); 3004 vm_page_free(mnew); 3005 vm_page_unlock(mnew); 3006 vm_page_insert(m, object, pindex); 3007 } else { /* clear COW & copy page */ 3008 if (!so_zerocp_fullpage) 3009 pmap_copy_page(m, mnew); 3010 mnew->valid = VM_PAGE_BITS_ALL; 3011 vm_page_dirty(mnew); 3012 mnew->wire_count = m->wire_count - m->cow; 3013 m->wire_count = m->cow; 3014 vm_page_unlock(m); 3015 } 3016 } 3017 3018 void 3019 vm_page_cowclear(vm_page_t m) 3020 { 3021 3022 vm_page_lock_assert(m, MA_OWNED); 3023 if (m->cow) { 3024 m->cow--; 3025 /* 3026 * let vm_fault add back write permission lazily 3027 */ 3028 } 3029 /* 3030 * sf_buf_free() will free the page, so we needn't do it here 3031 */ 3032 } 3033 3034 int 3035 vm_page_cowsetup(vm_page_t m) 3036 { 3037 3038 vm_page_lock_assert(m, MA_OWNED); 3039 if ((m->flags & PG_FICTITIOUS) != 0 || 3040 (m->oflags & VPO_UNMANAGED) != 0 || 3041 m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object)) 3042 return (EBUSY); 3043 m->cow++; 3044 pmap_remove_write(m); 3045 VM_OBJECT_UNLOCK(m->object); 3046 return (0); 3047 } 3048 3049 #ifdef INVARIANTS 3050 void 3051 vm_page_object_lock_assert(vm_page_t m) 3052 { 3053 3054 /* 3055 * Certain of the page's fields may only be modified by the 3056 * holder of the containing object's lock or the setter of the 3057 * page's VPO_BUSY flag. Unfortunately, the setter of the 3058 * VPO_BUSY flag is not recorded, and thus cannot be checked 3059 * here. 
3060 */ 3061 if (m->object != NULL && (m->oflags & VPO_BUSY) == 0) 3062 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 3063 } 3064 #endif 3065 3066 #include "opt_ddb.h" 3067 #ifdef DDB 3068 #include <sys/kernel.h> 3069 3070 #include <ddb/ddb.h> 3071 3072 DB_SHOW_COMMAND(page, vm_page_print_page_info) 3073 { 3074 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); 3075 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); 3076 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); 3077 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); 3078 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); 3079 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); 3080 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); 3081 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); 3082 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); 3083 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); 3084 } 3085 3086 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) 3087 { 3088 3089 db_printf("PQ_FREE:"); 3090 db_printf(" %d", cnt.v_free_count); 3091 db_printf("\n"); 3092 3093 db_printf("PQ_CACHE:"); 3094 db_printf(" %d", cnt.v_cache_count); 3095 db_printf("\n"); 3096 3097 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", 3098 *vm_page_queues[PQ_ACTIVE].cnt, 3099 *vm_page_queues[PQ_INACTIVE].cnt); 3100 } 3101 #endif /* DDB */ 3102
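
#if 0
/*
 * Illustrative sketch only -- not part of the original sources and never
 * compiled.  It shows the usual pattern for grabbing a busied page from a
 * locked object with vm_page_grab() and releasing it again.  The function
 * name is hypothetical and the object "obj" and index "idx" are assumed
 * to be supplied by the caller.
 */
static void
vm_page_grab_example(vm_object_t obj, vm_pindex_t idx)
{
	vm_page_t m;

	VM_OBJECT_LOCK(obj);
	/*
	 * VM_ALLOC_RETRY is mandatory for vm_page_grab(); VM_ALLOC_ZERO
	 * requests a zeroed page when one has to be allocated.
	 */
	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
	    VM_ALLOC_ZERO);
	/* ... use the page; it was returned with VPO_BUSY set ... */
	vm_page_wakeup(m);
	VM_OBJECT_UNLOCK(obj);
}
#endif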