/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 */

/*-
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	GENERAL RULES ON VM_PAGE MANIPULATION
 *
 *	- a pageq mutex is required when adding or removing a page from a
 *	  page queue (vm_page_queue[]), regardless of other mutexes or the
 *	  busy state of a page.
 *
 *	- The object mutex is held when inserting or removing
 *	  pages from an object (vm_page_insert() or vm_page_remove()).
 *
 */

/*
 *	Resident memory management module.
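 *
 *	Illustrative sketch of the locking rules above (an assumed typical
 *	usage pattern, not a code path copied from this file): page queue
 *	membership is changed under the page queue lock, and object
 *	membership under the object lock, e.g.
 *
 *		vm_page_lock_queues();
 *		vm_page_enqueue(PQ_INACTIVE, m);
 *		vm_page_unlock_queues();
 *
 *		VM_OBJECT_LOCK(object);
 *		vm_page_insert(m, object, pindex);
 *		VM_OBJECT_UNLOCK(object);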
77 */ 78 79 #include <sys/cdefs.h> 80 __FBSDID("$FreeBSD$"); 81 82 #include "opt_vm.h" 83 84 #include <sys/param.h> 85 #include <sys/systm.h> 86 #include <sys/lock.h> 87 #include <sys/kernel.h> 88 #include <sys/limits.h> 89 #include <sys/malloc.h> 90 #include <sys/msgbuf.h> 91 #include <sys/mutex.h> 92 #include <sys/proc.h> 93 #include <sys/sysctl.h> 94 #include <sys/vmmeter.h> 95 #include <sys/vnode.h> 96 97 #include <vm/vm.h> 98 #include <vm/pmap.h> 99 #include <vm/vm_param.h> 100 #include <vm/vm_kern.h> 101 #include <vm/vm_object.h> 102 #include <vm/vm_page.h> 103 #include <vm/vm_pageout.h> 104 #include <vm/vm_pager.h> 105 #include <vm/vm_phys.h> 106 #include <vm/vm_reserv.h> 107 #include <vm/vm_extern.h> 108 #include <vm/uma.h> 109 #include <vm/uma_int.h> 110 111 #include <machine/md_var.h> 112 113 /* 114 * Associated with page of user-allocatable memory is a 115 * page structure. 116 */ 117 118 struct vpgqueues vm_page_queues[PQ_COUNT]; 119 struct vpglocks vm_page_queue_lock; 120 struct vpglocks vm_page_queue_free_lock; 121 122 struct vpglocks pa_lock[PA_LOCK_COUNT]; 123 124 vm_page_t vm_page_array = 0; 125 int vm_page_array_size = 0; 126 long first_page = 0; 127 int vm_page_zero_count = 0; 128 129 static int boot_pages = UMA_BOOT_PAGES; 130 TUNABLE_INT("vm.boot_pages", &boot_pages); 131 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0, 132 "number of pages allocated for bootstrapping the VM system"); 133 134 int pa_tryrelock_restart; 135 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, 136 &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); 137 138 static uma_zone_t fakepg_zone; 139 140 static struct vnode *vm_page_alloc_init(vm_page_t m); 141 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); 142 static void vm_page_queue_remove(int queue, vm_page_t m); 143 static void vm_page_enqueue(int queue, vm_page_t m); 144 static void vm_page_init_fakepg(void *dummy); 145 146 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL); 147 148 static void 149 vm_page_init_fakepg(void *dummy) 150 { 151 152 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, 153 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 154 } 155 156 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ 157 #if PAGE_SIZE == 32768 158 #ifdef CTASSERT 159 CTASSERT(sizeof(u_long) >= 8); 160 #endif 161 #endif 162 163 /* 164 * Try to acquire a physical address lock while a pmap is locked. If we 165 * fail to trylock we unlock and lock the pmap directly and cache the 166 * locked pa in *locked. The caller should then restart their loop in case 167 * the virtual to physical mapping has changed. 168 */ 169 int 170 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) 171 { 172 vm_paddr_t lockpa; 173 174 lockpa = *locked; 175 *locked = pa; 176 if (lockpa) { 177 PA_LOCK_ASSERT(lockpa, MA_OWNED); 178 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) 179 return (0); 180 PA_UNLOCK(lockpa); 181 } 182 if (PA_TRYLOCK(pa)) 183 return (0); 184 PMAP_UNLOCK(pmap); 185 atomic_add_int(&pa_tryrelock_restart, 1); 186 PA_LOCK(pa); 187 PMAP_LOCK(pmap); 188 return (EAGAIN); 189 } 190 191 /* 192 * vm_set_page_size: 193 * 194 * Sets the page size, perhaps based upon the memory 195 * size. Must be called before any use of page-size 196 * dependent functions. 
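 *
 *	The power-of-two check below relies on the usual bit trick: for any
 *	power of two x, x & (x - 1) == 0.  For example, with a 4096-byte
 *	page, 0x1000 & 0x0fff == 0, whereas a bogus size such as 4000
 *	(0xfa0 & 0xf9f != 0) would fail the check and panic.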
197 */ 198 void 199 vm_set_page_size(void) 200 { 201 if (cnt.v_page_size == 0) 202 cnt.v_page_size = PAGE_SIZE; 203 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0) 204 panic("vm_set_page_size: page size not a power of two"); 205 } 206 207 /* 208 * vm_page_blacklist_lookup: 209 * 210 * See if a physical address in this page has been listed 211 * in the blacklist tunable. Entries in the tunable are 212 * separated by spaces or commas. If an invalid integer is 213 * encountered then the rest of the string is skipped. 214 */ 215 static int 216 vm_page_blacklist_lookup(char *list, vm_paddr_t pa) 217 { 218 vm_paddr_t bad; 219 char *cp, *pos; 220 221 for (pos = list; *pos != '\0'; pos = cp) { 222 bad = strtoq(pos, &cp, 0); 223 if (*cp != '\0') { 224 if (*cp == ' ' || *cp == ',') { 225 cp++; 226 if (cp == pos) 227 continue; 228 } else 229 break; 230 } 231 if (pa == trunc_page(bad)) 232 return (1); 233 } 234 return (0); 235 } 236 237 /* 238 * vm_page_startup: 239 * 240 * Initializes the resident memory module. 241 * 242 * Allocates memory for the page cells, and 243 * for the object/offset-to-page hash table headers. 244 * Each page cell is initialized and placed on the free list. 245 */ 246 vm_offset_t 247 vm_page_startup(vm_offset_t vaddr) 248 { 249 vm_offset_t mapped; 250 vm_paddr_t page_range; 251 vm_paddr_t new_end; 252 int i; 253 vm_paddr_t pa; 254 vm_paddr_t last_pa; 255 char *list; 256 257 /* the biggest memory array is the second group of pages */ 258 vm_paddr_t end; 259 vm_paddr_t biggestsize; 260 vm_paddr_t low_water, high_water; 261 int biggestone; 262 263 biggestsize = 0; 264 biggestone = 0; 265 vaddr = round_page(vaddr); 266 267 for (i = 0; phys_avail[i + 1]; i += 2) { 268 phys_avail[i] = round_page(phys_avail[i]); 269 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 270 } 271 272 low_water = phys_avail[0]; 273 high_water = phys_avail[1]; 274 275 for (i = 0; phys_avail[i + 1]; i += 2) { 276 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; 277 278 if (size > biggestsize) { 279 biggestone = i; 280 biggestsize = size; 281 } 282 if (phys_avail[i] < low_water) 283 low_water = phys_avail[i]; 284 if (phys_avail[i + 1] > high_water) 285 high_water = phys_avail[i + 1]; 286 } 287 288 #ifdef XEN 289 low_water = 0; 290 #endif 291 292 end = phys_avail[biggestone+1]; 293 294 /* 295 * Initialize the locks. 296 */ 297 mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF | 298 MTX_RECURSE); 299 mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL, 300 MTX_DEF); 301 302 /* Setup page locks. */ 303 for (i = 0; i < PA_LOCK_COUNT; i++) 304 mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF); 305 306 /* 307 * Initialize the queue headers for the hold queue, the active queue, 308 * and the inactive queue. 309 */ 310 for (i = 0; i < PQ_COUNT; i++) 311 TAILQ_INIT(&vm_page_queues[i].pl); 312 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count; 313 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count; 314 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count; 315 316 /* 317 * Allocate memory for use when boot strapping the kernel memory 318 * allocator. 
319 */ 320 new_end = end - (boot_pages * UMA_SLAB_SIZE); 321 new_end = trunc_page(new_end); 322 mapped = pmap_map(&vaddr, new_end, end, 323 VM_PROT_READ | VM_PROT_WRITE); 324 bzero((void *)mapped, end - new_end); 325 uma_startup((void *)mapped, boot_pages); 326 327 #if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \ 328 defined(__mips__) 329 /* 330 * Allocate a bitmap to indicate that a random physical page 331 * needs to be included in a minidump. 332 * 333 * The amd64 port needs this to indicate which direct map pages 334 * need to be dumped, via calls to dump_add_page()/dump_drop_page(). 335 * 336 * However, i386 still needs this workspace internally within the 337 * minidump code. In theory, they are not needed on i386, but are 338 * included should the sf_buf code decide to use them. 339 */ 340 last_pa = 0; 341 for (i = 0; dump_avail[i + 1] != 0; i += 2) 342 if (dump_avail[i + 1] > last_pa) 343 last_pa = dump_avail[i + 1]; 344 page_range = last_pa / PAGE_SIZE; 345 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); 346 new_end -= vm_page_dump_size; 347 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, 348 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); 349 bzero((void *)vm_page_dump, vm_page_dump_size); 350 #endif 351 #ifdef __amd64__ 352 /* 353 * Request that the physical pages underlying the message buffer be 354 * included in a crash dump. Since the message buffer is accessed 355 * through the direct map, they are not automatically included. 356 */ 357 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); 358 last_pa = pa + round_page(msgbufsize); 359 while (pa < last_pa) { 360 dump_add_page(pa); 361 pa += PAGE_SIZE; 362 } 363 #endif 364 /* 365 * Compute the number of pages of memory that will be available for 366 * use (taking into account the overhead of a page structure per 367 * page). 368 */ 369 first_page = low_water / PAGE_SIZE; 370 #ifdef VM_PHYSSEG_SPARSE 371 page_range = 0; 372 for (i = 0; phys_avail[i + 1] != 0; i += 2) 373 page_range += atop(phys_avail[i + 1] - phys_avail[i]); 374 #elif defined(VM_PHYSSEG_DENSE) 375 page_range = high_water / PAGE_SIZE - first_page; 376 #else 377 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." 378 #endif 379 end = new_end; 380 381 /* 382 * Reserve an unmapped guard page to trap access to vm_page_array[-1]. 383 */ 384 vaddr += PAGE_SIZE; 385 386 /* 387 * Initialize the mem entry structures now, and put them in the free 388 * queue. 389 */ 390 new_end = trunc_page(end - page_range * sizeof(struct vm_page)); 391 mapped = pmap_map(&vaddr, new_end, end, 392 VM_PROT_READ | VM_PROT_WRITE); 393 vm_page_array = (vm_page_t) mapped; 394 #if VM_NRESERVLEVEL > 0 395 /* 396 * Allocate memory for the reservation management system's data 397 * structures. 398 */ 399 new_end = vm_reserv_startup(&vaddr, new_end, high_water); 400 #endif 401 #if defined(__amd64__) || defined(__mips__) 402 /* 403 * pmap_map on amd64 and mips can come out of the direct-map, not kvm 404 * like i386, so the pages must be tracked for a crashdump to include 405 * this data. This includes the vm_page_array and the early UMA 406 * bootstrap pages. 
407 */ 408 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE) 409 dump_add_page(pa); 410 #endif 411 phys_avail[biggestone + 1] = new_end; 412 413 /* 414 * Clear all of the page structures 415 */ 416 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); 417 for (i = 0; i < page_range; i++) 418 vm_page_array[i].order = VM_NFREEORDER; 419 vm_page_array_size = page_range; 420 421 /* 422 * Initialize the physical memory allocator. 423 */ 424 vm_phys_init(); 425 426 /* 427 * Add every available physical page that is not blacklisted to 428 * the free lists. 429 */ 430 cnt.v_page_count = 0; 431 cnt.v_free_count = 0; 432 list = getenv("vm.blacklist"); 433 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 434 pa = phys_avail[i]; 435 last_pa = phys_avail[i + 1]; 436 while (pa < last_pa) { 437 if (list != NULL && 438 vm_page_blacklist_lookup(list, pa)) 439 printf("Skipping page with pa 0x%jx\n", 440 (uintmax_t)pa); 441 else 442 vm_phys_add_page(pa); 443 pa += PAGE_SIZE; 444 } 445 } 446 freeenv(list); 447 #if VM_NRESERVLEVEL > 0 448 /* 449 * Initialize the reservation management system. 450 */ 451 vm_reserv_init(); 452 #endif 453 return (vaddr); 454 } 455 456 457 CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0); 458 459 void 460 vm_page_aflag_set(vm_page_t m, uint8_t bits) 461 { 462 uint32_t *addr, val; 463 464 /* 465 * The PGA_WRITEABLE flag can only be set if the page is managed and 466 * VPO_BUSY. Currently, this flag is only set by pmap_enter(). 467 */ 468 KASSERT((bits & PGA_WRITEABLE) == 0 || 469 (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY, 470 ("PGA_WRITEABLE and !VPO_BUSY")); 471 472 /* 473 * We want to use atomic updates for m->aflags, which is a 474 * byte wide. Not all architectures provide atomic operations 475 * on the single-byte destination. Punt and access the whole 476 * 4-byte word with an atomic update. Parallel non-atomic 477 * updates to the fields included in the update by proximity 478 * are handled properly by atomics. 479 */ 480 addr = (void *)&m->aflags; 481 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0); 482 val = bits; 483 #if BYTE_ORDER == BIG_ENDIAN 484 val <<= 24; 485 #endif 486 atomic_set_32(addr, val); 487 } 488 489 void 490 vm_page_aflag_clear(vm_page_t m, uint8_t bits) 491 { 492 uint32_t *addr, val; 493 494 /* 495 * The PGA_REFERENCED flag can only be cleared if the object 496 * containing the page is locked. 497 */ 498 KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object), 499 ("PGA_REFERENCED and !VM_OBJECT_LOCKED")); 500 501 /* 502 * See the comment in vm_page_aflag_set(). 503 */ 504 addr = (void *)&m->aflags; 505 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0); 506 val = bits; 507 #if BYTE_ORDER == BIG_ENDIAN 508 val <<= 24; 509 #endif 510 atomic_clear_32(addr, val); 511 } 512 513 void 514 vm_page_reference(vm_page_t m) 515 { 516 517 vm_page_aflag_set(m, PGA_REFERENCED); 518 } 519 520 void 521 vm_page_busy(vm_page_t m) 522 { 523 524 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 525 KASSERT((m->oflags & VPO_BUSY) == 0, 526 ("vm_page_busy: page already busy!!!")); 527 m->oflags |= VPO_BUSY; 528 } 529 530 /* 531 * vm_page_flash: 532 * 533 * wakeup anyone waiting for the page. 534 */ 535 void 536 vm_page_flash(vm_page_t m) 537 { 538 539 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 540 if (m->oflags & VPO_WANTED) { 541 m->oflags &= ~VPO_WANTED; 542 wakeup(m); 543 } 544 } 545 546 /* 547 * vm_page_wakeup: 548 * 549 * clear the VPO_BUSY flag and wakeup anyone waiting for the 550 * page. 
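 *
 *	Illustrative pairing with vm_page_busy() (an assumed typical caller
 *	pattern, not code taken from this file): the page is busied under
 *	the object lock, the lock may then be dropped across blocking work,
 *	and the page is unbusied once the work is done:
 *
 *		vm_page_busy(m);
 *		VM_OBJECT_UNLOCK(object);
 *		... perform I/O or other blocking work on the page ...
 *		VM_OBJECT_LOCK(object);
 *		vm_page_wakeup(m);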
 *
 */
void
vm_page_wakeup(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
	m->oflags &= ~VPO_BUSY;
	vm_page_flash(m);
}

void
vm_page_io_start(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	m->busy++;
}

void
vm_page_io_finish(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
	m->busy--;
	if (m->busy == 0)
		vm_page_flash(m);
}

/*
 * Keep the page from being freed by the page daemon.  This has much
 * the same effect as wiring, but with much lower overhead, and should
 * be used only for *very* temporary holding ("wiring").
 */
void
vm_page_hold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
	mem->hold_count++;
}

void
vm_page_unhold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
	--mem->hold_count;
	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
	if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
		vm_page_free_toq(mem);
}

/*
 *	vm_page_unhold_pages:
 *
 *	Unhold each of the pages that is referenced by the given array.
 */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{
	struct mtx *mtx, *new_mtx;

	mtx = NULL;
	for (; count != 0; count--) {
		/*
		 * Avoid releasing and reacquiring the same page lock.
		 */
		new_mtx = vm_page_lockptr(*ma);
		if (mtx != new_mtx) {
			if (mtx != NULL)
				mtx_unlock(mtx);
			mtx = new_mtx;
			mtx_lock(mtx);
		}
		vm_page_unhold(*ma);
		ma++;
	}
	if (mtx != NULL)
		mtx_unlock(mtx);
}

/*
 *	vm_page_getfake:
 *
 *	Create a fictitious page with the specified physical address and
 *	memory attribute.  The memory attribute is the only machine-
 *	dependent aspect of a fictitious page that must be initialized.
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
	vm_page_t m;

	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
	m->phys_addr = paddr;
	m->queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	m->flags = PG_FICTITIOUS;
	/* Fictitious pages don't use "order" or "pool". */
	m->oflags = VPO_BUSY | VPO_UNMANAGED;
	m->wire_count = 1;
	pmap_page_set_memattr(m, memattr);
	return (m);
}

/*
 *	vm_page_putfake:
 *
 *	Release a fictitious page.
 */
void
vm_page_putfake(vm_page_t m)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_putfake: bad page %p", m));
	uma_zfree(fakepg_zone, m);
}

/*
 *	vm_page_updatefake:
 *
 *	Update the given fictitious page to the specified physical address and
 *	memory attribute.
 */
void
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_updatefake: bad page %p", m));
	m->phys_addr = paddr;
	pmap_page_set_memattr(m, memattr);
}

/*
 *	vm_page_free:
 *
 *	Free a page.
 */
void
vm_page_free(vm_page_t m)
{

	m->flags &= ~PG_ZERO;
	vm_page_free_toq(m);
}

/*
 *	vm_page_free_zero:
 *
 *	Free a page to the zeroed-pages queue.
 */
void
vm_page_free_zero(vm_page_t m)
{

	m->flags |= PG_ZERO;
	vm_page_free_toq(m);
}

/*
 *	vm_page_sleep:
 *
 *	Sleep and release the page and page queues locks.
 *
 *	The object containing the given page must be locked.
 */
void
vm_page_sleep(vm_page_t m, const char *msg)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (mtx_owned(&vm_page_queue_mtx))
		vm_page_unlock_queues();
	if (mtx_owned(vm_page_lockptr(m)))
		vm_page_unlock(m);

	/*
	 * It's possible that while we sleep, the page will get
	 * unbusied and freed.  If we are holding the object
	 * lock, we will assume we hold a reference to the object
	 * such that even if m->object changes, we can re-lock
	 * it.
	 */
	m->oflags |= VPO_WANTED;
	msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
}

/*
 *	vm_page_dirty:
 *
 *	Set all bits in the page's dirty field.
 *
 *	The object containing the specified page must be locked if the
 *	call is made from the machine-independent layer.
 *
 *	See vm_page_clear_dirty_mask().
 */
void
vm_page_dirty(vm_page_t m)
{

	KASSERT((m->flags & PG_CACHED) == 0,
	    ("vm_page_dirty: page in cache!"));
	KASSERT(!VM_PAGE_IS_FREE(m),
	    ("vm_page_dirty: page is free!"));
	KASSERT(m->valid == VM_PAGE_BITS_ALL,
	    ("vm_page_dirty: page is invalid!"));
	m->dirty = VM_PAGE_BITS_ALL;
}

/*
 *	vm_page_splay:
 *
 *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
 *	the vm_page containing the given pindex.  If, however, that
 *	pindex is not found in the vm_object, returns a vm_page that is
 *	adjacent to the pindex, coming before or after it.
 */
vm_page_t
vm_page_splay(vm_pindex_t pindex, vm_page_t root)
{
	struct vm_page dummy;
	vm_page_t lefttreemax, righttreemin, y;

	if (root == NULL)
		return (root);
	lefttreemax = righttreemin = &dummy;
	for (;; root = y) {
		if (pindex < root->pindex) {
			if ((y = root->left) == NULL)
				break;
			if (pindex < y->pindex) {
				/* Rotate right. */
				root->left = y->right;
				y->right = root;
				root = y;
				if ((y = root->left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->left = root;
			righttreemin = root;
		} else if (pindex > root->pindex) {
			if ((y = root->right) == NULL)
				break;
			if (pindex > y->pindex) {
				/* Rotate left. */
				root->right = y->left;
				y->left = root;
				root = y;
				if ((y = root->right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->right = root;
			lefttreemax = root;
		} else
			break;
	}
	/* Assemble the new root. */
	lefttreemax->right = root->left;
	righttreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}

/*
 *	vm_page_insert:		[ internal use only ]
 *
 *	Inserts the given mem entry into the object and object list.
 *
 *	The pagetables are not updated but will presumably fault the page
 *	in if necessary, or if a kernel page the caller will at some point
 *	enter the page into the kernel's pmap.  We are not allowed to block
 *	here so we *can't* do this anyway.
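 *
 *	(Descriptive note: an object's resident pages are kept both in a
 *	splay tree rooted at object->root and in the object->memq list.
 *	The code below first splays the tree around "pindex" so that the
 *	closest existing page becomes the root, then links the new page
 *	beside it in both structures.)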
833 * 834 * The object and page must be locked. 835 * This routine may not block. 836 */ 837 void 838 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) 839 { 840 vm_page_t root; 841 842 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 843 if (m->object != NULL) 844 panic("vm_page_insert: page already inserted"); 845 846 /* 847 * Record the object/offset pair in this page 848 */ 849 m->object = object; 850 m->pindex = pindex; 851 852 /* 853 * Now link into the object's ordered list of backed pages. 854 */ 855 root = object->root; 856 if (root == NULL) { 857 m->left = NULL; 858 m->right = NULL; 859 TAILQ_INSERT_TAIL(&object->memq, m, listq); 860 } else { 861 root = vm_page_splay(pindex, root); 862 if (pindex < root->pindex) { 863 m->left = root->left; 864 m->right = root; 865 root->left = NULL; 866 TAILQ_INSERT_BEFORE(root, m, listq); 867 } else if (pindex == root->pindex) 868 panic("vm_page_insert: offset already allocated"); 869 else { 870 m->right = root->right; 871 m->left = root; 872 root->right = NULL; 873 TAILQ_INSERT_AFTER(&object->memq, root, m, listq); 874 } 875 } 876 object->root = m; 877 878 /* 879 * show that the object has one more resident page. 880 */ 881 object->resident_page_count++; 882 /* 883 * Hold the vnode until the last page is released. 884 */ 885 if (object->resident_page_count == 1 && object->type == OBJT_VNODE) 886 vhold((struct vnode *)object->handle); 887 888 /* 889 * Since we are inserting a new and possibly dirty page, 890 * update the object's OBJ_MIGHTBEDIRTY flag. 891 */ 892 if (m->aflags & PGA_WRITEABLE) 893 vm_object_set_writeable_dirty(object); 894 } 895 896 /* 897 * vm_page_remove: 898 * NOTE: used by device pager as well -wfj 899 * 900 * Removes the given mem entry from the object/offset-page 901 * table and the object page list, but do not invalidate/terminate 902 * the backing store. 903 * 904 * The object and page must be locked. 905 * The underlying pmap entry (if any) is NOT removed here. 906 * This routine may not block. 907 */ 908 void 909 vm_page_remove(vm_page_t m) 910 { 911 vm_object_t object; 912 vm_page_t next, prev, root; 913 914 if ((m->oflags & VPO_UNMANAGED) == 0) 915 vm_page_lock_assert(m, MA_OWNED); 916 if ((object = m->object) == NULL) 917 return; 918 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 919 if (m->oflags & VPO_BUSY) { 920 m->oflags &= ~VPO_BUSY; 921 vm_page_flash(m); 922 } 923 924 /* 925 * Now remove from the object's list of backed pages. 926 */ 927 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { 928 /* 929 * Since the page's successor in the list is also its parent 930 * in the tree, its right subtree must be empty. 931 */ 932 next->left = m->left; 933 KASSERT(m->right == NULL, 934 ("vm_page_remove: page %p has right child", m)); 935 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && 936 prev->right == m) { 937 /* 938 * Since the page's predecessor in the list is also its parent 939 * in the tree, its left subtree must be empty. 940 */ 941 KASSERT(m->left == NULL, 942 ("vm_page_remove: page %p has left child", m)); 943 prev->right = m->right; 944 } else { 945 if (m != object->root) 946 vm_page_splay(m->pindex, object->root); 947 if (m->left == NULL) 948 root = m->right; 949 else if (m->right == NULL) 950 root = m->left; 951 else { 952 /* 953 * Move the page's successor to the root, because 954 * pages are usually removed in ascending order. 
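			 * (For example, when an object is being torn down
			 * its pages are typically removed in increasing
			 * pindex order, so the successor is the page most
			 * likely to be looked up next; making it the new
			 * root keeps that next removal cheap.)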
955 */ 956 if (m->right != next) 957 vm_page_splay(m->pindex, m->right); 958 next->left = m->left; 959 root = next; 960 } 961 object->root = root; 962 } 963 TAILQ_REMOVE(&object->memq, m, listq); 964 965 /* 966 * And show that the object has one fewer resident page. 967 */ 968 object->resident_page_count--; 969 /* 970 * The vnode may now be recycled. 971 */ 972 if (object->resident_page_count == 0 && object->type == OBJT_VNODE) 973 vdrop((struct vnode *)object->handle); 974 975 m->object = NULL; 976 } 977 978 /* 979 * vm_page_lookup: 980 * 981 * Returns the page associated with the object/offset 982 * pair specified; if none is found, NULL is returned. 983 * 984 * The object must be locked. 985 * This routine may not block. 986 * This is a critical path routine 987 */ 988 vm_page_t 989 vm_page_lookup(vm_object_t object, vm_pindex_t pindex) 990 { 991 vm_page_t m; 992 993 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 994 if ((m = object->root) != NULL && m->pindex != pindex) { 995 m = vm_page_splay(pindex, m); 996 if ((object->root = m)->pindex != pindex) 997 m = NULL; 998 } 999 return (m); 1000 } 1001 1002 /* 1003 * vm_page_find_least: 1004 * 1005 * Returns the page associated with the object with least pindex 1006 * greater than or equal to the parameter pindex, or NULL. 1007 * 1008 * The object must be locked. 1009 * The routine may not block. 1010 */ 1011 vm_page_t 1012 vm_page_find_least(vm_object_t object, vm_pindex_t pindex) 1013 { 1014 vm_page_t m; 1015 1016 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1017 if ((m = TAILQ_FIRST(&object->memq)) != NULL) { 1018 if (m->pindex < pindex) { 1019 m = vm_page_splay(pindex, object->root); 1020 if ((object->root = m)->pindex < pindex) 1021 m = TAILQ_NEXT(m, listq); 1022 } 1023 } 1024 return (m); 1025 } 1026 1027 /* 1028 * Returns the given page's successor (by pindex) within the object if it is 1029 * resident; if none is found, NULL is returned. 1030 * 1031 * The object must be locked. 1032 */ 1033 vm_page_t 1034 vm_page_next(vm_page_t m) 1035 { 1036 vm_page_t next; 1037 1038 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1039 if ((next = TAILQ_NEXT(m, listq)) != NULL && 1040 next->pindex != m->pindex + 1) 1041 next = NULL; 1042 return (next); 1043 } 1044 1045 /* 1046 * Returns the given page's predecessor (by pindex) within the object if it is 1047 * resident; if none is found, NULL is returned. 1048 * 1049 * The object must be locked. 1050 */ 1051 vm_page_t 1052 vm_page_prev(vm_page_t m) 1053 { 1054 vm_page_t prev; 1055 1056 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1057 if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && 1058 prev->pindex != m->pindex - 1) 1059 prev = NULL; 1060 return (prev); 1061 } 1062 1063 /* 1064 * vm_page_rename: 1065 * 1066 * Move the given memory entry from its 1067 * current object to the specified target object/offset. 1068 * 1069 * The object must be locked. 1070 * This routine may not block. 1071 * 1072 * Note: swap associated with the page must be invalidated by the move. We 1073 * have to do this for several reasons: (1) we aren't freeing the 1074 * page, (2) we are dirtying the page, (3) the VM system is probably 1075 * moving the page from object A to B, and will then later move 1076 * the backing store from A to B and we can't have a conflict. 1077 * 1078 * Note: we *always* dirty the page. It is necessary both for the 1079 * fact that we moved it, and because we may be invalidating 1080 * swap. If the page is on the cache, we have to deactivate it 1081 * or vm_page_dirty() will panic. 
Dirty pages are not allowed 1082 * on the cache. 1083 */ 1084 void 1085 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) 1086 { 1087 1088 vm_page_remove(m); 1089 vm_page_insert(m, new_object, new_pindex); 1090 vm_page_dirty(m); 1091 } 1092 1093 /* 1094 * Convert all of the given object's cached pages that have a 1095 * pindex within the given range into free pages. If the value 1096 * zero is given for "end", then the range's upper bound is 1097 * infinity. If the given object is backed by a vnode and it 1098 * transitions from having one or more cached pages to none, the 1099 * vnode's hold count is reduced. 1100 */ 1101 void 1102 vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) 1103 { 1104 vm_page_t m, m_next; 1105 boolean_t empty; 1106 1107 mtx_lock(&vm_page_queue_free_mtx); 1108 if (__predict_false(object->cache == NULL)) { 1109 mtx_unlock(&vm_page_queue_free_mtx); 1110 return; 1111 } 1112 m = object->cache = vm_page_splay(start, object->cache); 1113 if (m->pindex < start) { 1114 if (m->right == NULL) 1115 m = NULL; 1116 else { 1117 m_next = vm_page_splay(start, m->right); 1118 m_next->left = m; 1119 m->right = NULL; 1120 m = object->cache = m_next; 1121 } 1122 } 1123 1124 /* 1125 * At this point, "m" is either (1) a reference to the page 1126 * with the least pindex that is greater than or equal to 1127 * "start" or (2) NULL. 1128 */ 1129 for (; m != NULL && (m->pindex < end || end == 0); m = m_next) { 1130 /* 1131 * Find "m"'s successor and remove "m" from the 1132 * object's cache. 1133 */ 1134 if (m->right == NULL) { 1135 object->cache = m->left; 1136 m_next = NULL; 1137 } else { 1138 m_next = vm_page_splay(start, m->right); 1139 m_next->left = m->left; 1140 object->cache = m_next; 1141 } 1142 /* Convert "m" to a free page. */ 1143 m->object = NULL; 1144 m->valid = 0; 1145 /* Clear PG_CACHED and set PG_FREE. */ 1146 m->flags ^= PG_CACHED | PG_FREE; 1147 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, 1148 ("vm_page_cache_free: page %p has inconsistent flags", m)); 1149 cnt.v_cache_count--; 1150 cnt.v_free_count++; 1151 } 1152 empty = object->cache == NULL; 1153 mtx_unlock(&vm_page_queue_free_mtx); 1154 if (object->type == OBJT_VNODE && empty) 1155 vdrop(object->handle); 1156 } 1157 1158 /* 1159 * Returns the cached page that is associated with the given 1160 * object and offset. If, however, none exists, returns NULL. 1161 * 1162 * The free page queue must be locked. 1163 */ 1164 static inline vm_page_t 1165 vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) 1166 { 1167 vm_page_t m; 1168 1169 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1170 if ((m = object->cache) != NULL && m->pindex != pindex) { 1171 m = vm_page_splay(pindex, m); 1172 if ((object->cache = m)->pindex != pindex) 1173 m = NULL; 1174 } 1175 return (m); 1176 } 1177 1178 /* 1179 * Remove the given cached page from its containing object's 1180 * collection of cached pages. 1181 * 1182 * The free page queue must be locked. 
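 *
 * An illustrative caller (vm_page_alloc() below does essentially this
 * when it reactivates a cached page):
 *
 *	mtx_lock(&vm_page_queue_free_mtx);
 *	if ((m = vm_page_cache_lookup(object, pindex)) != NULL)
 *		vm_page_cache_remove(m);
 *	mtx_unlock(&vm_page_queue_free_mtx);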
1183 */ 1184 static void 1185 vm_page_cache_remove(vm_page_t m) 1186 { 1187 vm_object_t object; 1188 vm_page_t root; 1189 1190 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1191 KASSERT((m->flags & PG_CACHED) != 0, 1192 ("vm_page_cache_remove: page %p is not cached", m)); 1193 object = m->object; 1194 if (m != object->cache) { 1195 root = vm_page_splay(m->pindex, object->cache); 1196 KASSERT(root == m, 1197 ("vm_page_cache_remove: page %p is not cached in object %p", 1198 m, object)); 1199 } 1200 if (m->left == NULL) 1201 root = m->right; 1202 else if (m->right == NULL) 1203 root = m->left; 1204 else { 1205 root = vm_page_splay(m->pindex, m->left); 1206 root->right = m->right; 1207 } 1208 object->cache = root; 1209 m->object = NULL; 1210 cnt.v_cache_count--; 1211 } 1212 1213 /* 1214 * Transfer all of the cached pages with offset greater than or 1215 * equal to 'offidxstart' from the original object's cache to the 1216 * new object's cache. However, any cached pages with offset 1217 * greater than or equal to the new object's size are kept in the 1218 * original object. Initially, the new object's cache must be 1219 * empty. Offset 'offidxstart' in the original object must 1220 * correspond to offset zero in the new object. 1221 * 1222 * The new object must be locked. 1223 */ 1224 void 1225 vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, 1226 vm_object_t new_object) 1227 { 1228 vm_page_t m, m_next; 1229 1230 /* 1231 * Insertion into an object's collection of cached pages 1232 * requires the object to be locked. In contrast, removal does 1233 * not. 1234 */ 1235 VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED); 1236 KASSERT(new_object->cache == NULL, 1237 ("vm_page_cache_transfer: object %p has cached pages", 1238 new_object)); 1239 mtx_lock(&vm_page_queue_free_mtx); 1240 if ((m = orig_object->cache) != NULL) { 1241 /* 1242 * Transfer all of the pages with offset greater than or 1243 * equal to 'offidxstart' from the original object's 1244 * cache to the new object's cache. 1245 */ 1246 m = vm_page_splay(offidxstart, m); 1247 if (m->pindex < offidxstart) { 1248 orig_object->cache = m; 1249 new_object->cache = m->right; 1250 m->right = NULL; 1251 } else { 1252 orig_object->cache = m->left; 1253 new_object->cache = m; 1254 m->left = NULL; 1255 } 1256 while ((m = new_object->cache) != NULL) { 1257 if ((m->pindex - offidxstart) >= new_object->size) { 1258 /* 1259 * Return all of the cached pages with 1260 * offset greater than or equal to the 1261 * new object's size to the original 1262 * object's cache. 1263 */ 1264 new_object->cache = m->left; 1265 m->left = orig_object->cache; 1266 orig_object->cache = m; 1267 break; 1268 } 1269 m_next = vm_page_splay(m->pindex, m->right); 1270 /* Update the page's object and offset. */ 1271 m->object = new_object; 1272 m->pindex -= offidxstart; 1273 if (m_next == NULL) 1274 break; 1275 m->right = NULL; 1276 m_next->left = m; 1277 new_object->cache = m_next; 1278 } 1279 KASSERT(new_object->cache == NULL || 1280 new_object->type == OBJT_SWAP, 1281 ("vm_page_cache_transfer: object %p's type is incompatible" 1282 " with cached pages", new_object)); 1283 } 1284 mtx_unlock(&vm_page_queue_free_mtx); 1285 } 1286 1287 /* 1288 * Returns TRUE if a cached page is associated with the given object and 1289 * offset, and FALSE otherwise. 1290 * 1291 * The object must be locked. 
1292 */ 1293 boolean_t 1294 vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) 1295 { 1296 vm_page_t m; 1297 1298 /* 1299 * Insertion into an object's collection of cached pages requires the 1300 * object to be locked. Therefore, if the object is locked and the 1301 * object's collection is empty, there is no need to acquire the free 1302 * page queues lock in order to prove that the specified page doesn't 1303 * exist. 1304 */ 1305 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1306 if (__predict_true(object->cache == NULL)) 1307 return (FALSE); 1308 mtx_lock(&vm_page_queue_free_mtx); 1309 m = vm_page_cache_lookup(object, pindex); 1310 mtx_unlock(&vm_page_queue_free_mtx); 1311 return (m != NULL); 1312 } 1313 1314 /* 1315 * vm_page_alloc: 1316 * 1317 * Allocate and return a page that is associated with the specified 1318 * object and offset pair. By default, this page has the flag VPO_BUSY 1319 * set. 1320 * 1321 * The caller must always specify an allocation class. 1322 * 1323 * allocation classes: 1324 * VM_ALLOC_NORMAL normal process request 1325 * VM_ALLOC_SYSTEM system *really* needs a page 1326 * VM_ALLOC_INTERRUPT interrupt time request 1327 * 1328 * optional allocation flags: 1329 * VM_ALLOC_COUNT(number) the number of additional pages that the caller 1330 * intends to allocate 1331 * VM_ALLOC_IFCACHED return page only if it is cached 1332 * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page 1333 * is cached 1334 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page 1335 * VM_ALLOC_NODUMP do not include the page in a kernel core dump 1336 * VM_ALLOC_NOOBJ page is not associated with an object and 1337 * should not have the flag VPO_BUSY set 1338 * VM_ALLOC_WIRED wire the allocated page 1339 * VM_ALLOC_ZERO prefer a zeroed page 1340 * 1341 * This routine may not sleep. 1342 */ 1343 vm_page_t 1344 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) 1345 { 1346 struct vnode *vp = NULL; 1347 vm_object_t m_object; 1348 vm_page_t m; 1349 int flags, req_class; 1350 1351 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0), 1352 ("vm_page_alloc: inconsistent object/req")); 1353 if (object != NULL) 1354 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1355 1356 req_class = req & VM_ALLOC_CLASS_MASK; 1357 1358 /* 1359 * The page daemon is allowed to dig deeper into the free page list. 1360 */ 1361 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 1362 req_class = VM_ALLOC_SYSTEM; 1363 1364 mtx_lock(&vm_page_queue_free_mtx); 1365 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || 1366 (req_class == VM_ALLOC_SYSTEM && 1367 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || 1368 (req_class == VM_ALLOC_INTERRUPT && 1369 cnt.v_free_count + cnt.v_cache_count > 0)) { 1370 /* 1371 * Allocate from the free queue if the number of free pages 1372 * exceeds the minimum for the request class. 
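		 * For example (with illustrative threshold values only):
		 * if v_free_reserved were 1024 and v_interrupt_free_min
		 * were 2, a VM_ALLOC_NORMAL request would fail once
		 * free + cache drops to 1024 pages, a VM_ALLOC_SYSTEM
		 * request could keep allocating until only 2 pages remain,
		 * and a VM_ALLOC_INTERRUPT request could take the very
		 * last page.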
1373 */ 1374 if (object != NULL && 1375 (m = vm_page_cache_lookup(object, pindex)) != NULL) { 1376 if ((req & VM_ALLOC_IFNOTCACHED) != 0) { 1377 mtx_unlock(&vm_page_queue_free_mtx); 1378 return (NULL); 1379 } 1380 if (vm_phys_unfree_page(m)) 1381 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); 1382 #if VM_NRESERVLEVEL > 0 1383 else if (!vm_reserv_reactivate_page(m)) 1384 #else 1385 else 1386 #endif 1387 panic("vm_page_alloc: cache page %p is missing" 1388 " from the free queue", m); 1389 } else if ((req & VM_ALLOC_IFCACHED) != 0) { 1390 mtx_unlock(&vm_page_queue_free_mtx); 1391 return (NULL); 1392 #if VM_NRESERVLEVEL > 0 1393 } else if (object == NULL || object->type == OBJT_DEVICE || 1394 object->type == OBJT_SG || 1395 (object->flags & OBJ_COLORED) == 0 || 1396 (m = vm_reserv_alloc_page(object, pindex)) == NULL) { 1397 #else 1398 } else { 1399 #endif 1400 m = vm_phys_alloc_pages(object != NULL ? 1401 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); 1402 #if VM_NRESERVLEVEL > 0 1403 if (m == NULL && vm_reserv_reclaim_inactive()) { 1404 m = vm_phys_alloc_pages(object != NULL ? 1405 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 1406 0); 1407 } 1408 #endif 1409 } 1410 } else { 1411 /* 1412 * Not allocatable, give up. 1413 */ 1414 mtx_unlock(&vm_page_queue_free_mtx); 1415 atomic_add_int(&vm_pageout_deficit, 1416 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); 1417 pagedaemon_wakeup(); 1418 return (NULL); 1419 } 1420 1421 /* 1422 * At this point we had better have found a good page. 1423 */ 1424 KASSERT(m != NULL, ("vm_page_alloc: missing page")); 1425 KASSERT(m->queue == PQ_NONE, 1426 ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue)); 1427 KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m)); 1428 KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m)); 1429 KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m)); 1430 KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m)); 1431 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 1432 ("vm_page_alloc: page %p has unexpected memattr %d", m, 1433 pmap_page_get_memattr(m))); 1434 if ((m->flags & PG_CACHED) != 0) { 1435 KASSERT((m->flags & PG_ZERO) == 0, 1436 ("vm_page_alloc: cached page %p is PG_ZERO", m)); 1437 KASSERT(m->valid != 0, 1438 ("vm_page_alloc: cached page %p is invalid", m)); 1439 if (m->object == object && m->pindex == pindex) 1440 cnt.v_reactivated++; 1441 else 1442 m->valid = 0; 1443 m_object = m->object; 1444 vm_page_cache_remove(m); 1445 if (m_object->type == OBJT_VNODE && m_object->cache == NULL) 1446 vp = m_object->handle; 1447 } else { 1448 KASSERT(VM_PAGE_IS_FREE(m), 1449 ("vm_page_alloc: page %p is not free", m)); 1450 KASSERT(m->valid == 0, 1451 ("vm_page_alloc: free page %p is valid", m)); 1452 cnt.v_free_count--; 1453 } 1454 1455 /* 1456 * Only the PG_ZERO flag is inherited. The PG_CACHED or PG_FREE flag 1457 * must be cleared before the free page queues lock is released. 
1458 */ 1459 flags = 0; 1460 if (req & VM_ALLOC_NODUMP) 1461 flags |= PG_NODUMP; 1462 if (m->flags & PG_ZERO) { 1463 vm_page_zero_count--; 1464 if (req & VM_ALLOC_ZERO) 1465 flags = PG_ZERO; 1466 } 1467 m->flags = flags; 1468 mtx_unlock(&vm_page_queue_free_mtx); 1469 m->aflags = 0; 1470 if (object == NULL || object->type == OBJT_PHYS) 1471 m->oflags = VPO_UNMANAGED; 1472 else 1473 m->oflags = 0; 1474 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0) 1475 m->oflags |= VPO_BUSY; 1476 if (req & VM_ALLOC_WIRED) { 1477 /* 1478 * The page lock is not required for wiring a page until that 1479 * page is inserted into the object. 1480 */ 1481 atomic_add_int(&cnt.v_wire_count, 1); 1482 m->wire_count = 1; 1483 } 1484 m->act_count = 0; 1485 1486 if (object != NULL) { 1487 /* Ignore device objects; the pager sets "memattr" for them. */ 1488 if (object->memattr != VM_MEMATTR_DEFAULT && 1489 object->type != OBJT_DEVICE && object->type != OBJT_SG) 1490 pmap_page_set_memattr(m, object->memattr); 1491 vm_page_insert(m, object, pindex); 1492 } else 1493 m->pindex = pindex; 1494 1495 /* 1496 * The following call to vdrop() must come after the above call 1497 * to vm_page_insert() in case both affect the same object and 1498 * vnode. Otherwise, the affected vnode's hold count could 1499 * temporarily become zero. 1500 */ 1501 if (vp != NULL) 1502 vdrop(vp); 1503 1504 /* 1505 * Don't wakeup too often - wakeup the pageout daemon when 1506 * we would be nearly out of memory. 1507 */ 1508 if (vm_paging_needed()) 1509 pagedaemon_wakeup(); 1510 1511 return (m); 1512 } 1513 1514 /* 1515 * vm_page_alloc_contig: 1516 * 1517 * Allocate a contiguous set of physical pages of the given size "npages" 1518 * from the free lists. All of the physical pages must be at or above 1519 * the given physical address "low" and below the given physical address 1520 * "high". The given value "alignment" determines the alignment of the 1521 * first physical page in the set. If the given value "boundary" is 1522 * non-zero, then the set of physical pages cannot cross any physical 1523 * address boundary that is a multiple of that value. Both "alignment" 1524 * and "boundary" must be a power of two. 1525 * 1526 * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, 1527 * then the memory attribute setting for the physical pages is configured 1528 * to the object's memory attribute setting. Otherwise, the memory 1529 * attribute setting for the physical pages is configured to "memattr", 1530 * overriding the object's memory attribute setting. However, if the 1531 * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the 1532 * memory attribute setting for the physical pages cannot be configured 1533 * to VM_MEMATTR_DEFAULT. 1534 * 1535 * The caller must always specify an allocation class. 1536 * 1537 * allocation classes: 1538 * VM_ALLOC_NORMAL normal process request 1539 * VM_ALLOC_SYSTEM system *really* needs a page 1540 * VM_ALLOC_INTERRUPT interrupt time request 1541 * 1542 * optional allocation flags: 1543 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page 1544 * VM_ALLOC_NOOBJ page is not associated with an object and 1545 * should not have the flag VPO_BUSY set 1546 * VM_ALLOC_WIRED wire the allocated page 1547 * VM_ALLOC_ZERO prefer a zeroed page 1548 * 1549 * This routine may not sleep. 
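 *
 *	An illustrative call (an assumed example, not taken from this file)
 *	that requests npages wired, zero-filled pages anywhere in physical
 *	memory, with no alignment or boundary restriction:
 *
 *		m = vm_page_alloc_contig(NULL, 0,
 *		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO,
 *		    npages, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT);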
1550 */ 1551 vm_page_t 1552 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, 1553 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, 1554 vm_paddr_t boundary, vm_memattr_t memattr) 1555 { 1556 struct vnode *drop; 1557 vm_page_t deferred_vdrop_list, m, m_ret; 1558 u_int flags, oflags; 1559 int req_class; 1560 1561 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0), 1562 ("vm_page_alloc_contig: inconsistent object/req")); 1563 if (object != NULL) { 1564 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1565 KASSERT(object->type == OBJT_PHYS, 1566 ("vm_page_alloc_contig: object %p isn't OBJT_PHYS", 1567 object)); 1568 } 1569 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); 1570 req_class = req & VM_ALLOC_CLASS_MASK; 1571 1572 /* 1573 * The page daemon is allowed to dig deeper into the free page list. 1574 */ 1575 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 1576 req_class = VM_ALLOC_SYSTEM; 1577 1578 deferred_vdrop_list = NULL; 1579 mtx_lock(&vm_page_queue_free_mtx); 1580 if (cnt.v_free_count + cnt.v_cache_count >= npages + 1581 cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && 1582 cnt.v_free_count + cnt.v_cache_count >= npages + 1583 cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && 1584 cnt.v_free_count + cnt.v_cache_count >= npages)) { 1585 #if VM_NRESERVLEVEL > 0 1586 retry: 1587 if (object == NULL || (object->flags & OBJ_COLORED) == 0 || 1588 (m_ret = vm_reserv_alloc_contig(object, pindex, npages, 1589 low, high, alignment, boundary)) == NULL) 1590 #endif 1591 m_ret = vm_phys_alloc_contig(npages, low, high, 1592 alignment, boundary); 1593 } else { 1594 mtx_unlock(&vm_page_queue_free_mtx); 1595 atomic_add_int(&vm_pageout_deficit, npages); 1596 pagedaemon_wakeup(); 1597 return (NULL); 1598 } 1599 if (m_ret != NULL) 1600 for (m = m_ret; m < &m_ret[npages]; m++) { 1601 drop = vm_page_alloc_init(m); 1602 if (drop != NULL) { 1603 /* 1604 * Enqueue the vnode for deferred vdrop(). 1605 * 1606 * Once the pages are removed from the free 1607 * page list, "pageq" can be safely abused to 1608 * construct a short-lived list of vnodes. 1609 */ 1610 m->pageq.tqe_prev = (void *)drop; 1611 m->pageq.tqe_next = deferred_vdrop_list; 1612 deferred_vdrop_list = m; 1613 } 1614 } 1615 else { 1616 #if VM_NRESERVLEVEL > 0 1617 if (vm_reserv_reclaim_contig(npages, low, high, alignment, 1618 boundary)) 1619 goto retry; 1620 #endif 1621 } 1622 mtx_unlock(&vm_page_queue_free_mtx); 1623 if (m_ret == NULL) 1624 return (NULL); 1625 1626 /* 1627 * Initialize the pages. Only the PG_ZERO flag is inherited. 1628 */ 1629 flags = 0; 1630 if ((req & VM_ALLOC_ZERO) != 0) 1631 flags = PG_ZERO; 1632 if ((req & VM_ALLOC_NODUMP) != 0) 1633 flags |= PG_NODUMP; 1634 if ((req & VM_ALLOC_WIRED) != 0) 1635 atomic_add_int(&cnt.v_wire_count, npages); 1636 oflags = VPO_UNMANAGED; 1637 if (object != NULL) { 1638 if ((req & VM_ALLOC_NOBUSY) == 0) 1639 oflags |= VPO_BUSY; 1640 if (object->memattr != VM_MEMATTR_DEFAULT && 1641 memattr == VM_MEMATTR_DEFAULT) 1642 memattr = object->memattr; 1643 } 1644 for (m = m_ret; m < &m_ret[npages]; m++) { 1645 m->aflags = 0; 1646 m->flags &= flags; 1647 if ((req & VM_ALLOC_WIRED) != 0) 1648 m->wire_count = 1; 1649 /* Unmanaged pages don't use "act_count". 
*/ 1650 m->oflags = oflags; 1651 if (memattr != VM_MEMATTR_DEFAULT) 1652 pmap_page_set_memattr(m, memattr); 1653 if (object != NULL) 1654 vm_page_insert(m, object, pindex); 1655 else 1656 m->pindex = pindex; 1657 pindex++; 1658 } 1659 while (deferred_vdrop_list != NULL) { 1660 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev); 1661 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next; 1662 } 1663 if (vm_paging_needed()) 1664 pagedaemon_wakeup(); 1665 return (m_ret); 1666 } 1667 1668 /* 1669 * Initialize a page that has been freshly dequeued from a freelist. 1670 * The caller has to drop the vnode returned, if it is not NULL. 1671 * 1672 * This function may only be used to initialize unmanaged pages. 1673 * 1674 * To be called with vm_page_queue_free_mtx held. 1675 */ 1676 static struct vnode * 1677 vm_page_alloc_init(vm_page_t m) 1678 { 1679 struct vnode *drop; 1680 vm_object_t m_object; 1681 1682 KASSERT(m->queue == PQ_NONE, 1683 ("vm_page_alloc_init: page %p has unexpected queue %d", 1684 m, m->queue)); 1685 KASSERT(m->wire_count == 0, 1686 ("vm_page_alloc_init: page %p is wired", m)); 1687 KASSERT(m->hold_count == 0, 1688 ("vm_page_alloc_init: page %p is held", m)); 1689 KASSERT(m->busy == 0, 1690 ("vm_page_alloc_init: page %p is busy", m)); 1691 KASSERT(m->dirty == 0, 1692 ("vm_page_alloc_init: page %p is dirty", m)); 1693 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 1694 ("vm_page_alloc_init: page %p has unexpected memattr %d", 1695 m, pmap_page_get_memattr(m))); 1696 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1697 drop = NULL; 1698 if ((m->flags & PG_CACHED) != 0) { 1699 KASSERT((m->flags & PG_ZERO) == 0, 1700 ("vm_page_alloc_init: cached page %p is PG_ZERO", m)); 1701 m->valid = 0; 1702 m_object = m->object; 1703 vm_page_cache_remove(m); 1704 if (m_object->type == OBJT_VNODE && m_object->cache == NULL) 1705 drop = m_object->handle; 1706 } else { 1707 KASSERT(VM_PAGE_IS_FREE(m), 1708 ("vm_page_alloc_init: page %p is not free", m)); 1709 KASSERT(m->valid == 0, 1710 ("vm_page_alloc_init: free page %p is valid", m)); 1711 cnt.v_free_count--; 1712 if ((m->flags & PG_ZERO) != 0) 1713 vm_page_zero_count--; 1714 } 1715 /* Don't clear the PG_ZERO flag; we'll need it later. */ 1716 m->flags &= PG_ZERO; 1717 return (drop); 1718 } 1719 1720 /* 1721 * vm_page_alloc_freelist: 1722 * 1723 * Allocate a physical page from the specified free page list. 1724 * 1725 * The caller must always specify an allocation class. 1726 * 1727 * allocation classes: 1728 * VM_ALLOC_NORMAL normal process request 1729 * VM_ALLOC_SYSTEM system *really* needs a page 1730 * VM_ALLOC_INTERRUPT interrupt time request 1731 * 1732 * optional allocation flags: 1733 * VM_ALLOC_COUNT(number) the number of additional pages that the caller 1734 * intends to allocate 1735 * VM_ALLOC_WIRED wire the allocated page 1736 * VM_ALLOC_ZERO prefer a zeroed page 1737 * 1738 * This routine may not sleep. 1739 */ 1740 vm_page_t 1741 vm_page_alloc_freelist(int flind, int req) 1742 { 1743 struct vnode *drop; 1744 vm_page_t m; 1745 u_int flags; 1746 int req_class; 1747 1748 req_class = req & VM_ALLOC_CLASS_MASK; 1749 1750 /* 1751 * The page daemon is allowed to dig deeper into the free page list. 1752 */ 1753 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) 1754 req_class = VM_ALLOC_SYSTEM; 1755 1756 /* 1757 * Do not allocate reserved pages unless the req has asked for it. 
1758 */ 1759 mtx_lock(&vm_page_queue_free_mtx); 1760 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || 1761 (req_class == VM_ALLOC_SYSTEM && 1762 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || 1763 (req_class == VM_ALLOC_INTERRUPT && 1764 cnt.v_free_count + cnt.v_cache_count > 0)) 1765 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); 1766 else { 1767 mtx_unlock(&vm_page_queue_free_mtx); 1768 atomic_add_int(&vm_pageout_deficit, 1769 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); 1770 pagedaemon_wakeup(); 1771 return (NULL); 1772 } 1773 if (m == NULL) { 1774 mtx_unlock(&vm_page_queue_free_mtx); 1775 return (NULL); 1776 } 1777 drop = vm_page_alloc_init(m); 1778 mtx_unlock(&vm_page_queue_free_mtx); 1779 1780 /* 1781 * Initialize the page. Only the PG_ZERO flag is inherited. 1782 */ 1783 m->aflags = 0; 1784 flags = 0; 1785 if ((req & VM_ALLOC_ZERO) != 0) 1786 flags = PG_ZERO; 1787 m->flags &= flags; 1788 if ((req & VM_ALLOC_WIRED) != 0) { 1789 /* 1790 * The page lock is not required for wiring a page that does 1791 * not belong to an object. 1792 */ 1793 atomic_add_int(&cnt.v_wire_count, 1); 1794 m->wire_count = 1; 1795 } 1796 /* Unmanaged pages don't use "act_count". */ 1797 m->oflags = VPO_UNMANAGED; 1798 if (drop != NULL) 1799 vdrop(drop); 1800 if (vm_paging_needed()) 1801 pagedaemon_wakeup(); 1802 return (m); 1803 } 1804 1805 /* 1806 * vm_wait: (also see VM_WAIT macro) 1807 * 1808 * Block until free pages are available for allocation 1809 * - Called in various places before memory allocations. 1810 */ 1811 void 1812 vm_wait(void) 1813 { 1814 1815 mtx_lock(&vm_page_queue_free_mtx); 1816 if (curproc == pageproc) { 1817 vm_pageout_pages_needed = 1; 1818 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, 1819 PDROP | PSWP, "VMWait", 0); 1820 } else { 1821 if (!vm_pages_needed) { 1822 vm_pages_needed = 1; 1823 wakeup(&vm_pages_needed); 1824 } 1825 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, 1826 "vmwait", 0); 1827 } 1828 } 1829 1830 /* 1831 * vm_waitpfault: (also see VM_WAITPFAULT macro) 1832 * 1833 * Block until free pages are available for allocation 1834 * - Called only in vm_fault so that processes page faulting 1835 * can be easily tracked. 1836 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing 1837 * processes will be able to grab memory first. Do not change 1838 * this balance without careful testing first. 1839 */ 1840 void 1841 vm_waitpfault(void) 1842 { 1843 1844 mtx_lock(&vm_page_queue_free_mtx); 1845 if (!vm_pages_needed) { 1846 vm_pages_needed = 1; 1847 wakeup(&vm_pages_needed); 1848 } 1849 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, 1850 "pfault", 0); 1851 } 1852 1853 /* 1854 * vm_page_requeue: 1855 * 1856 * Move the given page to the tail of its present page queue. 1857 * 1858 * The page queues must be locked. 1859 */ 1860 void 1861 vm_page_requeue(vm_page_t m) 1862 { 1863 struct vpgqueues *vpq; 1864 int queue; 1865 1866 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1867 queue = m->queue; 1868 KASSERT(queue != PQ_NONE, 1869 ("vm_page_requeue: page %p is not queued", m)); 1870 vpq = &vm_page_queues[queue]; 1871 TAILQ_REMOVE(&vpq->pl, m, pageq); 1872 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 1873 } 1874 1875 /* 1876 * vm_page_queue_remove: 1877 * 1878 * Remove the given page from the specified queue. 1879 * 1880 * The page and page queues must be locked. 
 */
static __inline void
vm_page_queue_remove(int queue, vm_page_t m)
{
	struct vpgqueues *pq;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	vm_page_lock_assert(m, MA_OWNED);
	pq = &vm_page_queues[queue];
	TAILQ_REMOVE(&pq->pl, m, pageq);
	(*pq->cnt)--;
}

/*
 *	vm_pageq_remove:
 *
 *	Remove a page from its queue.
 *
 *	The given page must be locked.
 *	This routine may not block.
 */
void
vm_pageq_remove(vm_page_t m)
{
	int queue;

	vm_page_lock_assert(m, MA_OWNED);
	if ((queue = m->queue) != PQ_NONE) {
		vm_page_lock_queues();
		m->queue = PQ_NONE;
		vm_page_queue_remove(queue, m);
		vm_page_unlock_queues();
	}
}

/*
 *	vm_page_enqueue:
 *
 *	Add the given page to the specified queue.
 *
 *	The page queues must be locked.
 */
static void
vm_page_enqueue(int queue, vm_page_t m)
{
	struct vpgqueues *vpq;

	vpq = &vm_page_queues[queue];
	m->queue = queue;
	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
	++*vpq->cnt;
}

/*
 *	vm_page_activate:
 *
 *	Put the specified page on the active list (if appropriate).
 *	Ensure that act_count is at least ACT_INIT but do not otherwise
 *	mess with it.
 *
 *	The page must be locked.
 *	This routine may not block.
 */
void
vm_page_activate(vm_page_t m)
{
	int queue;

	vm_page_lock_assert(m, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if ((queue = m->queue) != PQ_ACTIVE) {
		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
			if (m->act_count < ACT_INIT)
				m->act_count = ACT_INIT;
			vm_page_lock_queues();
			if (queue != PQ_NONE)
				vm_page_queue_remove(queue, m);
			vm_page_enqueue(PQ_ACTIVE, m);
			vm_page_unlock_queues();
		} else
			KASSERT(queue == PQ_NONE,
			    ("vm_page_activate: wired page %p is queued", m));
	} else {
		if (m->act_count < ACT_INIT)
			m->act_count = ACT_INIT;
	}
}

/*
 *	vm_page_free_wakeup:
 *
 *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
 *	routine is called when a page has been added to the cache or free
 *	queues.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
static inline void
vm_page_free_wakeup(void)
{

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	/*
	 * If the pageout daemon needs pages, then tell it that there
	 * are some free.
	 */
	if (vm_pageout_pages_needed &&
	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
		wakeup(&vm_pageout_pages_needed);
		vm_pageout_pages_needed = 0;
	}
	/*
	 * Wake up processes that are waiting on memory if we hit a
	 * high water mark, and wake up the scheduler process if we
	 * have lots of memory; it will swap processes back in.
	 */
	if (vm_pages_needed && !vm_page_count_min()) {
		vm_pages_needed = 0;
		wakeup(&cnt.v_free_count);
	}
}

/*
 *	vm_page_free_toq:
 *
 *	Returns the given page to the free list,
 *	disassociating it from any VM object.
 *
 *	Object and page must be locked prior to entry.
 *	This routine may not block.
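 *
 *	The usual way to reach this function is through vm_page_free(),
 *	e.g. (illustrative sketch only), with the object lock also held
 *	when the page belongs to an object:
 *
 *		vm_page_lock(m);
 *		vm_page_free(m);
 *		vm_page_unlock(m);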
2012 */ 2013 2014 void 2015 vm_page_free_toq(vm_page_t m) 2016 { 2017 2018 if ((m->oflags & VPO_UNMANAGED) == 0) { 2019 vm_page_lock_assert(m, MA_OWNED); 2020 KASSERT(!pmap_page_is_mapped(m), 2021 ("vm_page_free_toq: freeing mapped page %p", m)); 2022 } 2023 PCPU_INC(cnt.v_tfree); 2024 2025 if (VM_PAGE_IS_FREE(m)) 2026 panic("vm_page_free: freeing free page %p", m); 2027 else if (m->busy != 0) 2028 panic("vm_page_free: freeing busy page %p", m); 2029 2030 /* 2031 * unqueue, then remove page. Note that we cannot destroy 2032 * the page here because we do not want to call the pager's 2033 * callback routine until after we've put the page on the 2034 * appropriate free queue. 2035 */ 2036 if ((m->oflags & VPO_UNMANAGED) == 0) 2037 vm_pageq_remove(m); 2038 vm_page_remove(m); 2039 2040 /* 2041 * If fictitious remove object association and 2042 * return, otherwise delay object association removal. 2043 */ 2044 if ((m->flags & PG_FICTITIOUS) != 0) { 2045 return; 2046 } 2047 2048 m->valid = 0; 2049 vm_page_undirty(m); 2050 2051 if (m->wire_count != 0) 2052 panic("vm_page_free: freeing wired page %p", m); 2053 if (m->hold_count != 0) { 2054 m->flags &= ~PG_ZERO; 2055 vm_page_lock_queues(); 2056 vm_page_enqueue(PQ_HOLD, m); 2057 vm_page_unlock_queues(); 2058 } else { 2059 /* 2060 * Restore the default memory attribute to the page. 2061 */ 2062 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 2063 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 2064 2065 /* 2066 * Insert the page into the physical memory allocator's 2067 * cache/free page queues. 2068 */ 2069 mtx_lock(&vm_page_queue_free_mtx); 2070 m->flags |= PG_FREE; 2071 cnt.v_free_count++; 2072 #if VM_NRESERVLEVEL > 0 2073 if (!vm_reserv_free_page(m)) 2074 #else 2075 if (TRUE) 2076 #endif 2077 vm_phys_free_pages(m, 0); 2078 if ((m->flags & PG_ZERO) != 0) 2079 ++vm_page_zero_count; 2080 else 2081 vm_page_zero_idle_wakeup(); 2082 vm_page_free_wakeup(); 2083 mtx_unlock(&vm_page_queue_free_mtx); 2084 } 2085 } 2086 2087 /* 2088 * vm_page_wire: 2089 * 2090 * Mark this page as wired down by yet 2091 * another map, removing it from paging queues 2092 * as necessary. 2093 * 2094 * If the page is fictitious, then its wire count must remain one. 2095 * 2096 * The page must be locked. 2097 * This routine may not block. 2098 */ 2099 void 2100 vm_page_wire(vm_page_t m) 2101 { 2102 2103 /* 2104 * Only bump the wire statistics if the page is not already wired, 2105 * and only unqueue the page if it is on some queue (if it is unmanaged 2106 * it is already off the queues). 2107 */ 2108 vm_page_lock_assert(m, MA_OWNED); 2109 if ((m->flags & PG_FICTITIOUS) != 0) { 2110 KASSERT(m->wire_count == 1, 2111 ("vm_page_wire: fictitious page %p's wire count isn't one", 2112 m)); 2113 return; 2114 } 2115 if (m->wire_count == 0) { 2116 if ((m->oflags & VPO_UNMANAGED) == 0) 2117 vm_pageq_remove(m); 2118 atomic_add_int(&cnt.v_wire_count, 1); 2119 } 2120 m->wire_count++; 2121 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); 2122 } 2123 2124 /* 2125 * vm_page_unwire: 2126 * 2127 * Release one wiring of the specified page, potentially enabling it to be 2128 * paged again. If paging is enabled, then the value of the parameter 2129 * "activate" determines to which queue the page is added. If "activate" is 2130 * non-zero, then the page is added to the active queue. Otherwise, it is 2131 * added to the inactive queue. 2132 * 2133 * However, unless the page belongs to an object, it is not enqueued because 2134 * it cannot be paged out. 
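 *
 * As a sketch, a wiring is normally balanced against exactly one call
 * to this function; the page lock is required around both operations
 * for managed pages ("m" is illustrative):
 *
 *	vm_page_lock(m);
 *	vm_page_wire(m);
 *	vm_page_unlock(m);
 *
 *	(the page is now guaranteed to stay resident)
 *
 *	vm_page_lock(m);
 *	vm_page_unwire(m, 0);
 *	vm_page_unlock(m);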
2135 * 2136 * If a page is fictitious, then its wire count must alway be one. 2137 * 2138 * A managed page must be locked. 2139 */ 2140 void 2141 vm_page_unwire(vm_page_t m, int activate) 2142 { 2143 2144 if ((m->oflags & VPO_UNMANAGED) == 0) 2145 vm_page_lock_assert(m, MA_OWNED); 2146 if ((m->flags & PG_FICTITIOUS) != 0) { 2147 KASSERT(m->wire_count == 1, 2148 ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); 2149 return; 2150 } 2151 if (m->wire_count > 0) { 2152 m->wire_count--; 2153 if (m->wire_count == 0) { 2154 atomic_subtract_int(&cnt.v_wire_count, 1); 2155 if ((m->oflags & VPO_UNMANAGED) != 0 || 2156 m->object == NULL) 2157 return; 2158 if (!activate) 2159 m->flags &= ~PG_WINATCFLS; 2160 vm_page_lock_queues(); 2161 vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m); 2162 vm_page_unlock_queues(); 2163 } 2164 } else 2165 panic("vm_page_unwire: page %p's wire count is zero", m); 2166 } 2167 2168 /* 2169 * Move the specified page to the inactive queue. 2170 * 2171 * Many pages placed on the inactive queue should actually go 2172 * into the cache, but it is difficult to figure out which. What 2173 * we do instead, if the inactive target is well met, is to put 2174 * clean pages at the head of the inactive queue instead of the tail. 2175 * This will cause them to be moved to the cache more quickly and 2176 * if not actively re-referenced, reclaimed more quickly. If we just 2177 * stick these pages at the end of the inactive queue, heavy filesystem 2178 * meta-data accesses can cause an unnecessary paging load on memory bound 2179 * processes. This optimization causes one-time-use metadata to be 2180 * reused more quickly. 2181 * 2182 * Normally athead is 0 resulting in LRU operation. athead is set 2183 * to 1 if we want this page to be 'as if it were placed in the cache', 2184 * except without unmapping it from the process address space. 2185 * 2186 * This routine may not block. 2187 */ 2188 static inline void 2189 _vm_page_deactivate(vm_page_t m, int athead) 2190 { 2191 int queue; 2192 2193 vm_page_lock_assert(m, MA_OWNED); 2194 2195 /* 2196 * Ignore if already inactive. 2197 */ 2198 if ((queue = m->queue) == PQ_INACTIVE) 2199 return; 2200 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 2201 m->flags &= ~PG_WINATCFLS; 2202 vm_page_lock_queues(); 2203 if (queue != PQ_NONE) 2204 vm_page_queue_remove(queue, m); 2205 if (athead) 2206 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, 2207 pageq); 2208 else 2209 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, 2210 pageq); 2211 m->queue = PQ_INACTIVE; 2212 cnt.v_inactive_count++; 2213 vm_page_unlock_queues(); 2214 } 2215 } 2216 2217 /* 2218 * Move the specified page to the inactive queue. 2219 * 2220 * The page must be locked. 2221 */ 2222 void 2223 vm_page_deactivate(vm_page_t m) 2224 { 2225 2226 _vm_page_deactivate(m, 0); 2227 } 2228 2229 /* 2230 * vm_page_try_to_cache: 2231 * 2232 * Returns 0 on failure, 1 on success 2233 */ 2234 int 2235 vm_page_try_to_cache(vm_page_t m) 2236 { 2237 2238 vm_page_lock_assert(m, MA_OWNED); 2239 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2240 if (m->dirty || m->hold_count || m->busy || m->wire_count || 2241 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0) 2242 return (0); 2243 pmap_remove_all(m); 2244 if (m->dirty) 2245 return (0); 2246 vm_page_cache(m); 2247 return (1); 2248 } 2249 2250 /* 2251 * vm_page_try_to_free() 2252 * 2253 * Attempt to free the page. If we cannot free it, we do nothing. 2254 * 1 is returned on success, 0 on failure. 
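 *
 * A minimal usage sketch (the owning object's lock, if the page has an
 * object, is assumed to already be held; "was_freed" is an illustrative
 * local):
 *
 *	vm_page_lock(m);
 *	was_freed = vm_page_try_to_free(m);
 *	vm_page_unlock(m);
 *
 * The page lock is released by the caller in either case; on success the
 * page has already been handed back to the free queues by vm_page_free().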
2255 */ 2256 int 2257 vm_page_try_to_free(vm_page_t m) 2258 { 2259 2260 vm_page_lock_assert(m, MA_OWNED); 2261 if (m->object != NULL) 2262 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2263 if (m->dirty || m->hold_count || m->busy || m->wire_count || 2264 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0) 2265 return (0); 2266 pmap_remove_all(m); 2267 if (m->dirty) 2268 return (0); 2269 vm_page_free(m); 2270 return (1); 2271 } 2272 2273 /* 2274 * vm_page_cache 2275 * 2276 * Put the specified page onto the page cache queue (if appropriate). 2277 * 2278 * This routine may not block. 2279 */ 2280 void 2281 vm_page_cache(vm_page_t m) 2282 { 2283 vm_object_t object; 2284 vm_page_t next, prev, root; 2285 2286 vm_page_lock_assert(m, MA_OWNED); 2287 object = m->object; 2288 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2289 if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy || 2290 m->hold_count || m->wire_count) 2291 panic("vm_page_cache: attempting to cache busy page"); 2292 pmap_remove_all(m); 2293 if (m->dirty != 0) 2294 panic("vm_page_cache: page %p is dirty", m); 2295 if (m->valid == 0 || object->type == OBJT_DEFAULT || 2296 (object->type == OBJT_SWAP && 2297 !vm_pager_has_page(object, m->pindex, NULL, NULL))) { 2298 /* 2299 * Hypothesis: A cache-elgible page belonging to a 2300 * default object or swap object but without a backing 2301 * store must be zero filled. 2302 */ 2303 vm_page_free(m); 2304 return; 2305 } 2306 KASSERT((m->flags & PG_CACHED) == 0, 2307 ("vm_page_cache: page %p is already cached", m)); 2308 PCPU_INC(cnt.v_tcached); 2309 2310 /* 2311 * Remove the page from the paging queues. 2312 */ 2313 vm_pageq_remove(m); 2314 2315 /* 2316 * Remove the page from the object's collection of resident 2317 * pages. 2318 */ 2319 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) { 2320 /* 2321 * Since the page's successor in the list is also its parent 2322 * in the tree, its right subtree must be empty. 2323 */ 2324 next->left = m->left; 2325 KASSERT(m->right == NULL, 2326 ("vm_page_cache: page %p has right child", m)); 2327 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && 2328 prev->right == m) { 2329 /* 2330 * Since the page's predecessor in the list is also its parent 2331 * in the tree, its left subtree must be empty. 2332 */ 2333 KASSERT(m->left == NULL, 2334 ("vm_page_cache: page %p has left child", m)); 2335 prev->right = m->right; 2336 } else { 2337 if (m != object->root) 2338 vm_page_splay(m->pindex, object->root); 2339 if (m->left == NULL) 2340 root = m->right; 2341 else if (m->right == NULL) 2342 root = m->left; 2343 else { 2344 /* 2345 * Move the page's successor to the root, because 2346 * pages are usually removed in ascending order. 2347 */ 2348 if (m->right != next) 2349 vm_page_splay(m->pindex, m->right); 2350 next->left = m->left; 2351 root = next; 2352 } 2353 object->root = root; 2354 } 2355 TAILQ_REMOVE(&object->memq, m, listq); 2356 object->resident_page_count--; 2357 2358 /* 2359 * Restore the default memory attribute to the page. 2360 */ 2361 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 2362 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 2363 2364 /* 2365 * Insert the page into the object's collection of cached pages 2366 * and the physical memory allocator's cache/free page queues. 
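 *
 * The insertion below splays the cache tree about m->pindex so that an
 * adjacent cached pindex becomes the root, and then installs m as the
 * new root with the old root hung off its left or right side depending
 * on the comparison.  For example, caching pindex 5 into a cache tree
 * holding only pindex 7 leaves 5 at the root with 7 as its right child.
 * A duplicate pindex is unexpected and panics.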
2367 */ 2368 m->flags &= ~PG_ZERO; 2369 mtx_lock(&vm_page_queue_free_mtx); 2370 m->flags |= PG_CACHED; 2371 cnt.v_cache_count++; 2372 root = object->cache; 2373 if (root == NULL) { 2374 m->left = NULL; 2375 m->right = NULL; 2376 } else { 2377 root = vm_page_splay(m->pindex, root); 2378 if (m->pindex < root->pindex) { 2379 m->left = root->left; 2380 m->right = root; 2381 root->left = NULL; 2382 } else if (__predict_false(m->pindex == root->pindex)) 2383 panic("vm_page_cache: offset already cached"); 2384 else { 2385 m->right = root->right; 2386 m->left = root; 2387 root->right = NULL; 2388 } 2389 } 2390 object->cache = m; 2391 #if VM_NRESERVLEVEL > 0 2392 if (!vm_reserv_free_page(m)) { 2393 #else 2394 if (TRUE) { 2395 #endif 2396 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0); 2397 vm_phys_free_pages(m, 0); 2398 } 2399 vm_page_free_wakeup(); 2400 mtx_unlock(&vm_page_queue_free_mtx); 2401 2402 /* 2403 * Increment the vnode's hold count if this is the object's only 2404 * cached page. Decrement the vnode's hold count if this was 2405 * the object's only resident page. 2406 */ 2407 if (object->type == OBJT_VNODE) { 2408 if (root == NULL && object->resident_page_count != 0) 2409 vhold(object->handle); 2410 else if (root != NULL && object->resident_page_count == 0) 2411 vdrop(object->handle); 2412 } 2413 } 2414 2415 /* 2416 * vm_page_dontneed 2417 * 2418 * Cache, deactivate, or do nothing as appropriate. This routine 2419 * is typically used by madvise() MADV_DONTNEED. 2420 * 2421 * Generally speaking we want to move the page into the cache so 2422 * it gets reused quickly. However, this can result in a silly syndrome 2423 * due to the page recycling too quickly. Small objects will not be 2424 * fully cached. On the otherhand, if we move the page to the inactive 2425 * queue we wind up with a problem whereby very large objects 2426 * unnecessarily blow away our inactive and cache queues. 2427 * 2428 * The solution is to move the pages based on a fixed weighting. We 2429 * either leave them alone, deactivate them, or move them to the cache, 2430 * where moving them to the cache has the highest weighting. 2431 * By forcing some pages into other queues we eventually force the 2432 * system to balance the queues, potentially recovering other unrelated 2433 * space from active. The idea is to not force this to happen too 2434 * often. 2435 */ 2436 void 2437 vm_page_dontneed(vm_page_t m) 2438 { 2439 int dnw; 2440 int head; 2441 2442 vm_page_lock_assert(m, MA_OWNED); 2443 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2444 dnw = PCPU_GET(dnweight); 2445 PCPU_INC(dnweight); 2446 2447 /* 2448 * Occasionally leave the page alone. 2449 */ 2450 if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) { 2451 if (m->act_count >= ACT_INIT) 2452 --m->act_count; 2453 return; 2454 } 2455 2456 /* 2457 * Clear any references to the page. Otherwise, the page daemon will 2458 * immediately reactivate the page. 2459 * 2460 * Perform the pmap_clear_reference() first. Otherwise, a concurrent 2461 * pmap operation, such as pmap_remove(), could clear a reference in 2462 * the pmap and set PGA_REFERENCED on the page before the 2463 * pmap_clear_reference() had completed. Consequently, the page would 2464 * appear referenced based upon an old reference that occurred before 2465 * this function ran. 
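 *
 * As a concrete sketch of the race being avoided, suppose the flag were
 * cleared first:
 *
 *	this CPU			another CPU
 *	--------			-----------
 *	vm_page_aflag_clear(m,
 *	    PGA_REFERENCED);
 *					pmap_remove() clears the old
 *					reference in the pmap and sets
 *					PGA_REFERENCED on m
 *	pmap_clear_reference(m);
 *
 * The stale PGA_REFERENCED would survive, and the page daemon would
 * reactivate a page this routine was trying to retire.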
2466 */ 2467 pmap_clear_reference(m); 2468 vm_page_aflag_clear(m, PGA_REFERENCED); 2469 2470 if (m->dirty == 0 && pmap_is_modified(m)) 2471 vm_page_dirty(m); 2472 2473 if (m->dirty || (dnw & 0x0070) == 0) { 2474 /* 2475 * Deactivate the page 3 times out of 32. 2476 */ 2477 head = 0; 2478 } else { 2479 /* 2480 * Cache the page 28 times out of every 32. Note that 2481 * the page is deactivated instead of cached, but placed 2482 * at the head of the queue instead of the tail. 2483 */ 2484 head = 1; 2485 } 2486 _vm_page_deactivate(m, head); 2487 } 2488 2489 /* 2490 * Grab a page, waiting until we are waken up due to the page 2491 * changing state. We keep on waiting, if the page continues 2492 * to be in the object. If the page doesn't exist, first allocate it 2493 * and then conditionally zero it. 2494 * 2495 * The caller must always specify the VM_ALLOC_RETRY flag. This is intended 2496 * to facilitate its eventual removal. 2497 * 2498 * This routine may block. 2499 */ 2500 vm_page_t 2501 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) 2502 { 2503 vm_page_t m; 2504 2505 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2506 KASSERT((allocflags & VM_ALLOC_RETRY) != 0, 2507 ("vm_page_grab: VM_ALLOC_RETRY is required")); 2508 retrylookup: 2509 if ((m = vm_page_lookup(object, pindex)) != NULL) { 2510 if ((m->oflags & VPO_BUSY) != 0 || 2511 ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) { 2512 /* 2513 * Reference the page before unlocking and 2514 * sleeping so that the page daemon is less 2515 * likely to reclaim it. 2516 */ 2517 vm_page_aflag_set(m, PGA_REFERENCED); 2518 vm_page_sleep(m, "pgrbwt"); 2519 goto retrylookup; 2520 } else { 2521 if ((allocflags & VM_ALLOC_WIRED) != 0) { 2522 vm_page_lock(m); 2523 vm_page_wire(m); 2524 vm_page_unlock(m); 2525 } 2526 if ((allocflags & VM_ALLOC_NOBUSY) == 0) 2527 vm_page_busy(m); 2528 return (m); 2529 } 2530 } 2531 m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY | 2532 VM_ALLOC_IGN_SBUSY)); 2533 if (m == NULL) { 2534 VM_OBJECT_UNLOCK(object); 2535 VM_WAIT; 2536 VM_OBJECT_LOCK(object); 2537 goto retrylookup; 2538 } else if (m->valid != 0) 2539 return (m); 2540 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) 2541 pmap_zero_page(m); 2542 return (m); 2543 } 2544 2545 /* 2546 * Mapping function for valid bits or for dirty bits in 2547 * a page. May not block. 2548 * 2549 * Inputs are required to range within a page. 2550 */ 2551 vm_page_bits_t 2552 vm_page_bits(int base, int size) 2553 { 2554 int first_bit; 2555 int last_bit; 2556 2557 KASSERT( 2558 base + size <= PAGE_SIZE, 2559 ("vm_page_bits: illegal base/size %d/%d", base, size) 2560 ); 2561 2562 if (size == 0) /* handle degenerate case */ 2563 return (0); 2564 2565 first_bit = base >> DEV_BSHIFT; 2566 last_bit = (base + size - 1) >> DEV_BSHIFT; 2567 2568 return (((vm_page_bits_t)2 << last_bit) - 2569 ((vm_page_bits_t)1 << first_bit)); 2570 } 2571 2572 /* 2573 * vm_page_set_valid_range: 2574 * 2575 * Sets portions of a page valid. The arguments are expected 2576 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 2577 * of any partial chunks touched by the range. The invalid portion of 2578 * such chunks will be zeroed. 2579 * 2580 * (base + size) must be less then or equal to PAGE_SIZE. 
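 *
 * A worked example (assuming DEV_BSIZE == 512, i.e. DEV_BSHIFT == 9):
 * a call with base = 100 and size = 1000 covers bytes [100, 1100), so
 * vm_page_bits(100, 1000) yields
 *
 *	first_bit = 100 >> 9 = 0,  last_bit = (100 + 1000 - 1) >> 9 = 2
 *	(2 << 2) - (1 << 0) = 0x07
 *
 * marking chunks 0-2 valid.  If chunk 0 (resp. chunk 2) was previously
 * invalid, its untouched head [0, 100) (resp. tail [1100, 1536)) is
 * zeroed first.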
2581 */ 2582 void 2583 vm_page_set_valid_range(vm_page_t m, int base, int size) 2584 { 2585 int endoff, frag; 2586 2587 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2588 if (size == 0) /* handle degenerate case */ 2589 return; 2590 2591 /* 2592 * If the base is not DEV_BSIZE aligned and the valid 2593 * bit is clear, we have to zero out a portion of the 2594 * first block. 2595 */ 2596 if ((frag = base & ~(DEV_BSIZE - 1)) != base && 2597 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) 2598 pmap_zero_page_area(m, frag, base - frag); 2599 2600 /* 2601 * If the ending offset is not DEV_BSIZE aligned and the 2602 * valid bit is clear, we have to zero out a portion of 2603 * the last block. 2604 */ 2605 endoff = base + size; 2606 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && 2607 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) 2608 pmap_zero_page_area(m, endoff, 2609 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 2610 2611 /* 2612 * Assert that no previously invalid block that is now being validated 2613 * is already dirty. 2614 */ 2615 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, 2616 ("vm_page_set_valid_range: page %p is dirty", m)); 2617 2618 /* 2619 * Set valid bits inclusive of any overlap. 2620 */ 2621 m->valid |= vm_page_bits(base, size); 2622 } 2623 2624 /* 2625 * Clear the given bits from the specified page's dirty field. 2626 */ 2627 static __inline void 2628 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) 2629 { 2630 uintptr_t addr; 2631 #if PAGE_SIZE < 16384 2632 int shift; 2633 #endif 2634 2635 /* 2636 * If the object is locked and the page is neither VPO_BUSY nor 2637 * PGA_WRITEABLE, then the page's dirty field cannot possibly be 2638 * set by a concurrent pmap operation. 2639 */ 2640 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2641 if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0) 2642 m->dirty &= ~pagebits; 2643 else { 2644 /* 2645 * The pmap layer can call vm_page_dirty() without 2646 * holding a distinguished lock. The combination of 2647 * the object's lock and an atomic operation suffice 2648 * to guarantee consistency of the page dirty field. 2649 * 2650 * For PAGE_SIZE == 32768 case, compiler already 2651 * properly aligns the dirty field, so no forcible 2652 * alignment is needed. Only require existence of 2653 * atomic_clear_64 when page size is 32768. 2654 */ 2655 addr = (uintptr_t)&m->dirty; 2656 #if PAGE_SIZE == 32768 2657 atomic_clear_64((uint64_t *)addr, pagebits); 2658 #elif PAGE_SIZE == 16384 2659 atomic_clear_32((uint32_t *)addr, pagebits); 2660 #else /* PAGE_SIZE <= 8192 */ 2661 /* 2662 * Use a trick to perform a 32-bit atomic on the 2663 * containing aligned word, to not depend on the existence 2664 * of atomic_clear_{8, 16}. 2665 */ 2666 shift = addr & (sizeof(uint32_t) - 1); 2667 #if BYTE_ORDER == BIG_ENDIAN 2668 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; 2669 #else 2670 shift *= NBBY; 2671 #endif 2672 addr &= ~(sizeof(uint32_t) - 1); 2673 atomic_clear_32((uint32_t *)addr, pagebits << shift); 2674 #endif /* PAGE_SIZE */ 2675 } 2676 } 2677 2678 /* 2679 * vm_page_set_validclean: 2680 * 2681 * Sets portions of a page valid and clean. The arguments are expected 2682 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive 2683 * of any partial chunks touched by the range. The invalid portion of 2684 * such chunks will be zero'd. 2685 * 2686 * This routine may not block. 2687 * 2688 * (base + size) must be less then or equal to PAGE_SIZE. 
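 *
 * For example (assuming DEV_BSIZE == 512), a call with base = 512 and
 * size = 1024 marks chunks 1 and 2 valid and clears their dirty bits
 * while leaving every other chunk alone.  Only a call covering the
 * whole page (base == 0, size == PAGE_SIZE) additionally clears the
 * pmap modify bit (when the page was already fully valid and thus
 * possibly mapped) and the VPO_NOSYNC flag.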
2689 */ 2690 void 2691 vm_page_set_validclean(vm_page_t m, int base, int size) 2692 { 2693 vm_page_bits_t oldvalid, pagebits; 2694 int endoff, frag; 2695 2696 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2697 if (size == 0) /* handle degenerate case */ 2698 return; 2699 2700 /* 2701 * If the base is not DEV_BSIZE aligned and the valid 2702 * bit is clear, we have to zero out a portion of the 2703 * first block. 2704 */ 2705 if ((frag = base & ~(DEV_BSIZE - 1)) != base && 2706 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) 2707 pmap_zero_page_area(m, frag, base - frag); 2708 2709 /* 2710 * If the ending offset is not DEV_BSIZE aligned and the 2711 * valid bit is clear, we have to zero out a portion of 2712 * the last block. 2713 */ 2714 endoff = base + size; 2715 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && 2716 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) 2717 pmap_zero_page_area(m, endoff, 2718 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); 2719 2720 /* 2721 * Set valid, clear dirty bits. If validating the entire 2722 * page we can safely clear the pmap modify bit. We also 2723 * use this opportunity to clear the VPO_NOSYNC flag. If a process 2724 * takes a write fault on a MAP_NOSYNC memory area the flag will 2725 * be set again. 2726 * 2727 * We set valid bits inclusive of any overlap, but we can only 2728 * clear dirty bits for DEV_BSIZE chunks that are fully within 2729 * the range. 2730 */ 2731 oldvalid = m->valid; 2732 pagebits = vm_page_bits(base, size); 2733 m->valid |= pagebits; 2734 #if 0 /* NOT YET */ 2735 if ((frag = base & (DEV_BSIZE - 1)) != 0) { 2736 frag = DEV_BSIZE - frag; 2737 base += frag; 2738 size -= frag; 2739 if (size < 0) 2740 size = 0; 2741 } 2742 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); 2743 #endif 2744 if (base == 0 && size == PAGE_SIZE) { 2745 /* 2746 * The page can only be modified within the pmap if it is 2747 * mapped, and it can only be mapped if it was previously 2748 * fully valid. 2749 */ 2750 if (oldvalid == VM_PAGE_BITS_ALL) 2751 /* 2752 * Perform the pmap_clear_modify() first. Otherwise, 2753 * a concurrent pmap operation, such as 2754 * pmap_protect(), could clear a modification in the 2755 * pmap and set the dirty field on the page before 2756 * pmap_clear_modify() had begun and after the dirty 2757 * field was cleared here. 2758 */ 2759 pmap_clear_modify(m); 2760 m->dirty = 0; 2761 m->oflags &= ~VPO_NOSYNC; 2762 } else if (oldvalid != VM_PAGE_BITS_ALL) 2763 m->dirty &= ~pagebits; 2764 else 2765 vm_page_clear_dirty_mask(m, pagebits); 2766 } 2767 2768 void 2769 vm_page_clear_dirty(vm_page_t m, int base, int size) 2770 { 2771 2772 vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); 2773 } 2774 2775 /* 2776 * vm_page_set_invalid: 2777 * 2778 * Invalidates DEV_BSIZE'd chunks within a page. Both the 2779 * valid and dirty bits for the effected areas are cleared. 2780 * 2781 * May not block. 
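 *
 * For example (assuming DEV_BSIZE == 512), a call with base = 700 and
 * size = 100 clears the valid and dirty bits of chunk 1 only, since
 * bytes [700, 800) fall entirely within that chunk.  If the page was
 * fully valid beforehand, and therefore possibly mapped, it is first
 * removed from all pmaps.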
2782 */ 2783 void 2784 vm_page_set_invalid(vm_page_t m, int base, int size) 2785 { 2786 vm_page_bits_t bits; 2787 2788 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2789 KASSERT((m->oflags & VPO_BUSY) == 0, 2790 ("vm_page_set_invalid: page %p is busy", m)); 2791 bits = vm_page_bits(base, size); 2792 if (m->valid == VM_PAGE_BITS_ALL && bits != 0) 2793 pmap_remove_all(m); 2794 KASSERT(!pmap_page_is_mapped(m), 2795 ("vm_page_set_invalid: page %p is mapped", m)); 2796 m->valid &= ~bits; 2797 m->dirty &= ~bits; 2798 } 2799 2800 /* 2801 * vm_page_zero_invalid() 2802 * 2803 * The kernel assumes that the invalid portions of a page contain 2804 * garbage, but such pages can be mapped into memory by user code. 2805 * When this occurs, we must zero out the non-valid portions of the 2806 * page so user code sees what it expects. 2807 * 2808 * Pages are most often semi-valid when the end of a file is mapped 2809 * into memory and the file's size is not page aligned. 2810 */ 2811 void 2812 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) 2813 { 2814 int b; 2815 int i; 2816 2817 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2818 /* 2819 * Scan the valid bits looking for invalid sections that 2820 * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the 2821 * valid bit may be set ) have already been zerod by 2822 * vm_page_set_validclean(). 2823 */ 2824 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { 2825 if (i == (PAGE_SIZE / DEV_BSIZE) || 2826 (m->valid & ((vm_page_bits_t)1 << i))) { 2827 if (i > b) { 2828 pmap_zero_page_area(m, 2829 b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); 2830 } 2831 b = i + 1; 2832 } 2833 } 2834 2835 /* 2836 * setvalid is TRUE when we can safely set the zero'd areas 2837 * as being valid. We can do this if there are no cache consistancy 2838 * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. 2839 */ 2840 if (setvalid) 2841 m->valid = VM_PAGE_BITS_ALL; 2842 } 2843 2844 /* 2845 * vm_page_is_valid: 2846 * 2847 * Is (partial) page valid? Note that the case where size == 0 2848 * will return FALSE in the degenerate case where the page is 2849 * entirely invalid, and TRUE otherwise. 2850 * 2851 * May not block. 2852 */ 2853 int 2854 vm_page_is_valid(vm_page_t m, int base, int size) 2855 { 2856 vm_page_bits_t bits; 2857 2858 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2859 bits = vm_page_bits(base, size); 2860 if (m->valid && ((m->valid & bits) == bits)) 2861 return 1; 2862 else 2863 return 0; 2864 } 2865 2866 /* 2867 * update dirty bits from pmap/mmu. May not block. 2868 */ 2869 void 2870 vm_page_test_dirty(vm_page_t m) 2871 { 2872 2873 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2874 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) 2875 vm_page_dirty(m); 2876 } 2877 2878 void 2879 vm_page_lock_KBI(vm_page_t m, const char *file, int line) 2880 { 2881 2882 mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); 2883 } 2884 2885 void 2886 vm_page_unlock_KBI(vm_page_t m, const char *file, int line) 2887 { 2888 2889 mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); 2890 } 2891 2892 int 2893 vm_page_trylock_KBI(vm_page_t m, const char *file, int line) 2894 { 2895 2896 return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); 2897 } 2898 2899 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) 2900 void 2901 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) 2902 { 2903 2904 mtx_assert_(vm_page_lockptr(m), a, file, line); 2905 } 2906 #endif 2907 2908 int so_zerocp_fullpage = 0; 2909 2910 /* 2911 * Replace the given page with a copy. 
The copied page assumes 2912 * the portion of the given page's "wire_count" that is not the 2913 * responsibility of this copy-on-write mechanism. 2914 * 2915 * The object containing the given page must have a non-zero 2916 * paging-in-progress count and be locked. 2917 */ 2918 void 2919 vm_page_cowfault(vm_page_t m) 2920 { 2921 vm_page_t mnew; 2922 vm_object_t object; 2923 vm_pindex_t pindex; 2924 2925 mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED); 2926 vm_page_lock_assert(m, MA_OWNED); 2927 object = m->object; 2928 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2929 KASSERT(object->paging_in_progress != 0, 2930 ("vm_page_cowfault: object %p's paging-in-progress count is zero.", 2931 object)); 2932 pindex = m->pindex; 2933 2934 retry_alloc: 2935 pmap_remove_all(m); 2936 vm_page_remove(m); 2937 mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); 2938 if (mnew == NULL) { 2939 vm_page_insert(m, object, pindex); 2940 vm_page_unlock(m); 2941 VM_OBJECT_UNLOCK(object); 2942 VM_WAIT; 2943 VM_OBJECT_LOCK(object); 2944 if (m == vm_page_lookup(object, pindex)) { 2945 vm_page_lock(m); 2946 goto retry_alloc; 2947 } else { 2948 /* 2949 * Page disappeared during the wait. 2950 */ 2951 return; 2952 } 2953 } 2954 2955 if (m->cow == 0) { 2956 /* 2957 * check to see if we raced with an xmit complete when 2958 * waiting to allocate a page. If so, put things back 2959 * the way they were 2960 */ 2961 vm_page_unlock(m); 2962 vm_page_lock(mnew); 2963 vm_page_free(mnew); 2964 vm_page_unlock(mnew); 2965 vm_page_insert(m, object, pindex); 2966 } else { /* clear COW & copy page */ 2967 if (!so_zerocp_fullpage) 2968 pmap_copy_page(m, mnew); 2969 mnew->valid = VM_PAGE_BITS_ALL; 2970 vm_page_dirty(mnew); 2971 mnew->wire_count = m->wire_count - m->cow; 2972 m->wire_count = m->cow; 2973 vm_page_unlock(m); 2974 } 2975 } 2976 2977 void 2978 vm_page_cowclear(vm_page_t m) 2979 { 2980 2981 vm_page_lock_assert(m, MA_OWNED); 2982 if (m->cow) { 2983 m->cow--; 2984 /* 2985 * let vm_fault add back write permission lazily 2986 */ 2987 } 2988 /* 2989 * sf_buf_free() will free the page, so we needn't do it here 2990 */ 2991 } 2992 2993 int 2994 vm_page_cowsetup(vm_page_t m) 2995 { 2996 2997 vm_page_lock_assert(m, MA_OWNED); 2998 if ((m->flags & PG_FICTITIOUS) != 0 || 2999 (m->oflags & VPO_UNMANAGED) != 0 || 3000 m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object)) 3001 return (EBUSY); 3002 m->cow++; 3003 pmap_remove_write(m); 3004 VM_OBJECT_UNLOCK(m->object); 3005 return (0); 3006 } 3007 3008 #ifdef INVARIANTS 3009 void 3010 vm_page_object_lock_assert(vm_page_t m) 3011 { 3012 3013 /* 3014 * Certain of the page's fields may only be modified by the 3015 * holder of the containing object's lock or the setter of the 3016 * page's VPO_BUSY flag. Unfortunately, the setter of the 3017 * VPO_BUSY flag is not recorded, and thus cannot be checked 3018 * here. 
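 *
 * (m->valid and m->dirty are typical examples of such fields: the
 * routines above that manipulate them either assert the object lock or
 * rely on VPO_BUSY being set, as described for
 * vm_page_clear_dirty_mask().)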
3019 */ 3020 if (m->object != NULL && (m->oflags & VPO_BUSY) == 0) 3021 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 3022 } 3023 #endif 3024 3025 #include "opt_ddb.h" 3026 #ifdef DDB 3027 #include <sys/kernel.h> 3028 3029 #include <ddb/ddb.h> 3030 3031 DB_SHOW_COMMAND(page, vm_page_print_page_info) 3032 { 3033 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); 3034 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); 3035 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); 3036 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); 3037 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); 3038 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); 3039 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); 3040 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); 3041 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); 3042 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); 3043 } 3044 3045 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) 3046 { 3047 3048 db_printf("PQ_FREE:"); 3049 db_printf(" %d", cnt.v_free_count); 3050 db_printf("\n"); 3051 3052 db_printf("PQ_CACHE:"); 3053 db_printf(" %d", cnt.v_cache_count); 3054 db_printf("\n"); 3055 3056 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", 3057 *vm_page_queues[PQ_ACTIVE].cnt, 3058 *vm_page_queues[PQ_INACTIVE].cnt); 3059 } 3060 #endif /* DDB */ 3061
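/*
 * For reference, the DDB commands defined above are invoked from the
 * kernel debugger as "show page" and "show pageq"; a session might look
 * like the following (the counter values are illustrative only):
 *
 *	db> show pageq
 *	PQ_FREE: 12345
 *	PQ_CACHE: 678
 *	PQ_ACTIVE: 9012, PQ_INACTIVE: 3456
 */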