/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - paged vnode.
 *
 * This file supplies vm support for the vnode operations that deal with pages.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/rm.h>
#include <vm/pvn.h>
#include <vm/page.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/fs/swapnode.h>

int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */
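/*
 * As an illustrative example, VMODSORT support can be disabled for
 * testing by adding the following line to /etc/system before boot:
 *
 *	set pvn_vmodsort_disable = 1
 */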
static struct kmem_cache *marker_cache = NULL;

/*
 * Find the largest contiguous block which contains `addr' for file offset
 * `off' in it while living within the file system block sizes (`vp_off'
 * and `vp_len') and the address space limits for which no pages currently
 * exist and which map to consecutive file offsets.
 */
page_t *
pvn_read_kluster(
	struct vnode *vp,
	u_offset_t off,
	struct seg *seg,
	caddr_t addr,
	u_offset_t *offp,		/* return values */
	size_t *lenp,			/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int isra)
{
	ssize_t deltaf, deltab;
	page_t *pp;
	page_t *plist = NULL;
	spgcnt_t pagesavail;
	u_offset_t vp_end;

	ASSERT(off >= vp_off && off < vp_off + vp_len);

	/*
	 * We only want to do klustering/read ahead if there
	 * are more than minfree pages currently available.
	 */
	pagesavail = freemem - minfree;

	if (pagesavail <= 0)
		if (isra)
			return ((page_t *)NULL);    /* ra case - give up */
		else
			pagesavail = 1;		    /* must return a page */

	/* We calculate in pages instead of bytes due to 32-bit overflows */
	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
		/*
		 * Don't have enough free memory for the
		 * max request, try sizing down vp request.
		 */
		deltab = (ssize_t)(off - vp_off);
		vp_len -= deltab;
		vp_off += deltab;
		if (pagesavail < btopr(vp_len)) {
			/*
			 * Still not enough memory, just settle for
			 * pagesavail which is at least 1.
			 */
			vp_len = ptob(pagesavail);
		}
	}

	vp_end = vp_off + vp_len;
	ASSERT(off >= vp_off && off < vp_end);

	if (isra && SEGOP_KLUSTER(seg, addr, 0))
		return ((page_t *)NULL);	/* segment driver says no */

	if ((plist = page_create_va(vp, off,
	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
		return ((page_t *)NULL);

	if (vp_len <= PAGESIZE || pvn_nofodklust) {
		*offp = off;
		*lenp = MIN(vp_len, PAGESIZE);
	} else {
		/*
		 * Scan back from front by incrementing "deltab" and
		 * comparing "off" with "vp_off + deltab" to avoid
		 * "signed" versus "unsigned" conversion problems.
		 */
		for (deltab = PAGESIZE; off >= vp_off + deltab;
		    deltab += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, -deltab))
				break;		/* page not eligible */
			if ((pp = page_create_va(vp, off - deltab,
			    PAGESIZE, PG_EXCL, seg, addr - deltab)) == NULL)
				break;		/* already have the page */
			/*
			 * Add page to front of page list.
			 */
			page_add(&plist, pp);
		}
		deltab -= PAGESIZE;

		/* scan forward from front */
		for (deltaf = PAGESIZE; off + deltaf < vp_end;
		    deltaf += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, deltaf))
				break;		/* page not file extension */
			if ((pp = page_create_va(vp, off + deltaf,
			    PAGESIZE, PG_EXCL, seg, addr + deltaf)) == NULL)
				break;		/* already have page */

			/*
			 * Add page to end of page list.
			 */
			page_add(&plist, pp);
			plist = plist->p_next;
		}
		*offp = off = off - deltab;
		*lenp = deltab + deltaf;
		ASSERT(off >= vp_off);

		/*
		 * If we ended up getting more than was actually
		 * requested, retract the returned length to only
		 * reflect what was requested.  This might happen
		 * if we were allowed to kluster pages across a
		 * span of (say) 5 frags, and frag size is less
		 * than PAGESIZE.  We need a whole number of
		 * pages to contain those frags, but the returned
		 * size should only allow the returned range to
		 * extend as far as the end of the frags.
		 */
		if ((vp_off + vp_len) < (off + *lenp)) {
			ASSERT(vp_end > off);
			*lenp = vp_end - off;
		}
	}
	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
	    "pvn_read_kluster:seg %p addr %x isra %x", seg, addr, isra);
	return (plist);
}
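/*
 * A minimal usage sketch (illustrative only, not taken from any real file
 * system): a getpage routine typically asks pvn_read_kluster() for a page
 * list and then reads the returned [io_off, io_off + io_len) range with a
 * single i/o.  fs_bmap_range() and fs_start_read() are hypothetical names.
 *
 *	u_offset_t io_off, blkoff;
 *	size_t io_len, blklen;
 *	page_t *pl;
 *
 *	fs_bmap_range(vp, off, &blkoff, &blklen);
 *	pl = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
 *	    blkoff, blklen, 0);
 *	if (pl != NULL)
 *		err = fs_start_read(vp, pl, io_off, io_len);
 */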
/*
 * Handle pages for this vnode on either side of the page "pp"
 * which has been locked by the caller.  This routine will also
 * do klustering in the range [vp_off, vp_off + vp_len] up
 * until a page which is not found.  The offset and length
 * of pages included are returned in "*offp" and "*lenp".
 *
 * Returns a list of dirty locked pages all ready to be
 * written back.
 */
page_t *
pvn_write_kluster(
	struct vnode *vp,
	page_t *pp,
	u_offset_t *offp,		/* return values */
	size_t *lenp,			/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int flags)
{
	u_offset_t off;
	page_t *dirty;
	size_t deltab, deltaf;
	se_t se;
	u_offset_t vp_end;

	off = pp->p_offset;

	/*
	 * Klustering should not be done if we are invalidating
	 * pages since we could destroy pages that belong to
	 * some other process if this is a swap vnode.
	 */
	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
		*offp = off;
		*lenp = PAGESIZE;
		return (pp);
	}

	if (flags & (B_FREE | B_INVAL))
		se = SE_EXCL;
	else
		se = SE_SHARED;

	dirty = pp;
	/*
	 * Scan backwards looking for pages to kluster by incrementing
	 * "deltab" and comparing "off" with "vp_off + deltab" to
	 * avoid "signed" versus "unsigned" conversion problems.
	 */
	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
		pp = page_lookup_nowait(vp, off - deltab, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
	}
	deltab -= PAGESIZE;

	vp_end = vp_off + vp_len;
	/* now scan forwards looking for pages to kluster */
	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
		pp = page_lookup_nowait(vp, off + deltaf, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
		dirty = dirty->p_next;
	}

	*offp = off - deltab;
	*lenp = deltab + deltaf;
	return (dirty);
}

/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_unlock(pp);
	}
}

/*
 * Entry point to be used by file system getpage subr's and
 * other such routines which either want to unlock pages (B_ASYNC
 * request) or destroy a list of pages if an error occurred.
 */
void
pvn_read_done(page_t *plist, int flags)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		if (flags & B_ERROR) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else {
			(void) page_release(pp, 0);
		}
	}
}
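/*
 * A minimal putapage sketch (illustrative only): a file system putapage
 * routine typically klusters around the dirty page it was handed, issues
 * one write for the returned range and, once the i/o has completed, hands
 * the list to pvn_write_done() below.  fs_start_write() is hypothetical
 * and stands in for the pageio_setup()/strategy/biowait sequence a real
 * file system would use.
 *
 *	u_offset_t io_off;
 *	size_t io_len;
 *	page_t *dirty;
 *
 *	dirty = pvn_write_kluster(vp, pp, &io_off, &io_len,
 *	    blkoff, blklen, flags);
 *	err = fs_start_write(vp, dirty, io_off, io_len, flags);
 *	if ((flags & B_ASYNC) == 0)
 *		pvn_write_done(dirty, (err ? B_ERROR : 0) | flags);
 */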
/*
 * Automagic pageout.
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 */
int write_free = 1;
pgcnt_t pages_before_pager = 200;	/* LMXXX */

/*
 * Routine to be called when page-outs complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 * after waiting for i/o to complete (biowait) to free the list of
 * pages associated with the buffer.  These pages must be locked
 * before i/o is initiated.
 *
 * If a write error occurs, the pages are marked as modified
 * so the write will be retried later.
 */
void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set.  We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE are set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using
				 * hat_pagesync(pp, HAT_SYNC_DONTZERO)
				 * to avoid having to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;	/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque, vnode, vp,
	    tnf_ulong, pages_pageout, pgpgout,
	    tnf_ulong, pages_freed, dfree,
	    tnf_ulong, pages_reclaimed, pgrec);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done.  B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1 if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	ASSERT((flags & (B_INVAL | B_FREE)) ?
	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add it to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock
	 * in the following cases:
	 *
	 * If B_DELWRI is set, which implies that this request is
	 * due to a klustering operation.
	 *
	 * If this is an async (B_ASYNC) operation and we are not doing
	 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 * that the page is written out].
	 */
	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
		if (!page_io_trylock(pp)) {
			page_unlock(pp);
			return (0);
		}
	} else {
		page_io_lock(pp);
	}

	/*
	 * If we want to free or invalidate the page then
	 * we need to unload it so that anyone who wants
	 * it will have to take a minor fault to get it.
	 * Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bit to
	 * detect any future modifications.  We clear the
	 * software mod bit when we put the page on the dirty
	 * list.
	 */
	if (flags & (B_INVAL | B_FREE)) {
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	} else {
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
	}

	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
		/*
		 * Don't need to add it to the
		 * list after all.
		 */
		page_io_unlock(pp);
		if (flags & B_INVAL) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
		} else {
			/*
			 * This is the advisory path for the callers
			 * of VOP_PUTPAGE() who prefer freeing the
			 * page _only_ if no one else is accessing it.
			 * E.g. segmap_release()
			 *
			 * The above hat_ismod() check is useless because:
			 * (1) we may not be holding SE_EXCL lock;
			 * (2) we've not unloaded _all_ translations
			 *
			 * Let page_release() do the heavy-lifting.
			 */
			(void) page_release(pp, 1);
		}
		return (0);
	}

	/*
	 * Page is dirty, get it ready for the write back
	 * and add page to the dirty list.
	 */
	hat_clrrefmod(pp);

	/*
	 * If we're going to free the page when we're done
	 * then we can let others try to use it starting now.
	 * We'll detect the fact that they used it when the
	 * i/o is done and avoid freeing the page.
	 */
	if (flags & B_FREE)
		page_downgrade(pp);


	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

	return (1);
}
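/*
 * A minimal sketch of the single-page write-back path (illustrative only):
 * after looking up and locking one page, a file system asks pvn_getdirty()
 * whether the page actually needs writing.  fs_putapage() is hypothetical,
 * and SE_SHARED is only appropriate here because no B_INVAL/B_FREE is
 * requested (those require an exclusive lock, as asserted above).
 *
 *	pp = page_lookup(vp, off, SE_SHARED);
 *	if (pp != NULL && pvn_getdirty(pp, flags))
 *		err = fs_putapage(vp, pp, &io_off, &io_len, flags, cr);
 *
 * pvn_getdirty() either returns 0 (page already dealt with and unlocked)
 * or returns 1 with the i/o lock held, as described above.
 */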
/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
	page_t *mark = buf;
	bzero(mark, sizeof (page_t));
	mark->p_hash = PVN_VPLIST_HASH_TAG;
	return (0);
}

void
pvn_init()
{
	if (pvn_vmodsort_disable == 0)
		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
	marker_cache = kmem_cache_create("marker_cache",
	    sizeof (page_t), 0, marker_constructor,
	    NULL, NULL, NULL, NULL, 0);
}


/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list.  The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 * Drop two marker pages at the end of the list.
 *
 * Move one marker page backwards towards the start of the list until
 * it is at the list head, processing the pages passed along the way.
 *
 * Due to race conditions when the vphm mutex is dropped, additional pages
 * can be added to either end of the list, so we'll continue to move
 * the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition.  If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
	vnode_t *vp,
	u_offset_t off,
	int (*putapage)(vnode_t *, page_t *, u_offset_t *,
	    size_t *, int, cred_t *),
	int flags,
	cred_t *cred)
{
	page_t *pp;
	page_t *mark;		/* marker page that moves toward head */
	page_t *end;		/* marker page at end of list */
	int err = 0;
	int error;
	kmutex_t *vphm;
	se_t se;
	page_t **where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);


	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set.  This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);


	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list;
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t *chk = pp;
				int attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get io lock, wait until IO is done.
				 * Block only for sync IO since we don't want
				 * to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, if we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it.  Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release v_pages mutex, also VVMLOCK and wakeup blocked threads
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}

/*
 * Walk the vp->v_pages list, for every page call the callback function
 * pointed by *page_check.  If page_check returns non-zero, then mark the
 * page as modified and if VMODSORT is set, move it to the end of v_pages
 * list.  Moving makes sense only if we have at least two pages - this also
 * avoids having v_pages temporarily being NULL after calling page_vpsub()
 * if there was just one page.
 */
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
	page_t *pp, *next, *end;
	kmutex_t *vphm;
	int shuffle;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	if (vp->v_pages == NULL) {
		mutex_exit(vphm);
		return;
	}

	end = vp->v_pages->p_vpprev;
	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
	pp = vp->v_pages;

	for (;;) {
		next = pp->p_vpnext;
		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
			/*
			 * hat_setmod_only() in contrast to hat_setmod() does
			 * not shuffle the pages and does not grab the mutex
			 * page_vnode_mutex.  Exactly what we need.
			 */
			hat_setmod_only(pp);
			if (shuffle) {
				page_vpsub(&vp->v_pages, pp);
				ASSERT(vp->v_pages != NULL);
				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
				    pp);
			}
		}
		/* Stop if we have just processed the last page. */
		if (pp == end)
			break;
		pp = next;
	}

	mutex_exit(vphm);
}
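/*
 * A minimal sketch of how a file system putpage entry point might use
 * pvn_vplist_dirty() for the "whole file" case (illustrative only;
 * fs_putapage is the file system's own putapage callback):
 *
 *	if (len == 0)
 *		err = pvn_vplist_dirty(vp, off, fs_putapage, flags, cr);
 *
 * A bounded-range request would instead typically walk [off, off + len)
 * a page at a time, locking each page and using pvn_getdirty() before
 * calling fs_putapage directly.
 */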
/*
 * Zero out zbytes worth of data.  Caller should be aware that this
 * routine may enter back into the fs layer (xxx_getpage).  Locks
 * that the xxx_getpage routine may need should not be held while
 * calling this.
 */
void
pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
{
	caddr_t addr;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return;

	/*
	 * zbytes may be zero but there still may be some portion of
	 * a page which needs clearing (since zbytes is a function
	 * of filesystem block size, not pagesize.)
	 */
	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
		return;

	/*
	 * We get the last page and handle the partial
	 * zeroing via kernel mappings.  This will make the page
	 * dirty so that we know that when this page is written
	 * back, the zeroed information will go out with it.  If
	 * the page is not currently in memory, then the kzero
	 * operation will cause it to be brought in.  We use kzero
	 * instead of bzero so that if the page cannot be read in
	 * for any reason, the system will not panic.  We need
	 * to zero out a minimum of the fs-given zbytes, but we
	 * might also have to do more to get the entire last page.
	 */

	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
		panic("pvn_vptrunc zbytes");
	addr = segmap_getmapflt(segkmap, vp, vplen,
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
	(void) kzero(addr + (vplen & MAXBOFFSET),
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
}

/*
 * Handles common work of the VOP_GETPAGE routines when more than
 * one page must be returned by calling a file system specific operation
 * to do most of the work.  Must be called with the vp already locked
 * by the VOP_GETPAGE routine.
 */
int
pvn_getpages(
	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
	    size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{
	page_t **ppp;
	u_offset_t o, eoff;
	size_t sz, xlen;
	int err;

	ASSERT(plsz >= len);	/* ensure that we have enough space */

	/*
	 * Loop one page at a time and let getapage function fill
	 * in the next page in array.  We only allow one page to be
	 * returned at a time (except for the last page) so that we
	 * don't have any problems with duplicates and other such
	 * painful problems.  This is a very simple minded algorithm,
	 * but it does the job correctly.  We hope that the cost of a
	 * getapage call for a resident page that we might have been
	 * able to get from an earlier call doesn't cost too much.
	 */
	ppp = pl;
	sz = PAGESIZE;
	eoff = off + len;
	xlen = len;
	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
	    xlen -= PAGESIZE) {
		if (o + PAGESIZE >= eoff) {
			/*
			 * Last time through - allow all of
			 * what's left of the pl[] array to be used.
			 */
			sz = plsz - (o - off);
		}
		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
		    rw, cred);
		if (err) {
			/*
			 * Release any pages we already got.
			 */
			if (o > off && pl != NULL) {
				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
					(void) page_release(*ppp, 1);
			}
			break;
		}
		if (pl != NULL)
			ppp++;
	}
	return (err);
}
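/*
 * A minimal sketch of the intended call graph (illustrative only): a file
 * system's VOP_GETPAGE entry point hands multi-page requests to
 * pvn_getpages(), passing its own single-page routine; that routine in
 * turn typically reads the pages and uses pvn_plist_init() (below) to
 * fill in the caller's pl[] array.  fs_getapage is hypothetical.
 *
 *	if (len <= PAGESIZE)
 *		err = fs_getapage(vp, off, len, protp, pl, plsz,
 *		    seg, addr, rw, cr);
 *	else
 *		err = pvn_getpages(fs_getapage, vp, off, len, protp,
 *		    pl, plsz, seg, addr, rw, cr);
 */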
/*
 * Initialize the page list array.
 */
/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
    u_offset_t off, size_t io_len, enum seg_rw rw)
{
	ssize_t sz;
	page_t *ppcur, **ppp;

	/*
	 * Set up to load plsz worth
	 * starting at the needed page.
	 */
	while (pp != NULL && pp->p_offset != off) {
		/*
		 * Remove page from the i/o list,
		 * release the i/o and the page lock.
		 */
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}

	if (pp == NULL) {
		pl[0] = NULL;
		return;
	}

	sz = plsz;

	/*
	 * Initialize the page list array.
	 */
	ppp = pl;
	do {
		ppcur = pp;
		*ppp++ = ppcur;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		if (rw != S_CREATE)
			page_downgrade(ppcur);
		sz -= PAGESIZE;
	} while (sz > 0 && pp != NULL);
	*ppp = NULL;		/* terminate list */

	/*
	 * Now free the remaining pages that weren't
	 * loaded in the page list.
	 */
	while (pp != NULL) {
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}
}