/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - paged vnode.
 *
 * This file supplies vm support for the vnode operations that deal with pages.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/rm.h>
#include <vm/pvn.h>
#include <vm/page.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/fs/swapnode.h>

int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */

static struct kmem_cache *marker_cache = NULL;

/*
 * Find the largest contiguous block which contains `addr' for file offset
 * `offset' in it while living within the file system block sizes (`vp_off'
 * and `vp_len') and the address space limits for which no pages currently
 * exist and which map to consecutive file offsets.
 */
page_t *
pvn_read_kluster(
	struct vnode *vp,
	u_offset_t off,
	struct seg *seg,
	caddr_t addr,
	u_offset_t *offp,		/* return values */
	size_t *lenp,			/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int isra)
{
	ssize_t deltaf, deltab;
	page_t *pp;
	page_t *plist = NULL;
	spgcnt_t pagesavail;
	u_offset_t vp_end;

	ASSERT(off >= vp_off && off < vp_off + vp_len);

	/*
	 * We only want to do klustering/read ahead if there
	 * are more than minfree pages currently available.
	 */
	pagesavail = freemem - minfree;

	if (pagesavail <= 0)
		if (isra)
			return ((page_t *)NULL);    /* ra case - give up */
		else
			pagesavail = 1;		    /* must return a page */

	/* We calculate in pages instead of bytes due to 32-bit overflows */
	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
		/*
		 * Don't have enough free memory for the
		 * max request, try sizing down vp request.
		 */
		deltab = (ssize_t)(off - vp_off);
		vp_len -= deltab;
		vp_off += deltab;
		if (pagesavail < btopr(vp_len)) {
			/*
			 * Still not enough memory, just settle for
			 * pagesavail which is at least 1.
			 */
			vp_len = ptob(pagesavail);
		}
	}

	vp_end = vp_off + vp_len;
	ASSERT(off >= vp_off && off < vp_end);

	if (isra && SEGOP_KLUSTER(seg, addr, 0))
		return ((page_t *)NULL);	/* segment driver says no */

	if ((plist = page_create_va(vp, off,
	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
		return ((page_t *)NULL);

	if (vp_len <= PAGESIZE || pvn_nofodklust) {
		*offp = off;
		*lenp = MIN(vp_len, PAGESIZE);
	} else {
		/*
		 * Scan back from front by incrementing "deltab" and
		 * comparing "off" with "vp_off + deltab" to avoid
		 * "signed" versus "unsigned" conversion problems.
		 */
		for (deltab = PAGESIZE; off >= vp_off + deltab;
		    deltab += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, -deltab))
				break;		/* page not eligible */
			if ((pp = page_create_va(vp, off - deltab,
			    PAGESIZE, PG_EXCL, seg, addr - deltab))
			    == NULL)
				break;		/* already have the page */
			/*
			 * Add page to front of page list.
			 */
			page_add(&plist, pp);
		}
		deltab -= PAGESIZE;

		/* scan forward from front */
		for (deltaf = PAGESIZE; off + deltaf < vp_end;
		    deltaf += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, deltaf))
				break;		/* page not file extension */
			if ((pp = page_create_va(vp, off + deltaf,
			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
			    == NULL)
				break;		/* already have page */

			/*
			 * Add page to end of page list.
			 */
			page_add(&plist, pp);
			plist = plist->p_next;
		}
		*offp = off = off - deltab;
		*lenp = deltab + deltaf;
		ASSERT(off >= vp_off);

		/*
		 * If we ended up getting more than was actually
		 * requested, retract the returned length to only
		 * reflect what was requested.  This might happen
		 * if we were allowed to kluster pages across a
		 * span of (say) 5 frags, and frag size is less
		 * than PAGESIZE.  We need a whole number of
		 * pages to contain those frags, but the returned
		 * size should only allow the returned range to
		 * extend as far as the end of the frags.
		 */
		if ((vp_off + vp_len) < (off + *lenp)) {
			ASSERT(vp_end > off);
			*lenp = vp_end - off;
		}
	}
	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
	    "pvn_read_kluster:seg %p addr %x isra %x",
	    seg, addr, isra);
	return (plist);
}

/*
 * Handle pages for this vnode on either side of the page "pp"
 * which has been locked by the caller.  This routine will also
 * do klustering in the range [vp_off, vp_off + vp_len] up
 * until a page which is not found.  The offset and length
 * of pages included is returned in "*offp" and "*lenp".
 *
 * Returns a list of dirty locked pages all ready to be
 * written back.
 */
page_t *
pvn_write_kluster(
	struct vnode *vp,
	page_t *pp,
	u_offset_t *offp,		/* return values */
	size_t *lenp,			/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int flags)
{
	u_offset_t off;
	page_t *dirty;
	size_t deltab, deltaf;
	se_t se;
	u_offset_t vp_end;

	off = pp->p_offset;

	/*
	 * Klustering should not be done if we are invalidating
	 * pages since we could destroy pages that belong to
	 * some other process if this is a swap vnode.
	 */
	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
		*offp = off;
		*lenp = PAGESIZE;
		return (pp);
	}

	if (flags & (B_FREE | B_INVAL))
		se = SE_EXCL;
	else
		se = SE_SHARED;

	dirty = pp;
	/*
	 * Scan backwards looking for pages to kluster by incrementing
	 * "deltab" and comparing "off" with "vp_off + deltab" to
	 * avoid "signed" versus "unsigned" conversion problems.
	 */
	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
		pp = page_lookup_nowait(vp, off - deltab, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
	}
	deltab -= PAGESIZE;

	vp_end = vp_off + vp_len;
	/* now scan forwards looking for pages to kluster */
	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
		pp = page_lookup_nowait(vp, off + deltaf, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
		dirty = dirty->p_next;
	}

	*offp = off - deltab;
	*lenp = deltab + deltaf;
	return (dirty);
}

/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_unlock(pp);
	}
}

/*
 * Entry point to be used by file system getpage subr's and
 * other such routines which either want to unlock pages (B_ASYNC
 * request) or destroy a list of pages if an error occurred.
 */
void
pvn_read_done(page_t *plist, int flags)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		if (flags & B_ERROR) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else {
			(void) page_release(pp, 0);
		}
	}
}

/*
 * Automagic pageout.
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 */
int write_free = 1;
pgcnt_t pages_before_pager = 200;	/* LMXXX */

/*
 * Routine to be called when page-out's complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 * after waiting for i/o to complete (biowait) to free the list of
 * pages associated with the buffer.  These pages must be locked
 * before i/o is initiated.
 *
 * If a write error occurs, the pages are marked as modified
 * so the write will be re-tried later.
 */

void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_page list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set.  We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE are set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if (flags & B_INVAL) {
			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
				 * to avoid having to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done.  B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1, if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	ASSERT((flags & (B_INVAL | B_FREE)) ?
	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add it to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock
	 * in the following cases:
	 *
	 * If B_DELWRI is set, which implies that this request is
	 * due to a klustering operation.
	 *
	 * If this is an async (B_ASYNC) operation and we are not doing
	 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 * that the page is written out].
	 */
	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
		if (!page_io_trylock(pp)) {
			page_unlock(pp);
			return (0);
		}
	} else {
		page_io_lock(pp);
	}

	/*
	 * If we want to free or invalidate the page then
	 * we need to unload it so that anyone who wants
	 * it will have to take a minor fault to get it.
	 * Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bit to
	 * detect any future modifications.  We clear the
	 * software mod bit when we put the page on the dirty
	 * list.
	 */
	if (flags & (B_INVAL | B_FREE)) {
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	} else {
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
	}

	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
		/*
		 * Don't need to add it to the
		 * list after all.
		 */
		page_io_unlock(pp);
		if (flags & B_INVAL) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
		} else {
			/*
			 * This is the advisory path for the callers
			 * of VOP_PUTPAGE() who prefer freeing the
			 * page _only_ if no one else is accessing it.
			 * E.g. segmap_release()
			 *
			 * The above hat_ismod() check is useless because:
			 * (1) we may not be holding SE_EXCL lock;
			 * (2) we've not unloaded _all_ translations
			 *
			 * Let page_release() do the heavy-lifting.
			 */
			(void) page_release(pp, 1);
		}
		return (0);
	}

	/*
	 * Page is dirty, get it ready for the write back
	 * and add page to the dirty list.
	 */
	hat_clrrefmod(pp);

	/*
	 * If we're going to free the page when we're done
	 * then we can let others try to use it starting now.
	 * We'll detect the fact that they used it when the
	 * i/o is done and avoid freeing the page.
	 */
	if (flags & B_FREE)
		page_downgrade(pp);


	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

	return (1);
}


/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
	page_t *mark = buf;
	bzero(mark, sizeof (page_t));
	return (0);
}

void
pvn_init()
{
	if (pvn_vmodsort_disable == 0)
		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
	marker_cache = kmem_cache_create("marker_cache",
	    sizeof (page_t), 0, marker_constructor,
	    NULL, NULL, NULL, NULL, 0);
}


/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list.  The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition.  If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);


	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set.  This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);


	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Couldn't get io lock, wait until IO is done.
				 * Block only for sync IO since we don't want
				 * to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, if we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it.  Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}

/*
 * Zero out zbytes worth of data.  Caller should be aware that this
 * routine may enter back into the fs layer (xxx_getpage).  Locks
 * that the xxx_getpage routine may need should not be held while
 * calling this.
 */
void
pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
{
	caddr_t addr;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return;

	/*
	 * zbytes may be zero but there still may be some portion of
	 * a page which needs clearing (since zbytes is a function
	 * of filesystem block size, not pagesize.)
	 */
	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
		return;

	/*
	 * We get the last page and handle the partial
	 * zeroing via kernel mappings.  This will make the page
	 * dirty so that we know that when this page is written
	 * back, the zeroed information will go out with it.  If
	 * the page is not currently in memory, then the kzero
	 * operation will cause it to be brought in.  We use kzero
	 * instead of bzero so that if the page cannot be read in
	 * for any reason, the system will not panic.  We need
	 * to zero out a minimum of the fs given zbytes, but we
	 * might also have to do more to get the entire last page.
	 */

	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
		panic("pvn_vptrunc zbytes");
	addr = segmap_getmapflt(segkmap, vp, vplen,
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
	(void) kzero(addr + (vplen & MAXBOFFSET),
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
}

/*
 * Handles common work of the VOP_GETPAGE routines when more than
 * one page must be returned by calling a file system specific operation
 * to do most of the work.  Must be called with the vp already locked
 * by the VOP_GETPAGE routine.
 */
int
pvn_getpages(
	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
	size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{
	page_t **ppp;
	u_offset_t o, eoff;
	size_t sz, xlen;
	int err;

	ASSERT(plsz >= len);	/* ensure that we have enough space */

	/*
	 * Loop one page at a time and let getapage function fill
	 * in the next page in array.  We only allow one page to be
	 * returned at a time (except for the last page) so that we
	 * don't have any problems with duplicates and other such
	 * painful problems.  This is a very simple minded algorithm,
	 * but it does the job correctly.  We hope that the cost of a
	 * getapage call for a resident page that we might have been
	 * able to get from an earlier call doesn't cost too much.
	 */
	ppp = pl;
	sz = PAGESIZE;
	eoff = off + len;
	xlen = len;
	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
	    xlen -= PAGESIZE) {
		if (o + PAGESIZE >= eoff) {
			/*
			 * Last time through - allow all of
			 * what's left of the pl[] array to be used.
			 */
			sz = plsz - (o - off);
		}
		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
		    rw, cred);
		if (err) {
			/*
			 * Release any pages we already got.
			 */
			if (o > off && pl != NULL) {
				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
					(void) page_release(*ppp, 1);
			}
			break;
		}
		if (pl != NULL)
			ppp++;
	}
	return (err);
}

/*
 * Initialize the page list array.
 */
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
    u_offset_t off, size_t io_len, enum seg_rw rw)
{
	ssize_t sz;
	page_t *ppcur, **ppp;

	if (plsz >= io_len) {
		/*
		 * Everything fits, set up to load
		 * all the pages.
		 */
		sz = io_len;
	} else {
		/*
		 * Set up to load plsz worth
		 * starting at the needed page.
		 */
		while (pp->p_offset != off) {
			/* XXX - Do we need this assert? */
			ASSERT(pp->p_next->p_offset !=
			    pp->p_offset);
			/*
			 * Remove page from the i/o list,
			 * release the i/o and the page lock.
			 */
			ppcur = pp;
			page_sub(&pp, ppcur);
			page_io_unlock(ppcur);
			(void) page_release(ppcur, 1);
		}
		sz = plsz;
	}

	/*
	 * Initialize the page list array.
	 */
	ppp = pl;
	do {
		ppcur = pp;
		*ppp++ = ppcur;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		if (rw != S_CREATE)
			page_downgrade(ppcur);
		sz -= PAGESIZE;
	} while (sz > 0 && pp != NULL);
	*ppp = NULL;		/* terminate list */

	/*
	 * Now free the remaining pages that weren't
	 * loaded in the page list.
	 */
	while (pp != NULL) {
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}
}
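
/*
 * Illustrative sketch (not part of the original file, and compiled out):
 * roughly how a file system's putapage routine, of the shape handed to
 * pvn_vplist_dirty(), might use pvn_write_kluster() and pvn_write_done().
 * The myfs_* names, MYFS_KLUSTSZ, and the myfs_write_pagelist() helper
 * (which in a real file system would map the kluster to disk blocks and
 * issue the i/o, e.g. via pageio_setup()/bdev_strategy()/biowait()) are
 * hypothetical placeholders, not part of any actual implementation.
 */
#ifdef	PVN_USAGE_SKETCH

#define	MYFS_KLUSTSZ	(8 * PAGESIZE)	/* hypothetical klustering window */

/* hypothetical helper that performs the actual write of the page list */
extern int myfs_write_pagelist(vnode_t *, page_t *, u_offset_t, size_t, int);

/*ARGSUSED*/
static int
myfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	u_offset_t io_off;
	size_t io_len;
	u_offset_t klust_off;
	page_t *plist;
	int err;

	/*
	 * Gather adjacent dirty pages around "pp" within the klustering
	 * window so they go out in a single i/o.  pvn_write_kluster()
	 * always returns at least "pp" itself and reports the actual
	 * offset and length of the kluster in io_off/io_len.
	 */
	klust_off = pp->p_offset & ~((u_offset_t)MYFS_KLUSTSZ - 1);
	plist = pvn_write_kluster(vp, pp, &io_off, &io_len,
	    klust_off, MYFS_KLUSTSZ, flags);

	/*
	 * Write the kluster, then let pvn_write_done() drop the i/o lock
	 * and page lock on every page in the list, freeing, invalidating,
	 * or re-dirtying the pages according to "flags" and the error.
	 */
	err = myfs_write_pagelist(vp, plist, io_off, io_len, flags);
	pvn_write_done(plist, (err ? B_ERROR : 0) | B_WRITE | flags);

	if (offp != NULL)
		*offp = io_off;
	if (lenp != NULL)
		*lenp = io_len;
	return (err);
}

#endif	/* PVN_USAGE_SKETCH */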