1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 /* 43 * VM - paged vnode. 44 * 45 * This file supplies vm support for the vnode operations that deal with pages. 46 */ 47 #include <sys/types.h> 48 #include <sys/t_lock.h> 49 #include <sys/param.h> 50 #include <sys/sysmacros.h> 51 #include <sys/systm.h> 52 #include <sys/time.h> 53 #include <sys/buf.h> 54 #include <sys/vnode.h> 55 #include <sys/uio.h> 56 #include <sys/vmmeter.h> 57 #include <sys/vmsystm.h> 58 #include <sys/mman.h> 59 #include <sys/vfs.h> 60 #include <sys/cred.h> 61 #include <sys/user.h> 62 #include <sys/kmem.h> 63 #include <sys/cmn_err.h> 64 #include <sys/debug.h> 65 #include <sys/cpuvar.h> 66 #include <sys/vtrace.h> 67 #include <sys/tnf_probe.h> 68 69 #include <vm/hat.h> 70 #include <vm/as.h> 71 #include <vm/seg.h> 72 #include <vm/rm.h> 73 #include <vm/pvn.h> 74 #include <vm/page.h> 75 #include <vm/seg_map.h> 76 #include <vm/seg_kmem.h> 77 #include <sys/fs/swapnode.h> 78 79 int pvn_nofodklust = 0; 80 int pvn_write_noklust = 0; 81 82 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */ 83 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */ 84 /* support for vmodsort for testing */ 85 86 static struct kmem_cache *marker_cache = NULL; 87 88 /* 89 * Find the largest contiguous block which contains `addr' for file offset 90 * `offset' in it while living within the file system block sizes (`vp_off' 91 * and `vp_len') and the address space limits for which no pages currently 92 * exist and which map to consecutive file offsets. 93 */ 94 page_t * 95 pvn_read_kluster( 96 struct vnode *vp, 97 u_offset_t off, 98 struct seg *seg, 99 caddr_t addr, 100 u_offset_t *offp, /* return values */ 101 size_t *lenp, /* return values */ 102 u_offset_t vp_off, 103 size_t vp_len, 104 int isra) 105 { 106 ssize_t deltaf, deltab; 107 page_t *pp; 108 page_t *plist = NULL; 109 spgcnt_t pagesavail; 110 u_offset_t vp_end; 111 112 ASSERT(off >= vp_off && off < vp_off + vp_len); 113 114 /* 115 * We only want to do klustering/read ahead if there 116 * is more than minfree pages currently available. 117 */ 118 pagesavail = freemem - minfree; 119 120 if (pagesavail <= 0) 121 if (isra) 122 return ((page_t *)NULL); /* ra case - give up */ 123 else 124 pagesavail = 1; /* must return a page */ 125 126 /* We calculate in pages instead of bytes due to 32-bit overflows */ 127 if (pagesavail < (spgcnt_t)btopr(vp_len)) { 128 /* 129 * Don't have enough free memory for the 130 * max request, try sizing down vp request. 131 */ 132 deltab = (ssize_t)(off - vp_off); 133 vp_len -= deltab; 134 vp_off += deltab; 135 if (pagesavail < btopr(vp_len)) { 136 /* 137 * Still not enough memory, just settle for 138 * pagesavail which is at least 1. 139 */ 140 vp_len = ptob(pagesavail); 141 } 142 } 143 144 vp_end = vp_off + vp_len; 145 ASSERT(off >= vp_off && off < vp_end); 146 147 if (isra && SEGOP_KLUSTER(seg, addr, 0)) 148 return ((page_t *)NULL); /* segment driver says no */ 149 150 if ((plist = page_create_va(vp, off, 151 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL) 152 return ((page_t *)NULL); 153 154 if (vp_len <= PAGESIZE || pvn_nofodklust) { 155 *offp = off; 156 *lenp = MIN(vp_len, PAGESIZE); 157 } else { 158 /* 159 * Scan back from front by incrementing "deltab" and 160 * comparing "off" with "vp_off + deltab" to avoid 161 * "signed" versus "unsigned" conversion problems. 162 */ 163 for (deltab = PAGESIZE; off >= vp_off + deltab; 164 deltab += PAGESIZE) { 165 /* 166 * Call back to the segment driver to verify that 167 * the klustering/read ahead operation makes sense. 168 */ 169 if (SEGOP_KLUSTER(seg, addr, -deltab)) 170 break; /* page not eligible */ 171 if ((pp = page_create_va(vp, off - deltab, 172 PAGESIZE, PG_EXCL, seg, addr - deltab)) 173 == NULL) 174 break; /* already have the page */ 175 /* 176 * Add page to front of page list. 177 */ 178 page_add(&plist, pp); 179 } 180 deltab -= PAGESIZE; 181 182 /* scan forward from front */ 183 for (deltaf = PAGESIZE; off + deltaf < vp_end; 184 deltaf += PAGESIZE) { 185 /* 186 * Call back to the segment driver to verify that 187 * the klustering/read ahead operation makes sense. 188 */ 189 if (SEGOP_KLUSTER(seg, addr, deltaf)) 190 break; /* page not file extension */ 191 if ((pp = page_create_va(vp, off + deltaf, 192 PAGESIZE, PG_EXCL, seg, addr + deltaf)) 193 == NULL) 194 break; /* already have page */ 195 196 /* 197 * Add page to end of page list. 198 */ 199 page_add(&plist, pp); 200 plist = plist->p_next; 201 } 202 *offp = off = off - deltab; 203 *lenp = deltab + deltaf; 204 ASSERT(off >= vp_off); 205 206 /* 207 * If we ended up getting more than was actually 208 * requested, retract the returned length to only 209 * reflect what was requested. This might happen 210 * if we were allowed to kluster pages across a 211 * span of (say) 5 frags, and frag size is less 212 * than PAGESIZE. We need a whole number of 213 * pages to contain those frags, but the returned 214 * size should only allow the returned range to 215 * extend as far as the end of the frags. 216 */ 217 if ((vp_off + vp_len) < (off + *lenp)) { 218 ASSERT(vp_end > off); 219 *lenp = vp_end - off; 220 } 221 } 222 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER, 223 "pvn_read_kluster:seg %p addr %x isra %x", 224 seg, addr, isra); 225 return (plist); 226 } 227 228 /* 229 * Handle pages for this vnode on either side of the page "pp" 230 * which has been locked by the caller. This routine will also 231 * do klustering in the range [vp_off, vp_off + vp_len] up 232 * until a page which is not found. The offset and length 233 * of pages included is returned in "*offp" and "*lenp". 234 * 235 * Returns a list of dirty locked pages all ready to be 236 * written back. 237 */ 238 page_t * 239 pvn_write_kluster( 240 struct vnode *vp, 241 page_t *pp, 242 u_offset_t *offp, /* return values */ 243 size_t *lenp, /* return values */ 244 u_offset_t vp_off, 245 size_t vp_len, 246 int flags) 247 { 248 u_offset_t off; 249 page_t *dirty; 250 size_t deltab, deltaf; 251 se_t se; 252 u_offset_t vp_end; 253 254 off = pp->p_offset; 255 256 /* 257 * Kustering should not be done if we are invalidating 258 * pages since we could destroy pages that belong to 259 * some other process if this is a swap vnode. 260 */ 261 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) { 262 *offp = off; 263 *lenp = PAGESIZE; 264 return (pp); 265 } 266 267 if (flags & (B_FREE | B_INVAL)) 268 se = SE_EXCL; 269 else 270 se = SE_SHARED; 271 272 dirty = pp; 273 /* 274 * Scan backwards looking for pages to kluster by incrementing 275 * "deltab" and comparing "off" with "vp_off + deltab" to 276 * avoid "signed" versus "unsigned" conversion problems. 277 */ 278 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) { 279 pp = page_lookup_nowait(vp, off - deltab, se); 280 if (pp == NULL) 281 break; /* page not found */ 282 if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 283 break; 284 page_add(&dirty, pp); 285 } 286 deltab -= PAGESIZE; 287 288 vp_end = vp_off + vp_len; 289 /* now scan forwards looking for pages to kluster */ 290 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) { 291 pp = page_lookup_nowait(vp, off + deltaf, se); 292 if (pp == NULL) 293 break; /* page not found */ 294 if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 295 break; 296 page_add(&dirty, pp); 297 dirty = dirty->p_next; 298 } 299 300 *offp = off - deltab; 301 *lenp = deltab + deltaf; 302 return (dirty); 303 } 304 305 /* 306 * Generic entry point used to release the "shared/exclusive" lock 307 * and the "p_iolock" on pages after i/o is complete. 308 */ 309 void 310 pvn_io_done(page_t *plist) 311 { 312 page_t *pp; 313 314 while (plist != NULL) { 315 pp = plist; 316 page_sub(&plist, pp); 317 page_io_unlock(pp); 318 page_unlock(pp); 319 } 320 } 321 322 /* 323 * Entry point to be used by file system getpage subr's and 324 * other such routines which either want to unlock pages (B_ASYNC 325 * request) or destroy a list of pages if an error occurred. 326 */ 327 void 328 pvn_read_done(page_t *plist, int flags) 329 { 330 page_t *pp; 331 332 while (plist != NULL) { 333 pp = plist; 334 page_sub(&plist, pp); 335 page_io_unlock(pp); 336 if (flags & B_ERROR) { 337 /*LINTED: constant in conditional context*/ 338 VN_DISPOSE(pp, B_INVAL, 0, kcred); 339 } else { 340 (void) page_release(pp, 0); 341 } 342 } 343 } 344 345 /* 346 * Automagic pageout. 347 * When memory gets tight, start freeing pages popping out of the 348 * write queue. 349 */ 350 int write_free = 1; 351 pgcnt_t pages_before_pager = 200; /* LMXXX */ 352 353 /* 354 * Routine to be called when page-out's complete. 355 * The caller, typically VOP_PUTPAGE, has to explicity call this routine 356 * after waiting for i/o to complete (biowait) to free the list of 357 * pages associated with the buffer. These pages must be locked 358 * before i/o is initiated. 359 * 360 * If a write error occurs, the pages are marked as modified 361 * so the write will be re-tried later. 362 */ 363 364 void 365 pvn_write_done(page_t *plist, int flags) 366 { 367 int dfree = 0; 368 int pgrec = 0; 369 int pgout = 0; 370 int pgpgout = 0; 371 int anonpgout = 0; 372 int anonfree = 0; 373 int fspgout = 0; 374 int fsfree = 0; 375 int execpgout = 0; 376 int execfree = 0; 377 page_t *pp; 378 struct cpu *cpup; 379 struct vnode *vp = NULL; /* for probe */ 380 uint_t ppattr; 381 382 ASSERT((flags & B_READ) == 0); 383 384 /* 385 * If we are about to start paging anyway, start freeing pages. 386 */ 387 if (write_free && freemem < lotsfree + pages_before_pager && 388 (flags & B_ERROR) == 0) { 389 flags |= B_FREE; 390 } 391 392 /* 393 * Handle each page involved in the i/o operation. 394 */ 395 while (plist != NULL) { 396 pp = plist; 397 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp)); 398 page_sub(&plist, pp); 399 400 /* Kernel probe support */ 401 if (vp == NULL) 402 vp = pp->p_vnode; 403 404 if (flags & B_ERROR) { 405 /* 406 * Write operation failed. We don't want 407 * to destroy (or free) the page unless B_FORCE 408 * is set. We set the mod bit again and release 409 * all locks on the page so that it will get written 410 * back again later when things are hopefully 411 * better again. 412 * If B_INVAL and B_FORCE is set we really have 413 * to destroy the page. 414 */ 415 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) { 416 page_io_unlock(pp); 417 /*LINTED: constant in conditional context*/ 418 VN_DISPOSE(pp, B_INVAL, 0, kcred); 419 } else { 420 hat_setmod(pp); 421 page_io_unlock(pp); 422 page_unlock(pp); 423 } 424 } else if (flags & B_INVAL) { 425 /* 426 * XXX - Failed writes with B_INVAL set are 427 * not handled appropriately. 428 */ 429 page_io_unlock(pp); 430 /*LINTED: constant in conditional context*/ 431 VN_DISPOSE(pp, B_INVAL, 0, kcred); 432 } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) { 433 /* 434 * Update statistics for pages being paged out 435 */ 436 if (pp->p_vnode) { 437 if (IS_SWAPFSVP(pp->p_vnode)) { 438 anonpgout++; 439 } else { 440 if (pp->p_vnode->v_flag & VVMEXEC) { 441 execpgout++; 442 } else { 443 fspgout++; 444 } 445 } 446 } 447 page_io_unlock(pp); 448 pgout = 1; 449 pgpgout++; 450 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT, 451 "page_ws_out:pp %p", pp); 452 453 /* 454 * The page_struct_lock need not be acquired to 455 * examine "p_lckcnt" and "p_cowcnt" since we'll 456 * have an "exclusive" lock if the upgrade succeeds. 457 */ 458 if (page_tryupgrade(pp) && 459 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 460 /* 461 * Check if someone has reclaimed the 462 * page. If ref and mod are not set, no 463 * one is using it so we can free it. 464 * The rest of the system is careful 465 * to use the NOSYNC flag to unload 466 * translations set up for i/o w/o 467 * affecting ref and mod bits. 468 * 469 * Obtain a copy of the real hardware 470 * mod bit using hat_pagesync(pp, HAT_DONTZERO) 471 * to avoid having to flush the cache. 472 */ 473 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 474 HAT_SYNC_STOPON_MOD); 475 ck_refmod: 476 if (!(ppattr & (P_REF | P_MOD))) { 477 if (hat_page_is_mapped(pp)) { 478 /* 479 * Doesn't look like the page 480 * was modified so now we 481 * really have to unload the 482 * translations. Meanwhile 483 * another CPU could've 484 * modified it so we have to 485 * check again. We don't loop 486 * forever here because now 487 * the translations are gone 488 * and no one can get a new one 489 * since we have the "exclusive" 490 * lock on the page. 491 */ 492 (void) hat_pageunload(pp, 493 HAT_FORCE_PGUNLOAD); 494 ppattr = hat_page_getattr(pp, 495 P_REF | P_MOD); 496 goto ck_refmod; 497 } 498 /* 499 * Update statistics for pages being 500 * freed 501 */ 502 if (pp->p_vnode) { 503 if (IS_SWAPFSVP(pp->p_vnode)) { 504 anonfree++; 505 } else { 506 if (pp->p_vnode->v_flag 507 & VVMEXEC) { 508 execfree++; 509 } else { 510 fsfree++; 511 } 512 } 513 } 514 /*LINTED: constant in conditional ctx*/ 515 VN_DISPOSE(pp, B_FREE, 516 (flags & B_DONTNEED), kcred); 517 dfree++; 518 } else { 519 page_unlock(pp); 520 pgrec++; 521 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE, 522 "page_ws_free:pp %p", pp); 523 } 524 } else { 525 /* 526 * Page is either `locked' in memory 527 * or was reclaimed and now has a 528 * "shared" lock, so release it. 529 */ 530 page_unlock(pp); 531 } 532 } else { 533 /* 534 * Neither B_FREE nor B_INVAL nor B_ERROR. 535 * Just release locks. 536 */ 537 page_io_unlock(pp); 538 page_unlock(pp); 539 } 540 } 541 542 CPU_STATS_ENTER_K(); 543 cpup = CPU; /* get cpup now that CPU cannot change */ 544 CPU_STATS_ADDQ(cpup, vm, dfree, dfree); 545 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec); 546 CPU_STATS_ADDQ(cpup, vm, pgout, pgout); 547 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout); 548 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout); 549 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree); 550 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout); 551 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree); 552 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout); 553 CPU_STATS_ADDQ(cpup, vm, execfree, execfree); 554 CPU_STATS_EXIT_K(); 555 556 /* Kernel probe */ 557 TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */, 558 tnf_opaque, vnode, vp, 559 tnf_ulong, pages_pageout, pgpgout, 560 tnf_ulong, pages_freed, dfree, 561 tnf_ulong, pages_reclaimed, pgrec); 562 } 563 564 /* 565 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, 566 * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster 567 * operation and is only to be considered if it doesn't involve any 568 * waiting here. B_TRUNC indicates that the file is being truncated 569 * and so no i/o needs to be done. B_FORCE indicates that the page 570 * must be destroyed so don't try wrting it out. 571 * 572 * The caller must ensure that the page is locked. Returns 1, if 573 * the page should be written back (the "iolock" is held in this 574 * case), or 0 if the page has been dealt with or has been 575 * unlocked. 576 */ 577 int 578 pvn_getdirty(page_t *pp, int flags) 579 { 580 ASSERT((flags & (B_INVAL | B_FREE)) ? 581 PAGE_EXCL(pp) : PAGE_SHARED(pp)); 582 ASSERT(PP_ISFREE(pp) == 0); 583 584 /* 585 * If trying to invalidate or free a logically `locked' page, 586 * forget it. Don't need page_struct_lock to check p_lckcnt and 587 * p_cowcnt as the page is exclusively locked. 588 */ 589 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && 590 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { 591 page_unlock(pp); 592 return (0); 593 } 594 595 /* 596 * Now acquire the i/o lock so we can add it to the dirty 597 * list (if necessary). We avoid blocking on the i/o lock 598 * in the following cases: 599 * 600 * If B_DELWRI is set, which implies that this request is 601 * due to a klustering operartion. 602 * 603 * If this is an async (B_ASYNC) operation and we are not doing 604 * invalidation (B_INVAL) [The current i/o or fsflush will ensure 605 * that the the page is written out]. 606 */ 607 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { 608 if (!page_io_trylock(pp)) { 609 page_unlock(pp); 610 return (0); 611 } 612 } else { 613 page_io_lock(pp); 614 } 615 616 /* 617 * If we want to free or invalidate the page then 618 * we need to unload it so that anyone who wants 619 * it will have to take a minor fault to get it. 620 * Otherwise, we're just writing the page back so we 621 * need to sync up the hardwre and software mod bit to 622 * detect any future modifications. We clear the 623 * software mod bit when we put the page on the dirty 624 * list. 625 */ 626 if (flags & (B_INVAL | B_FREE)) { 627 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 628 } else { 629 (void) hat_pagesync(pp, HAT_SYNC_ZERORM); 630 } 631 632 if (!hat_ismod(pp) || (flags & B_TRUNC)) { 633 /* 634 * Don't need to add it to the 635 * list after all. 636 */ 637 page_io_unlock(pp); 638 if (flags & B_INVAL) { 639 /*LINTED: constant in conditional context*/ 640 VN_DISPOSE(pp, B_INVAL, 0, kcred); 641 } else if (flags & B_FREE) { 642 /*LINTED: constant in conditional context*/ 643 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); 644 } else { 645 /* 646 * This is advisory path for the callers 647 * of VOP_PUTPAGE() who prefer freeing the 648 * page _only_ if no one else is accessing it. 649 * E.g. segmap_release() 650 * 651 * The above hat_ismod() check is useless because: 652 * (1) we may not be holding SE_EXCL lock; 653 * (2) we've not unloaded _all_ translations 654 * 655 * Let page_release() do the heavy-lifting. 656 */ 657 (void) page_release(pp, 1); 658 } 659 return (0); 660 } 661 662 /* 663 * Page is dirty, get it ready for the write back 664 * and add page to the dirty list. 665 */ 666 hat_clrrefmod(pp); 667 668 /* 669 * If we're going to free the page when we're done 670 * then we can let others try to use it starting now. 671 * We'll detect the fact that they used it when the 672 * i/o is done and avoid freeing the page. 673 */ 674 if (flags & B_FREE) 675 page_downgrade(pp); 676 677 678 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); 679 680 return (1); 681 } 682 683 684 /*ARGSUSED*/ 685 static int 686 marker_constructor(void *buf, void *cdrarg, int kmflags) 687 { 688 page_t *mark = buf; 689 bzero(mark, sizeof (page_t)); 690 return (0); 691 } 692 693 void 694 pvn_init() 695 { 696 if (pvn_vmodsort_disable == 0) 697 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL); 698 marker_cache = kmem_cache_create("marker_cache", 699 sizeof (page_t), 0, marker_constructor, 700 NULL, NULL, NULL, NULL, 0); 701 } 702 703 704 /* 705 * Process a vnode's page list for all pages whose offset is >= off. 706 * Pages are to either be free'd, invalidated, or written back to disk. 707 * 708 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE 709 * is specified, otherwise they are "shared" locked. 710 * 711 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC} 712 * 713 * Special marker page_t's are inserted in the list in order 714 * to keep track of where we are in the list when locks are dropped. 715 * 716 * Note the list is circular and insertions can happen only at the 717 * head and tail of the list. The algorithm ensures visiting all pages 718 * on the list in the following way: 719 * 720 * Drop two marker pages at the end of the list. 721 * 722 * Move one marker page backwards towards the start of the list until 723 * it is at the list head, processing the pages passed along the way. 724 * 725 * Due to race conditions when the vphm mutex is dropped, additional pages 726 * can be added to either end of the list, so we'll continue to move 727 * the marker and process pages until it is up against the end marker. 728 * 729 * There is one special exit condition. If we are processing a VMODSORT 730 * vnode and only writing back modified pages, we can stop as soon as 731 * we run into an unmodified page. This makes fsync(3) operations fast. 732 */ 733 int 734 pvn_vplist_dirty( 735 vnode_t *vp, 736 u_offset_t off, 737 int (*putapage)(vnode_t *, page_t *, u_offset_t *, 738 size_t *, int, cred_t *), 739 int flags, 740 cred_t *cred) 741 { 742 page_t *pp; 743 page_t *mark; /* marker page that moves toward head */ 744 page_t *end; /* marker page at end of list */ 745 int err = 0; 746 int error; 747 kmutex_t *vphm; 748 se_t se; 749 page_t **where_to_move; 750 751 ASSERT(vp->v_type != VCHR); 752 753 if (vp->v_pages == NULL) 754 return (0); 755 756 757 /* 758 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK. 759 * 760 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync() 761 * from getting blocked while flushing pages to a dead NFS server. 762 */ 763 mutex_enter(&vp->v_lock); 764 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) { 765 mutex_exit(&vp->v_lock); 766 return (EAGAIN); 767 } 768 769 while (vp->v_flag & VVMLOCK) 770 cv_wait(&vp->v_cv, &vp->v_lock); 771 772 if (vp->v_pages == NULL) { 773 mutex_exit(&vp->v_lock); 774 return (0); 775 } 776 777 vp->v_flag |= VVMLOCK; 778 mutex_exit(&vp->v_lock); 779 780 781 /* 782 * Set up the marker pages used to walk the list 783 */ 784 end = kmem_cache_alloc(marker_cache, KM_SLEEP); 785 end->p_vnode = vp; 786 end->p_offset = (u_offset_t)-2; 787 mark = kmem_cache_alloc(marker_cache, KM_SLEEP); 788 mark->p_vnode = vp; 789 mark->p_offset = (u_offset_t)-1; 790 791 /* 792 * Grab the lock protecting the vnode's page list 793 * note that this lock is dropped at times in the loop. 794 */ 795 vphm = page_vnode_mutex(vp); 796 mutex_enter(vphm); 797 if (vp->v_pages == NULL) 798 goto leave; 799 800 /* 801 * insert the markers and loop through the list of pages 802 */ 803 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark); 804 page_vpadd(&mark->p_vpnext, end); 805 for (;;) { 806 807 /* 808 * If only doing an async write back, then we can 809 * stop as soon as we get to start of the list. 810 */ 811 if (flags == B_ASYNC && vp->v_pages == mark) 812 break; 813 814 /* 815 * otherwise stop when we've gone through all the pages 816 */ 817 if (mark->p_vpprev == end) 818 break; 819 820 pp = mark->p_vpprev; 821 if (vp->v_pages == pp) 822 where_to_move = &vp->v_pages; 823 else 824 where_to_move = &pp->p_vpprev->p_vpnext; 825 826 ASSERT(pp->p_vnode == vp); 827 828 /* 829 * Skip this page if the offset is out of the desired range. 830 * Just move the marker and continue. 831 */ 832 if (pp->p_offset < off) { 833 page_vpsub(&vp->v_pages, mark); 834 page_vpadd(where_to_move, mark); 835 continue; 836 } 837 838 /* 839 * If just flushing dirty pages to disk and this vnode 840 * is using a sorted list of pages, we can stop processing 841 * as soon as we find an unmodified page. Since all the 842 * modified pages are visited first. 843 */ 844 if (IS_VMODSORT(vp) && 845 !(flags & (B_INVAL | B_FREE | B_TRUNC)) && 846 !hat_ismod(pp)) { 847 #ifdef DEBUG 848 /* 849 * For debug kernels examine what should be all the 850 * remaining clean pages, asserting that they are 851 * not modified. 852 */ 853 page_t *chk = pp; 854 int attr; 855 856 page_vpsub(&vp->v_pages, mark); 857 page_vpadd(where_to_move, mark); 858 do { 859 chk = chk->p_vpprev; 860 ASSERT(chk != end); 861 if (chk == mark) 862 continue; 863 attr = hat_page_getattr(chk, P_MOD | P_REF); 864 if ((attr & P_MOD) == 0) 865 continue; 866 panic("v_pages list not all clean: " 867 "page_t*=%p vnode=%p off=%lx " 868 "attr=0x%x last clean page_t*=%p\n", 869 (void *)chk, (void *)chk->p_vnode, 870 (long)chk->p_offset, attr, (void *)pp); 871 } while (chk != vp->v_pages); 872 #endif 873 break; 874 } 875 876 /* 877 * If we are supposed to invalidate or free this 878 * page, then we need an exclusive lock. 879 */ 880 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; 881 882 /* 883 * We must acquire the page lock for all synchronous 884 * operations (invalidate, free and write). 885 */ 886 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) { 887 /* 888 * If the page_lock() drops the mutex 889 * we must retry the loop. 890 */ 891 if (!page_lock(pp, se, vphm, P_NO_RECLAIM)) 892 continue; 893 894 /* 895 * It's ok to move the marker page now. 896 */ 897 page_vpsub(&vp->v_pages, mark); 898 page_vpadd(where_to_move, mark); 899 } else { 900 901 /* 902 * update the marker page for all remaining cases 903 */ 904 page_vpsub(&vp->v_pages, mark); 905 page_vpadd(where_to_move, mark); 906 907 /* 908 * For write backs, If we can't lock the page, it's 909 * invalid or in the process of being destroyed. Skip 910 * it, assuming someone else is writing it. 911 */ 912 if (!page_trylock(pp, se)) 913 continue; 914 } 915 916 ASSERT(pp->p_vnode == vp); 917 918 /* 919 * Successfully locked the page, now figure out what to 920 * do with it. Free pages are easily dealt with, invalidate 921 * if desired or just go on to the next page. 922 */ 923 if (PP_ISFREE(pp)) { 924 if ((flags & B_INVAL) == 0) { 925 page_unlock(pp); 926 continue; 927 } 928 929 /* 930 * Invalidate (destroy) the page. 931 */ 932 mutex_exit(vphm); 933 page_destroy_free(pp); 934 mutex_enter(vphm); 935 continue; 936 } 937 938 /* 939 * pvn_getdirty() figures out what do do with a dirty page. 940 * If the page is dirty, the putapage() routine will write it 941 * and will kluster any other adjacent dirty pages it can. 942 * 943 * pvn_getdirty() and `(*putapage)' unlock the page. 944 */ 945 mutex_exit(vphm); 946 if (pvn_getdirty(pp, flags)) { 947 error = (*putapage)(vp, pp, NULL, NULL, flags, cred); 948 if (!err) 949 err = error; 950 } 951 mutex_enter(vphm); 952 } 953 page_vpsub(&vp->v_pages, mark); 954 page_vpsub(&vp->v_pages, end); 955 956 leave: 957 /* 958 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds 959 */ 960 mutex_exit(vphm); 961 kmem_cache_free(marker_cache, mark); 962 kmem_cache_free(marker_cache, end); 963 mutex_enter(&vp->v_lock); 964 vp->v_flag &= ~VVMLOCK; 965 cv_broadcast(&vp->v_cv); 966 mutex_exit(&vp->v_lock); 967 return (err); 968 } 969 970 /* 971 * Zero out zbytes worth of data. Caller should be aware that this 972 * routine may enter back into the fs layer (xxx_getpage). Locks 973 * that the xxx_getpage routine may need should not be held while 974 * calling this. 975 */ 976 void 977 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes) 978 { 979 caddr_t addr; 980 981 ASSERT(vp->v_type != VCHR); 982 983 if (vp->v_pages == NULL) 984 return; 985 986 /* 987 * zbytes may be zero but there still may be some portion of 988 * a page which needs clearing (since zbytes is a function 989 * of filesystem block size, not pagesize.) 990 */ 991 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0) 992 return; 993 994 /* 995 * We get the last page and handle the partial 996 * zeroing via kernel mappings. This will make the page 997 * dirty so that we know that when this page is written 998 * back, the zeroed information will go out with it. If 999 * the page is not currently in memory, then the kzero 1000 * operation will cause it to be brought it. We use kzero 1001 * instead of bzero so that if the page cannot be read in 1002 * for any reason, the system will not panic. We need 1003 * to zero out a minimum of the fs given zbytes, but we 1004 * might also have to do more to get the entire last page. 1005 */ 1006 1007 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE) 1008 panic("pvn_vptrunc zbytes"); 1009 addr = segmap_getmapflt(segkmap, vp, vplen, 1010 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE); 1011 (void) kzero(addr + (vplen & MAXBOFFSET), 1012 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET))); 1013 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC); 1014 } 1015 1016 /* 1017 * Handles common work of the VOP_GETPAGE routines when more than 1018 * one page must be returned by calling a file system specific operation 1019 * to do most of the work. Must be called with the vp already locked 1020 * by the VOP_GETPAGE routine. 1021 */ 1022 int 1023 pvn_getpages( 1024 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[], 1025 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *), 1026 struct vnode *vp, 1027 u_offset_t off, 1028 size_t len, 1029 uint_t *protp, 1030 page_t *pl[], 1031 size_t plsz, 1032 struct seg *seg, 1033 caddr_t addr, 1034 enum seg_rw rw, 1035 struct cred *cred) 1036 { 1037 page_t **ppp; 1038 u_offset_t o, eoff; 1039 size_t sz, xlen; 1040 int err; 1041 1042 ASSERT(plsz >= len); /* insure that we have enough space */ 1043 1044 /* 1045 * Loop one page at a time and let getapage function fill 1046 * in the next page in array. We only allow one page to be 1047 * returned at a time (except for the last page) so that we 1048 * don't have any problems with duplicates and other such 1049 * painful problems. This is a very simple minded algorithm, 1050 * but it does the job correctly. We hope that the cost of a 1051 * getapage call for a resident page that we might have been 1052 * able to get from an earlier call doesn't cost too much. 1053 */ 1054 ppp = pl; 1055 sz = PAGESIZE; 1056 eoff = off + len; 1057 xlen = len; 1058 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE, 1059 xlen -= PAGESIZE) { 1060 if (o + PAGESIZE >= eoff) { 1061 /* 1062 * Last time through - allow the all of 1063 * what's left of the pl[] array to be used. 1064 */ 1065 sz = plsz - (o - off); 1066 } 1067 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr, 1068 rw, cred); 1069 if (err) { 1070 /* 1071 * Release any pages we already got. 1072 */ 1073 if (o > off && pl != NULL) { 1074 for (ppp = pl; *ppp != NULL; *ppp++ = NULL) 1075 (void) page_release(*ppp, 1); 1076 } 1077 break; 1078 } 1079 if (pl != NULL) 1080 ppp++; 1081 } 1082 return (err); 1083 } 1084 1085 /* 1086 * Initialize the page list array. 1087 */ 1088 void 1089 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz, 1090 u_offset_t off, size_t io_len, enum seg_rw rw) 1091 { 1092 ssize_t sz; 1093 page_t *ppcur, **ppp; 1094 1095 if (plsz >= io_len) { 1096 /* 1097 * Everything fits, set up to load 1098 * all the pages. 1099 */ 1100 sz = io_len; 1101 } else { 1102 /* 1103 * Set up to load plsz worth 1104 * starting at the needed page. 1105 */ 1106 while (pp->p_offset != off) { 1107 /* XXX - Do we need this assert? */ 1108 ASSERT(pp->p_next->p_offset != 1109 pp->p_offset); 1110 /* 1111 * Remove page from the i/o list, 1112 * release the i/o and the page lock. 1113 */ 1114 ppcur = pp; 1115 page_sub(&pp, ppcur); 1116 page_io_unlock(ppcur); 1117 (void) page_release(ppcur, 1); 1118 } 1119 sz = plsz; 1120 } 1121 1122 /* 1123 * Initialize the page list array. 1124 */ 1125 ppp = pl; 1126 do { 1127 ppcur = pp; 1128 *ppp++ = ppcur; 1129 page_sub(&pp, ppcur); 1130 page_io_unlock(ppcur); 1131 if (rw != S_CREATE) 1132 page_downgrade(ppcur); 1133 sz -= PAGESIZE; 1134 } while (sz > 0 && pp != NULL); 1135 *ppp = NULL; /* terminate list */ 1136 1137 /* 1138 * Now free the remaining pages that weren't 1139 * loaded in the page list. 1140 */ 1141 while (pp != NULL) { 1142 ppcur = pp; 1143 page_sub(&pp, ppcur); 1144 page_io_unlock(ppcur); 1145 (void) page_release(ppcur, 1); 1146 } 1147 } 1148