/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - paged vnode.
 *
 * This file supplies vm support for the vnode operations that deal with pages.
43 */ 44 #include <sys/types.h> 45 #include <sys/t_lock.h> 46 #include <sys/param.h> 47 #include <sys/sysmacros.h> 48 #include <sys/systm.h> 49 #include <sys/time.h> 50 #include <sys/buf.h> 51 #include <sys/vnode.h> 52 #include <sys/uio.h> 53 #include <sys/vmsystm.h> 54 #include <sys/mman.h> 55 #include <sys/vfs.h> 56 #include <sys/cred.h> 57 #include <sys/user.h> 58 #include <sys/kmem.h> 59 #include <sys/cmn_err.h> 60 #include <sys/debug.h> 61 #include <sys/cpuvar.h> 62 #include <sys/vtrace.h> 63 #include <sys/tnf_probe.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/seg.h> 68 #include <vm/rm.h> 69 #include <vm/pvn.h> 70 #include <vm/page.h> 71 #include <vm/seg_map.h> 72 #include <vm/seg_kmem.h> 73 #include <sys/fs/swapnode.h> 74 75 int pvn_nofodklust = 0; 76 int pvn_write_noklust = 0; 77 78 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */ 79 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */ 80 /* support for vmodsort for testing */ 81 82 static struct kmem_cache *marker_cache = NULL; 83 84 /* 85 * Find the largest contiguous block which contains `addr' for file offset 86 * `offset' in it while living within the file system block sizes (`vp_off' 87 * and `vp_len') and the address space limits for which no pages currently 88 * exist and which map to consecutive file offsets. 89 */ 90 page_t * 91 pvn_read_kluster( 92 struct vnode *vp, 93 u_offset_t off, 94 struct seg *seg, 95 caddr_t addr, 96 u_offset_t *offp, /* return values */ 97 size_t *lenp, /* return values */ 98 u_offset_t vp_off, 99 size_t vp_len, 100 int isra) 101 { 102 ssize_t deltaf, deltab; 103 page_t *pp; 104 page_t *plist = NULL; 105 spgcnt_t pagesavail; 106 u_offset_t vp_end; 107 108 ASSERT(off >= vp_off && off < vp_off + vp_len); 109 110 /* 111 * We only want to do klustering/read ahead if there 112 * is more than minfree pages currently available. 
113 */ 114 pagesavail = freemem - minfree; 115 116 if (pagesavail <= 0) 117 if (isra) 118 return ((page_t *)NULL); /* ra case - give up */ 119 else 120 pagesavail = 1; /* must return a page */ 121 122 /* We calculate in pages instead of bytes due to 32-bit overflows */ 123 if (pagesavail < (spgcnt_t)btopr(vp_len)) { 124 /* 125 * Don't have enough free memory for the 126 * max request, try sizing down vp request. 127 */ 128 deltab = (ssize_t)(off - vp_off); 129 vp_len -= deltab; 130 vp_off += deltab; 131 if (pagesavail < btopr(vp_len)) { 132 /* 133 * Still not enough memory, just settle for 134 * pagesavail which is at least 1. 135 */ 136 vp_len = ptob(pagesavail); 137 } 138 } 139 140 vp_end = vp_off + vp_len; 141 ASSERT(off >= vp_off && off < vp_end); 142 143 if (isra && SEGOP_KLUSTER(seg, addr, 0)) 144 return ((page_t *)NULL); /* segment driver says no */ 145 146 if ((plist = page_create_va(vp, off, 147 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL) 148 return ((page_t *)NULL); 149 150 if (vp_len <= PAGESIZE || pvn_nofodklust) { 151 *offp = off; 152 *lenp = MIN(vp_len, PAGESIZE); 153 } else { 154 /* 155 * Scan back from front by incrementing "deltab" and 156 * comparing "off" with "vp_off + deltab" to avoid 157 * "signed" versus "unsigned" conversion problems. 158 */ 159 for (deltab = PAGESIZE; off >= vp_off + deltab; 160 deltab += PAGESIZE) { 161 /* 162 * Call back to the segment driver to verify that 163 * the klustering/read ahead operation makes sense. 164 */ 165 if (SEGOP_KLUSTER(seg, addr, -deltab)) 166 break; /* page not eligible */ 167 if ((pp = page_create_va(vp, off - deltab, 168 PAGESIZE, PG_EXCL, seg, addr - deltab)) 169 == NULL) 170 break; /* already have the page */ 171 /* 172 * Add page to front of page list. 
173 */ 174 page_add(&plist, pp); 175 } 176 deltab -= PAGESIZE; 177 178 /* scan forward from front */ 179 for (deltaf = PAGESIZE; off + deltaf < vp_end; 180 deltaf += PAGESIZE) { 181 /* 182 * Call back to the segment driver to verify that 183 * the klustering/read ahead operation makes sense. 184 */ 185 if (SEGOP_KLUSTER(seg, addr, deltaf)) 186 break; /* page not file extension */ 187 if ((pp = page_create_va(vp, off + deltaf, 188 PAGESIZE, PG_EXCL, seg, addr + deltaf)) 189 == NULL) 190 break; /* already have page */ 191 192 /* 193 * Add page to end of page list. 194 */ 195 page_add(&plist, pp); 196 plist = plist->p_next; 197 } 198 *offp = off = off - deltab; 199 *lenp = deltab + deltaf; 200 ASSERT(off >= vp_off); 201 202 /* 203 * If we ended up getting more than was actually 204 * requested, retract the returned length to only 205 * reflect what was requested. This might happen 206 * if we were allowed to kluster pages across a 207 * span of (say) 5 frags, and frag size is less 208 * than PAGESIZE. We need a whole number of 209 * pages to contain those frags, but the returned 210 * size should only allow the returned range to 211 * extend as far as the end of the frags. 212 */ 213 if ((vp_off + vp_len) < (off + *lenp)) { 214 ASSERT(vp_end > off); 215 *lenp = vp_end - off; 216 } 217 } 218 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER, 219 "pvn_read_kluster:seg %p addr %x isra %x", 220 seg, addr, isra); 221 return (plist); 222 } 223 224 /* 225 * Handle pages for this vnode on either side of the page "pp" 226 * which has been locked by the caller. This routine will also 227 * do klustering in the range [vp_off, vp_off + vp_len] up 228 * until a page which is not found. The offset and length 229 * of pages included is returned in "*offp" and "*lenp". 230 * 231 * Returns a list of dirty locked pages all ready to be 232 * written back. 
233 */ 234 page_t * 235 pvn_write_kluster( 236 struct vnode *vp, 237 page_t *pp, 238 u_offset_t *offp, /* return values */ 239 size_t *lenp, /* return values */ 240 u_offset_t vp_off, 241 size_t vp_len, 242 int flags) 243 { 244 u_offset_t off; 245 page_t *dirty; 246 size_t deltab, deltaf; 247 se_t se; 248 u_offset_t vp_end; 249 250 off = pp->p_offset; 251 252 /* 253 * Kustering should not be done if we are invalidating 254 * pages since we could destroy pages that belong to 255 * some other process if this is a swap vnode. 256 */ 257 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) { 258 *offp = off; 259 *lenp = PAGESIZE; 260 return (pp); 261 } 262 263 if (flags & (B_FREE | B_INVAL)) 264 se = SE_EXCL; 265 else 266 se = SE_SHARED; 267 268 dirty = pp; 269 /* 270 * Scan backwards looking for pages to kluster by incrementing 271 * "deltab" and comparing "off" with "vp_off + deltab" to 272 * avoid "signed" versus "unsigned" conversion problems. 273 */ 274 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) { 275 pp = page_lookup_nowait(vp, off - deltab, se); 276 if (pp == NULL) 277 break; /* page not found */ 278 if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 279 break; 280 page_add(&dirty, pp); 281 } 282 deltab -= PAGESIZE; 283 284 vp_end = vp_off + vp_len; 285 /* now scan forwards looking for pages to kluster */ 286 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) { 287 pp = page_lookup_nowait(vp, off + deltaf, se); 288 if (pp == NULL) 289 break; /* page not found */ 290 if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 291 break; 292 page_add(&dirty, pp); 293 dirty = dirty->p_next; 294 } 295 296 *offp = off - deltab; 297 *lenp = deltab + deltaf; 298 return (dirty); 299 } 300 301 /* 302 * Generic entry point used to release the "shared/exclusive" lock 303 * and the "p_iolock" on pages after i/o is complete. 
304 */ 305 void 306 pvn_io_done(page_t *plist) 307 { 308 page_t *pp; 309 310 while (plist != NULL) { 311 pp = plist; 312 page_sub(&plist, pp); 313 page_io_unlock(pp); 314 page_unlock(pp); 315 } 316 } 317 318 /* 319 * Entry point to be used by file system getpage subr's and 320 * other such routines which either want to unlock pages (B_ASYNC 321 * request) or destroy a list of pages if an error occurred. 322 */ 323 void 324 pvn_read_done(page_t *plist, int flags) 325 { 326 page_t *pp; 327 328 while (plist != NULL) { 329 pp = plist; 330 page_sub(&plist, pp); 331 page_io_unlock(pp); 332 if (flags & B_ERROR) { 333 /*LINTED: constant in conditional context*/ 334 VN_DISPOSE(pp, B_INVAL, 0, kcred); 335 } else { 336 (void) page_release(pp, 0); 337 } 338 } 339 } 340 341 /* 342 * Automagic pageout. 343 * When memory gets tight, start freeing pages popping out of the 344 * write queue. 345 */ 346 int write_free = 1; 347 pgcnt_t pages_before_pager = 200; /* LMXXX */ 348 349 /* 350 * Routine to be called when page-out's complete. 351 * The caller, typically VOP_PUTPAGE, has to explicity call this routine 352 * after waiting for i/o to complete (biowait) to free the list of 353 * pages associated with the buffer. These pages must be locked 354 * before i/o is initiated. 355 * 356 * If a write error occurs, the pages are marked as modified 357 * so the write will be re-tried later. 358 */ 359 360 void 361 pvn_write_done(page_t *plist, int flags) 362 { 363 int dfree = 0; 364 int pgrec = 0; 365 int pgout = 0; 366 int pgpgout = 0; 367 int anonpgout = 0; 368 int anonfree = 0; 369 int fspgout = 0; 370 int fsfree = 0; 371 int execpgout = 0; 372 int execfree = 0; 373 page_t *pp; 374 struct cpu *cpup; 375 struct vnode *vp = NULL; /* for probe */ 376 uint_t ppattr; 377 kmutex_t *vphm = NULL; 378 379 ASSERT((flags & B_READ) == 0); 380 381 /* 382 * If we are about to start paging anyway, start freeing pages. 
383 */ 384 if (write_free && freemem < lotsfree + pages_before_pager && 385 (flags & B_ERROR) == 0) { 386 flags |= B_FREE; 387 } 388 389 /* 390 * Handle each page involved in the i/o operation. 391 */ 392 while (plist != NULL) { 393 pp = plist; 394 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp)); 395 page_sub(&plist, pp); 396 397 /* Kernel probe support */ 398 if (vp == NULL) 399 vp = pp->p_vnode; 400 401 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) { 402 /* 403 * Move page to the top of the v_page list. 404 * Skip pages modified during IO. 405 */ 406 vphm = page_vnode_mutex(vp); 407 mutex_enter(vphm); 408 if ((pp->p_vpnext != pp) && !hat_ismod(pp)) { 409 page_vpsub(&vp->v_pages, pp); 410 page_vpadd(&vp->v_pages, pp); 411 } 412 mutex_exit(vphm); 413 } 414 415 if (flags & B_ERROR) { 416 /* 417 * Write operation failed. We don't want 418 * to destroy (or free) the page unless B_FORCE 419 * is set. We set the mod bit again and release 420 * all locks on the page so that it will get written 421 * back again later when things are hopefully 422 * better again. 423 * If B_INVAL and B_FORCE is set we really have 424 * to destroy the page. 425 */ 426 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) { 427 page_io_unlock(pp); 428 /*LINTED: constant in conditional context*/ 429 VN_DISPOSE(pp, B_INVAL, 0, kcred); 430 } else { 431 hat_setmod_only(pp); 432 page_io_unlock(pp); 433 page_unlock(pp); 434 } 435 } else if (flags & B_INVAL) { 436 /* 437 * XXX - Failed writes with B_INVAL set are 438 * not handled appropriately. 
439 */ 440 page_io_unlock(pp); 441 /*LINTED: constant in conditional context*/ 442 VN_DISPOSE(pp, B_INVAL, 0, kcred); 443 } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) { 444 /* 445 * Update statistics for pages being paged out 446 */ 447 if (pp->p_vnode) { 448 if (IS_SWAPFSVP(pp->p_vnode)) { 449 anonpgout++; 450 } else { 451 if (pp->p_vnode->v_flag & VVMEXEC) { 452 execpgout++; 453 } else { 454 fspgout++; 455 } 456 } 457 } 458 page_io_unlock(pp); 459 pgout = 1; 460 pgpgout++; 461 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT, 462 "page_ws_out:pp %p", pp); 463 464 /* 465 * The page_struct_lock need not be acquired to 466 * examine "p_lckcnt" and "p_cowcnt" since we'll 467 * have an "exclusive" lock if the upgrade succeeds. 468 */ 469 if (page_tryupgrade(pp) && 470 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 471 /* 472 * Check if someone has reclaimed the 473 * page. If ref and mod are not set, no 474 * one is using it so we can free it. 475 * The rest of the system is careful 476 * to use the NOSYNC flag to unload 477 * translations set up for i/o w/o 478 * affecting ref and mod bits. 479 * 480 * Obtain a copy of the real hardware 481 * mod bit using hat_pagesync(pp, HAT_DONTZERO) 482 * to avoid having to flush the cache. 483 */ 484 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 485 HAT_SYNC_STOPON_MOD); 486 ck_refmod: 487 if (!(ppattr & (P_REF | P_MOD))) { 488 if (hat_page_is_mapped(pp)) { 489 /* 490 * Doesn't look like the page 491 * was modified so now we 492 * really have to unload the 493 * translations. Meanwhile 494 * another CPU could've 495 * modified it so we have to 496 * check again. We don't loop 497 * forever here because now 498 * the translations are gone 499 * and no one can get a new one 500 * since we have the "exclusive" 501 * lock on the page. 
502 */ 503 (void) hat_pageunload(pp, 504 HAT_FORCE_PGUNLOAD); 505 ppattr = hat_page_getattr(pp, 506 P_REF | P_MOD); 507 goto ck_refmod; 508 } 509 /* 510 * Update statistics for pages being 511 * freed 512 */ 513 if (pp->p_vnode) { 514 if (IS_SWAPFSVP(pp->p_vnode)) { 515 anonfree++; 516 } else { 517 if (pp->p_vnode->v_flag 518 & VVMEXEC) { 519 execfree++; 520 } else { 521 fsfree++; 522 } 523 } 524 } 525 /*LINTED: constant in conditional ctx*/ 526 VN_DISPOSE(pp, B_FREE, 527 (flags & B_DONTNEED), kcred); 528 dfree++; 529 } else { 530 page_unlock(pp); 531 pgrec++; 532 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE, 533 "page_ws_free:pp %p", pp); 534 } 535 } else { 536 /* 537 * Page is either `locked' in memory 538 * or was reclaimed and now has a 539 * "shared" lock, so release it. 540 */ 541 page_unlock(pp); 542 } 543 } else { 544 /* 545 * Neither B_FREE nor B_INVAL nor B_ERROR. 546 * Just release locks. 547 */ 548 page_io_unlock(pp); 549 page_unlock(pp); 550 } 551 } 552 553 CPU_STATS_ENTER_K(); 554 cpup = CPU; /* get cpup now that CPU cannot change */ 555 CPU_STATS_ADDQ(cpup, vm, dfree, dfree); 556 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec); 557 CPU_STATS_ADDQ(cpup, vm, pgout, pgout); 558 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout); 559 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout); 560 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree); 561 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout); 562 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree); 563 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout); 564 CPU_STATS_ADDQ(cpup, vm, execfree, execfree); 565 CPU_STATS_EXIT_K(); 566 567 /* Kernel probe */ 568 TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */, 569 tnf_opaque, vnode, vp, 570 tnf_ulong, pages_pageout, pgpgout, 571 tnf_ulong, pages_freed, dfree, 572 tnf_ulong, pages_reclaimed, pgrec); 573 } 574 575 /* 576 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, 577 * B_TRUNC, B_FORCE}. 
B_DELWRI indicates that this page is part of a kluster 578 * operation and is only to be considered if it doesn't involve any 579 * waiting here. B_TRUNC indicates that the file is being truncated 580 * and so no i/o needs to be done. B_FORCE indicates that the page 581 * must be destroyed so don't try wrting it out. 582 * 583 * The caller must ensure that the page is locked. Returns 1, if 584 * the page should be written back (the "iolock" is held in this 585 * case), or 0 if the page has been dealt with or has been 586 * unlocked. 587 */ 588 int 589 pvn_getdirty(page_t *pp, int flags) 590 { 591 ASSERT((flags & (B_INVAL | B_FREE)) ? 592 PAGE_EXCL(pp) : PAGE_SHARED(pp)); 593 ASSERT(PP_ISFREE(pp) == 0); 594 595 /* 596 * If trying to invalidate or free a logically `locked' page, 597 * forget it. Don't need page_struct_lock to check p_lckcnt and 598 * p_cowcnt as the page is exclusively locked. 599 */ 600 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && 601 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { 602 page_unlock(pp); 603 return (0); 604 } 605 606 /* 607 * Now acquire the i/o lock so we can add it to the dirty 608 * list (if necessary). We avoid blocking on the i/o lock 609 * in the following cases: 610 * 611 * If B_DELWRI is set, which implies that this request is 612 * due to a klustering operartion. 613 * 614 * If this is an async (B_ASYNC) operation and we are not doing 615 * invalidation (B_INVAL) [The current i/o or fsflush will ensure 616 * that the the page is written out]. 617 */ 618 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { 619 if (!page_io_trylock(pp)) { 620 page_unlock(pp); 621 return (0); 622 } 623 } else { 624 page_io_lock(pp); 625 } 626 627 /* 628 * If we want to free or invalidate the page then 629 * we need to unload it so that anyone who wants 630 * it will have to take a minor fault to get it. 
631 * Otherwise, we're just writing the page back so we 632 * need to sync up the hardwre and software mod bit to 633 * detect any future modifications. We clear the 634 * software mod bit when we put the page on the dirty 635 * list. 636 */ 637 if (flags & (B_INVAL | B_FREE)) { 638 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 639 } else { 640 (void) hat_pagesync(pp, HAT_SYNC_ZERORM); 641 } 642 643 if (!hat_ismod(pp) || (flags & B_TRUNC)) { 644 /* 645 * Don't need to add it to the 646 * list after all. 647 */ 648 page_io_unlock(pp); 649 if (flags & B_INVAL) { 650 /*LINTED: constant in conditional context*/ 651 VN_DISPOSE(pp, B_INVAL, 0, kcred); 652 } else if (flags & B_FREE) { 653 /*LINTED: constant in conditional context*/ 654 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); 655 } else { 656 /* 657 * This is advisory path for the callers 658 * of VOP_PUTPAGE() who prefer freeing the 659 * page _only_ if no one else is accessing it. 660 * E.g. segmap_release() 661 * 662 * The above hat_ismod() check is useless because: 663 * (1) we may not be holding SE_EXCL lock; 664 * (2) we've not unloaded _all_ translations 665 * 666 * Let page_release() do the heavy-lifting. 667 */ 668 (void) page_release(pp, 1); 669 } 670 return (0); 671 } 672 673 /* 674 * Page is dirty, get it ready for the write back 675 * and add page to the dirty list. 676 */ 677 hat_clrrefmod(pp); 678 679 /* 680 * If we're going to free the page when we're done 681 * then we can let others try to use it starting now. 682 * We'll detect the fact that they used it when the 683 * i/o is done and avoid freeing the page. 
684 */ 685 if (flags & B_FREE) 686 page_downgrade(pp); 687 688 689 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); 690 691 return (1); 692 } 693 694 695 /*ARGSUSED*/ 696 static int 697 marker_constructor(void *buf, void *cdrarg, int kmflags) 698 { 699 page_t *mark = buf; 700 bzero(mark, sizeof (page_t)); 701 mark->p_hash = PVN_VPLIST_HASH_TAG; 702 return (0); 703 } 704 705 void 706 pvn_init() 707 { 708 if (pvn_vmodsort_disable == 0) 709 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL); 710 marker_cache = kmem_cache_create("marker_cache", 711 sizeof (page_t), 0, marker_constructor, 712 NULL, NULL, NULL, NULL, 0); 713 } 714 715 716 /* 717 * Process a vnode's page list for all pages whose offset is >= off. 718 * Pages are to either be free'd, invalidated, or written back to disk. 719 * 720 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE 721 * is specified, otherwise they are "shared" locked. 722 * 723 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC} 724 * 725 * Special marker page_t's are inserted in the list in order 726 * to keep track of where we are in the list when locks are dropped. 727 * 728 * Note the list is circular and insertions can happen only at the 729 * head and tail of the list. The algorithm ensures visiting all pages 730 * on the list in the following way: 731 * 732 * Drop two marker pages at the end of the list. 733 * 734 * Move one marker page backwards towards the start of the list until 735 * it is at the list head, processing the pages passed along the way. 736 * 737 * Due to race conditions when the vphm mutex is dropped, additional pages 738 * can be added to either end of the list, so we'll continue to move 739 * the marker and process pages until it is up against the end marker. 740 * 741 * There is one special exit condition. If we are processing a VMODSORT 742 * vnode and only writing back modified pages, we can stop as soon as 743 * we run into an unmodified page. 
This makes fsync(3) operations fast. 744 */ 745 int 746 pvn_vplist_dirty( 747 vnode_t *vp, 748 u_offset_t off, 749 int (*putapage)(vnode_t *, page_t *, u_offset_t *, 750 size_t *, int, cred_t *), 751 int flags, 752 cred_t *cred) 753 { 754 page_t *pp; 755 page_t *mark; /* marker page that moves toward head */ 756 page_t *end; /* marker page at end of list */ 757 int err = 0; 758 int error; 759 kmutex_t *vphm; 760 se_t se; 761 page_t **where_to_move; 762 763 ASSERT(vp->v_type != VCHR); 764 765 if (vp->v_pages == NULL) 766 return (0); 767 768 769 /* 770 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK. 771 * 772 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync() 773 * from getting blocked while flushing pages to a dead NFS server. 774 */ 775 mutex_enter(&vp->v_lock); 776 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) { 777 mutex_exit(&vp->v_lock); 778 return (EAGAIN); 779 } 780 781 while (vp->v_flag & VVMLOCK) 782 cv_wait(&vp->v_cv, &vp->v_lock); 783 784 if (vp->v_pages == NULL) { 785 mutex_exit(&vp->v_lock); 786 return (0); 787 } 788 789 vp->v_flag |= VVMLOCK; 790 mutex_exit(&vp->v_lock); 791 792 793 /* 794 * Set up the marker pages used to walk the list 795 */ 796 end = kmem_cache_alloc(marker_cache, KM_SLEEP); 797 end->p_vnode = vp; 798 end->p_offset = (u_offset_t)-2; 799 mark = kmem_cache_alloc(marker_cache, KM_SLEEP); 800 mark->p_vnode = vp; 801 mark->p_offset = (u_offset_t)-1; 802 803 /* 804 * Grab the lock protecting the vnode's page list 805 * note that this lock is dropped at times in the loop. 806 */ 807 vphm = page_vnode_mutex(vp); 808 mutex_enter(vphm); 809 if (vp->v_pages == NULL) 810 goto leave; 811 812 /* 813 * insert the markers and loop through the list of pages 814 */ 815 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark); 816 page_vpadd(&mark->p_vpnext, end); 817 for (;;) { 818 819 /* 820 * If only doing an async write back, then we can 821 * stop as soon as we get to start of the list. 
822 */ 823 if (flags == B_ASYNC && vp->v_pages == mark) 824 break; 825 826 /* 827 * otherwise stop when we've gone through all the pages 828 */ 829 if (mark->p_vpprev == end) 830 break; 831 832 pp = mark->p_vpprev; 833 if (vp->v_pages == pp) 834 where_to_move = &vp->v_pages; 835 else 836 where_to_move = &pp->p_vpprev->p_vpnext; 837 838 ASSERT(pp->p_vnode == vp); 839 840 /* 841 * If just flushing dirty pages to disk and this vnode 842 * is using a sorted list of pages, we can stop processing 843 * as soon as we find an unmodified page. Since all the 844 * modified pages are visited first. 845 */ 846 if (IS_VMODSORT(vp) && 847 !(flags & (B_INVAL | B_FREE | B_TRUNC))) { 848 if (!hat_ismod(pp) && !page_io_locked(pp)) { 849 #ifdef DEBUG 850 /* 851 * For debug kernels examine what should be 852 * all the remaining clean pages, asserting 853 * that they are not modified. 854 */ 855 page_t *chk = pp; 856 int attr; 857 858 page_vpsub(&vp->v_pages, mark); 859 page_vpadd(where_to_move, mark); 860 do { 861 chk = chk->p_vpprev; 862 ASSERT(chk != end); 863 if (chk == mark) 864 continue; 865 attr = hat_page_getattr(chk, P_MOD | 866 P_REF); 867 if ((attr & P_MOD) == 0) 868 continue; 869 panic("v_pages list not all clean: " 870 "page_t*=%p vnode=%p off=%lx " 871 "attr=0x%x last clean page_t*=%p\n", 872 (void *)chk, (void *)chk->p_vnode, 873 (long)chk->p_offset, attr, 874 (void *)pp); 875 } while (chk != vp->v_pages); 876 #endif 877 break; 878 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) { 879 /* 880 * Couldn't get io lock, wait until IO is done. 881 * Block only for sync IO since we don't want 882 * to block async IO. 883 */ 884 mutex_exit(vphm); 885 page_io_wait(pp); 886 mutex_enter(vphm); 887 continue; 888 } 889 } 890 891 /* 892 * Skip this page if the offset is out of the desired range. 893 * Just move the marker and continue. 
894 */ 895 if (pp->p_offset < off) { 896 page_vpsub(&vp->v_pages, mark); 897 page_vpadd(where_to_move, mark); 898 continue; 899 } 900 901 /* 902 * If we are supposed to invalidate or free this 903 * page, then we need an exclusive lock. 904 */ 905 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; 906 907 /* 908 * We must acquire the page lock for all synchronous 909 * operations (invalidate, free and write). 910 */ 911 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) { 912 /* 913 * If the page_lock() drops the mutex 914 * we must retry the loop. 915 */ 916 if (!page_lock(pp, se, vphm, P_NO_RECLAIM)) 917 continue; 918 919 /* 920 * It's ok to move the marker page now. 921 */ 922 page_vpsub(&vp->v_pages, mark); 923 page_vpadd(where_to_move, mark); 924 } else { 925 926 /* 927 * update the marker page for all remaining cases 928 */ 929 page_vpsub(&vp->v_pages, mark); 930 page_vpadd(where_to_move, mark); 931 932 /* 933 * For write backs, If we can't lock the page, it's 934 * invalid or in the process of being destroyed. Skip 935 * it, assuming someone else is writing it. 936 */ 937 if (!page_trylock(pp, se)) 938 continue; 939 } 940 941 ASSERT(pp->p_vnode == vp); 942 943 /* 944 * Successfully locked the page, now figure out what to 945 * do with it. Free pages are easily dealt with, invalidate 946 * if desired or just go on to the next page. 947 */ 948 if (PP_ISFREE(pp)) { 949 if ((flags & B_INVAL) == 0) { 950 page_unlock(pp); 951 continue; 952 } 953 954 /* 955 * Invalidate (destroy) the page. 956 */ 957 mutex_exit(vphm); 958 page_destroy_free(pp); 959 mutex_enter(vphm); 960 continue; 961 } 962 963 /* 964 * pvn_getdirty() figures out what do do with a dirty page. 965 * If the page is dirty, the putapage() routine will write it 966 * and will kluster any other adjacent dirty pages it can. 967 * 968 * pvn_getdirty() and `(*putapage)' unlock the page. 
969 */ 970 mutex_exit(vphm); 971 if (pvn_getdirty(pp, flags)) { 972 error = (*putapage)(vp, pp, NULL, NULL, flags, cred); 973 if (!err) 974 err = error; 975 } 976 mutex_enter(vphm); 977 } 978 page_vpsub(&vp->v_pages, mark); 979 page_vpsub(&vp->v_pages, end); 980 981 leave: 982 /* 983 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds 984 */ 985 mutex_exit(vphm); 986 kmem_cache_free(marker_cache, mark); 987 kmem_cache_free(marker_cache, end); 988 mutex_enter(&vp->v_lock); 989 vp->v_flag &= ~VVMLOCK; 990 cv_broadcast(&vp->v_cv); 991 mutex_exit(&vp->v_lock); 992 return (err); 993 } 994 995 /* 996 * Walk the vp->v_pages list, for every page call the callback function 997 * pointed by *page_check. If page_check returns non-zero, then mark the 998 * page as modified and if VMODSORT is set, move it to the end of v_pages 999 * list. Moving makes sense only if we have at least two pages - this also 1000 * avoids having v_pages temporarily being NULL after calling page_vpsub() 1001 * if there was just one page. 1002 */ 1003 void 1004 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *)) 1005 { 1006 page_t *pp, *next, *end; 1007 kmutex_t *vphm; 1008 int shuffle; 1009 1010 vphm = page_vnode_mutex(vp); 1011 mutex_enter(vphm); 1012 1013 if (vp->v_pages == NULL) { 1014 mutex_exit(vphm); 1015 return; 1016 } 1017 1018 end = vp->v_pages->p_vpprev; 1019 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end); 1020 pp = vp->v_pages; 1021 1022 for (;;) { 1023 next = pp->p_vpnext; 1024 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) { 1025 /* 1026 * hat_setmod_only() in contrast to hat_setmod() does 1027 * not shuffle the pages and does not grab the mutex 1028 * page_vnode_mutex. Exactly what we need. 1029 */ 1030 hat_setmod_only(pp); 1031 if (shuffle) { 1032 page_vpsub(&vp->v_pages, pp); 1033 ASSERT(vp->v_pages != NULL); 1034 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, 1035 pp); 1036 } 1037 } 1038 /* Stop if we have just processed the last page. 
*/ 1039 if (pp == end) 1040 break; 1041 pp = next; 1042 } 1043 1044 mutex_exit(vphm); 1045 } 1046 1047 /* 1048 * Zero out zbytes worth of data. Caller should be aware that this 1049 * routine may enter back into the fs layer (xxx_getpage). Locks 1050 * that the xxx_getpage routine may need should not be held while 1051 * calling this. 1052 */ 1053 void 1054 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes) 1055 { 1056 caddr_t addr; 1057 1058 ASSERT(vp->v_type != VCHR); 1059 1060 if (vp->v_pages == NULL) 1061 return; 1062 1063 /* 1064 * zbytes may be zero but there still may be some portion of 1065 * a page which needs clearing (since zbytes is a function 1066 * of filesystem block size, not pagesize.) 1067 */ 1068 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0) 1069 return; 1070 1071 /* 1072 * We get the last page and handle the partial 1073 * zeroing via kernel mappings. This will make the page 1074 * dirty so that we know that when this page is written 1075 * back, the zeroed information will go out with it. If 1076 * the page is not currently in memory, then the kzero 1077 * operation will cause it to be brought it. We use kzero 1078 * instead of bzero so that if the page cannot be read in 1079 * for any reason, the system will not panic. We need 1080 * to zero out a minimum of the fs given zbytes, but we 1081 * might also have to do more to get the entire last page. 1082 */ 1083 1084 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE) 1085 panic("pvn_vptrunc zbytes"); 1086 addr = segmap_getmapflt(segkmap, vp, vplen, 1087 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE); 1088 (void) kzero(addr + (vplen & MAXBOFFSET), 1089 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET))); 1090 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC); 1091 } 1092 1093 /* 1094 * Handles common work of the VOP_GETPAGE routines by iterating page by page 1095 * calling the getpage helper for each. 
1096 */ 1097 int 1098 pvn_getpages( 1099 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[], 1100 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *), 1101 struct vnode *vp, 1102 u_offset_t off, 1103 size_t len, 1104 uint_t *protp, 1105 page_t *pl[], 1106 size_t plsz, 1107 struct seg *seg, 1108 caddr_t addr, 1109 enum seg_rw rw, 1110 struct cred *cred) 1111 { 1112 page_t **ppp; 1113 u_offset_t o, eoff; 1114 size_t sz, xlen; 1115 int err; 1116 1117 /* ensure that we have enough space */ 1118 ASSERT(pl == NULL || plsz >= len); 1119 1120 /* 1121 * Loop one page at a time and let getapage function fill 1122 * in the next page in array. We only allow one page to be 1123 * returned at a time (except for the last page) so that we 1124 * don't have any problems with duplicates and other such 1125 * painful problems. This is a very simple minded algorithm, 1126 * but it does the job correctly. We hope that the cost of a 1127 * getapage call for a resident page that we might have been 1128 * able to get from an earlier call doesn't cost too much. 1129 */ 1130 ppp = pl; 1131 sz = (pl != NULL) ? PAGESIZE : 0; 1132 eoff = off + len; 1133 xlen = len; 1134 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE, 1135 xlen -= PAGESIZE) { 1136 if (o + PAGESIZE >= eoff && pl != NULL) { 1137 /* 1138 * Last time through - allow the all of 1139 * what's left of the pl[] array to be used. 1140 */ 1141 sz = plsz - (o - off); 1142 } 1143 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr, 1144 rw, cred); 1145 if (err) { 1146 /* 1147 * Release any pages we already got. 1148 */ 1149 if (o > off && pl != NULL) { 1150 for (ppp = pl; *ppp != NULL; *ppp++ = NULL) 1151 (void) page_release(*ppp, 1); 1152 } 1153 break; 1154 } 1155 if (pl != NULL) 1156 ppp++; 1157 } 1158 return (err); 1159 } 1160 1161 /* 1162 * Initialize the page list array. 
1163 */ 1164 /*ARGSUSED*/ 1165 void 1166 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz, 1167 u_offset_t off, size_t io_len, enum seg_rw rw) 1168 { 1169 ssize_t sz; 1170 page_t *ppcur, **ppp; 1171 1172 /* 1173 * Set up to load plsz worth 1174 * starting at the needed page. 1175 */ 1176 while (pp != NULL && pp->p_offset != off) { 1177 /* 1178 * Remove page from the i/o list, 1179 * release the i/o and the page lock. 1180 */ 1181 ppcur = pp; 1182 page_sub(&pp, ppcur); 1183 page_io_unlock(ppcur); 1184 (void) page_release(ppcur, 1); 1185 } 1186 1187 if (pp == NULL) { 1188 pl[0] = NULL; 1189 return; 1190 } 1191 1192 sz = plsz; 1193 1194 /* 1195 * Initialize the page list array. 1196 */ 1197 ppp = pl; 1198 do { 1199 ppcur = pp; 1200 *ppp++ = ppcur; 1201 page_sub(&pp, ppcur); 1202 page_io_unlock(ppcur); 1203 if (rw != S_CREATE) 1204 page_downgrade(ppcur); 1205 sz -= PAGESIZE; 1206 } while (sz > 0 && pp != NULL); 1207 *ppp = NULL; /* terminate list */ 1208 1209 /* 1210 * Now free the remaining pages that weren't 1211 * loaded in the page list. 1212 */ 1213 while (pp != NULL) { 1214 ppcur = pp; 1215 page_sub(&pp, ppcur); 1216 page_io_unlock(ppcur); 1217 (void) page_release(ppcur, 1); 1218 } 1219 } 1220