17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5a71e32b6Sstans * Common Development and Distribution License (the "License"). 6a71e32b6Sstans * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate /* 2244c4f64bSJohn Levon * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23*ca411232SJosef 'Jeff' Sipek * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 247c478bd9Sstevel@tonic-gate */ 257c478bd9Sstevel@tonic-gate 267c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 277c478bd9Sstevel@tonic-gate /* All Rights Reserved */ 287c478bd9Sstevel@tonic-gate 297c478bd9Sstevel@tonic-gate /* 307c478bd9Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988 317c478bd9Sstevel@tonic-gate * The Regents of the University of California 327c478bd9Sstevel@tonic-gate * All Rights Reserved 337c478bd9Sstevel@tonic-gate * 347c478bd9Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from 357c478bd9Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its 367c478bd9Sstevel@tonic-gate * contributors. 377c478bd9Sstevel@tonic-gate */ 387c478bd9Sstevel@tonic-gate 397c478bd9Sstevel@tonic-gate /* 407c478bd9Sstevel@tonic-gate * VM - paged vnode. 417c478bd9Sstevel@tonic-gate * 427c478bd9Sstevel@tonic-gate * This file supplies vm support for the vnode operations that deal with pages. 437c478bd9Sstevel@tonic-gate */ 447c478bd9Sstevel@tonic-gate #include <sys/types.h> 457c478bd9Sstevel@tonic-gate #include <sys/t_lock.h> 467c478bd9Sstevel@tonic-gate #include <sys/param.h> 477c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 487c478bd9Sstevel@tonic-gate #include <sys/systm.h> 497c478bd9Sstevel@tonic-gate #include <sys/time.h> 507c478bd9Sstevel@tonic-gate #include <sys/buf.h> 517c478bd9Sstevel@tonic-gate #include <sys/vnode.h> 527c478bd9Sstevel@tonic-gate #include <sys/uio.h> 537c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h> 547c478bd9Sstevel@tonic-gate #include <sys/mman.h> 557c478bd9Sstevel@tonic-gate #include <sys/vfs.h> 567c478bd9Sstevel@tonic-gate #include <sys/cred.h> 577c478bd9Sstevel@tonic-gate #include <sys/user.h> 587c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 597c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 607c478bd9Sstevel@tonic-gate #include <sys/debug.h> 617c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 627c478bd9Sstevel@tonic-gate #include <sys/vtrace.h> 637c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h> 647c478bd9Sstevel@tonic-gate 657c478bd9Sstevel@tonic-gate #include <vm/hat.h> 667c478bd9Sstevel@tonic-gate #include <vm/as.h> 677c478bd9Sstevel@tonic-gate #include <vm/seg.h> 687c478bd9Sstevel@tonic-gate #include <vm/rm.h> 697c478bd9Sstevel@tonic-gate #include <vm/pvn.h> 707c478bd9Sstevel@tonic-gate #include <vm/page.h> 717c478bd9Sstevel@tonic-gate #include <vm/seg_map.h> 727c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 737c478bd9Sstevel@tonic-gate #include <sys/fs/swapnode.h> 747c478bd9Sstevel@tonic-gate 757c478bd9Sstevel@tonic-gate int pvn_nofodklust = 0; 767c478bd9Sstevel@tonic-gate int pvn_write_noklust = 0; 777c478bd9Sstevel@tonic-gate 787c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */ 797c478bd9Sstevel@tonic-gate uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */ 807c478bd9Sstevel@tonic-gate /* support for vmodsort for testing */ 817c478bd9Sstevel@tonic-gate 827c478bd9Sstevel@tonic-gate static struct kmem_cache *marker_cache = NULL; 837c478bd9Sstevel@tonic-gate 847c478bd9Sstevel@tonic-gate /* 857c478bd9Sstevel@tonic-gate * Find the largest contiguous block which contains `addr' for file offset 867c478bd9Sstevel@tonic-gate * `offset' in it while living within the file system block sizes (`vp_off' 877c478bd9Sstevel@tonic-gate * and `vp_len') and the address space limits for which no pages currently 887c478bd9Sstevel@tonic-gate * exist and which map to consecutive file offsets. 897c478bd9Sstevel@tonic-gate */ 907c478bd9Sstevel@tonic-gate page_t * 917c478bd9Sstevel@tonic-gate pvn_read_kluster( 927c478bd9Sstevel@tonic-gate struct vnode *vp, 937c478bd9Sstevel@tonic-gate u_offset_t off, 947c478bd9Sstevel@tonic-gate struct seg *seg, 957c478bd9Sstevel@tonic-gate caddr_t addr, 967c478bd9Sstevel@tonic-gate u_offset_t *offp, /* return values */ 977c478bd9Sstevel@tonic-gate size_t *lenp, /* return values */ 987c478bd9Sstevel@tonic-gate u_offset_t vp_off, 997c478bd9Sstevel@tonic-gate size_t vp_len, 1007c478bd9Sstevel@tonic-gate int isra) 1017c478bd9Sstevel@tonic-gate { 1027c478bd9Sstevel@tonic-gate ssize_t deltaf, deltab; 1037c478bd9Sstevel@tonic-gate page_t *pp; 1047c478bd9Sstevel@tonic-gate page_t *plist = NULL; 1057c478bd9Sstevel@tonic-gate spgcnt_t pagesavail; 1067c478bd9Sstevel@tonic-gate u_offset_t vp_end; 1077c478bd9Sstevel@tonic-gate 1087c478bd9Sstevel@tonic-gate ASSERT(off >= vp_off && off < vp_off + vp_len); 1097c478bd9Sstevel@tonic-gate 1107c478bd9Sstevel@tonic-gate /* 1117c478bd9Sstevel@tonic-gate * We only want to do klustering/read ahead if there 1127c478bd9Sstevel@tonic-gate * is more than minfree pages currently available. 1137c478bd9Sstevel@tonic-gate */ 1147c478bd9Sstevel@tonic-gate pagesavail = freemem - minfree; 1157c478bd9Sstevel@tonic-gate 1167c478bd9Sstevel@tonic-gate if (pagesavail <= 0) 1177c478bd9Sstevel@tonic-gate if (isra) 1187c478bd9Sstevel@tonic-gate return ((page_t *)NULL); /* ra case - give up */ 1197c478bd9Sstevel@tonic-gate else 1207c478bd9Sstevel@tonic-gate pagesavail = 1; /* must return a page */ 1217c478bd9Sstevel@tonic-gate 1227c478bd9Sstevel@tonic-gate /* We calculate in pages instead of bytes due to 32-bit overflows */ 1237c478bd9Sstevel@tonic-gate if (pagesavail < (spgcnt_t)btopr(vp_len)) { 1247c478bd9Sstevel@tonic-gate /* 1257c478bd9Sstevel@tonic-gate * Don't have enough free memory for the 1267c478bd9Sstevel@tonic-gate * max request, try sizing down vp request. 1277c478bd9Sstevel@tonic-gate */ 1287c478bd9Sstevel@tonic-gate deltab = (ssize_t)(off - vp_off); 1297c478bd9Sstevel@tonic-gate vp_len -= deltab; 1307c478bd9Sstevel@tonic-gate vp_off += deltab; 1317c478bd9Sstevel@tonic-gate if (pagesavail < btopr(vp_len)) { 1327c478bd9Sstevel@tonic-gate /* 1337c478bd9Sstevel@tonic-gate * Still not enough memory, just settle for 1347c478bd9Sstevel@tonic-gate * pagesavail which is at least 1. 1357c478bd9Sstevel@tonic-gate */ 1367c478bd9Sstevel@tonic-gate vp_len = ptob(pagesavail); 1377c478bd9Sstevel@tonic-gate } 1387c478bd9Sstevel@tonic-gate } 1397c478bd9Sstevel@tonic-gate 1407c478bd9Sstevel@tonic-gate vp_end = vp_off + vp_len; 1417c478bd9Sstevel@tonic-gate ASSERT(off >= vp_off && off < vp_end); 1427c478bd9Sstevel@tonic-gate 1437c478bd9Sstevel@tonic-gate if (isra && SEGOP_KLUSTER(seg, addr, 0)) 1447c478bd9Sstevel@tonic-gate return ((page_t *)NULL); /* segment driver says no */ 1457c478bd9Sstevel@tonic-gate 1467c478bd9Sstevel@tonic-gate if ((plist = page_create_va(vp, off, 1477c478bd9Sstevel@tonic-gate PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL) 1487c478bd9Sstevel@tonic-gate return ((page_t *)NULL); 1497c478bd9Sstevel@tonic-gate 1507c478bd9Sstevel@tonic-gate if (vp_len <= PAGESIZE || pvn_nofodklust) { 1517c478bd9Sstevel@tonic-gate *offp = off; 1527c478bd9Sstevel@tonic-gate *lenp = MIN(vp_len, PAGESIZE); 1537c478bd9Sstevel@tonic-gate } else { 1547c478bd9Sstevel@tonic-gate /* 1557c478bd9Sstevel@tonic-gate * Scan back from front by incrementing "deltab" and 1567c478bd9Sstevel@tonic-gate * comparing "off" with "vp_off + deltab" to avoid 1577c478bd9Sstevel@tonic-gate * "signed" versus "unsigned" conversion problems. 1587c478bd9Sstevel@tonic-gate */ 1597c478bd9Sstevel@tonic-gate for (deltab = PAGESIZE; off >= vp_off + deltab; 1607c478bd9Sstevel@tonic-gate deltab += PAGESIZE) { 1617c478bd9Sstevel@tonic-gate /* 1627c478bd9Sstevel@tonic-gate * Call back to the segment driver to verify that 1637c478bd9Sstevel@tonic-gate * the klustering/read ahead operation makes sense. 1647c478bd9Sstevel@tonic-gate */ 1657c478bd9Sstevel@tonic-gate if (SEGOP_KLUSTER(seg, addr, -deltab)) 1667c478bd9Sstevel@tonic-gate break; /* page not eligible */ 1677c478bd9Sstevel@tonic-gate if ((pp = page_create_va(vp, off - deltab, 1687c478bd9Sstevel@tonic-gate PAGESIZE, PG_EXCL, seg, addr - deltab)) 1697c478bd9Sstevel@tonic-gate == NULL) 1707c478bd9Sstevel@tonic-gate break; /* already have the page */ 1717c478bd9Sstevel@tonic-gate /* 1727c478bd9Sstevel@tonic-gate * Add page to front of page list. 1737c478bd9Sstevel@tonic-gate */ 1747c478bd9Sstevel@tonic-gate page_add(&plist, pp); 1757c478bd9Sstevel@tonic-gate } 1767c478bd9Sstevel@tonic-gate deltab -= PAGESIZE; 1777c478bd9Sstevel@tonic-gate 1787c478bd9Sstevel@tonic-gate /* scan forward from front */ 1797c478bd9Sstevel@tonic-gate for (deltaf = PAGESIZE; off + deltaf < vp_end; 1807c478bd9Sstevel@tonic-gate deltaf += PAGESIZE) { 1817c478bd9Sstevel@tonic-gate /* 1827c478bd9Sstevel@tonic-gate * Call back to the segment driver to verify that 1837c478bd9Sstevel@tonic-gate * the klustering/read ahead operation makes sense. 1847c478bd9Sstevel@tonic-gate */ 1857c478bd9Sstevel@tonic-gate if (SEGOP_KLUSTER(seg, addr, deltaf)) 1867c478bd9Sstevel@tonic-gate break; /* page not file extension */ 1877c478bd9Sstevel@tonic-gate if ((pp = page_create_va(vp, off + deltaf, 1887c478bd9Sstevel@tonic-gate PAGESIZE, PG_EXCL, seg, addr + deltaf)) 1897c478bd9Sstevel@tonic-gate == NULL) 1907c478bd9Sstevel@tonic-gate break; /* already have page */ 1917c478bd9Sstevel@tonic-gate 1927c478bd9Sstevel@tonic-gate /* 1937c478bd9Sstevel@tonic-gate * Add page to end of page list. 1947c478bd9Sstevel@tonic-gate */ 1957c478bd9Sstevel@tonic-gate page_add(&plist, pp); 1967c478bd9Sstevel@tonic-gate plist = plist->p_next; 1977c478bd9Sstevel@tonic-gate } 1987c478bd9Sstevel@tonic-gate *offp = off = off - deltab; 1997c478bd9Sstevel@tonic-gate *lenp = deltab + deltaf; 2007c478bd9Sstevel@tonic-gate ASSERT(off >= vp_off); 2017c478bd9Sstevel@tonic-gate 2027c478bd9Sstevel@tonic-gate /* 2037c478bd9Sstevel@tonic-gate * If we ended up getting more than was actually 2047c478bd9Sstevel@tonic-gate * requested, retract the returned length to only 2057c478bd9Sstevel@tonic-gate * reflect what was requested. This might happen 2067c478bd9Sstevel@tonic-gate * if we were allowed to kluster pages across a 2077c478bd9Sstevel@tonic-gate * span of (say) 5 frags, and frag size is less 2087c478bd9Sstevel@tonic-gate * than PAGESIZE. We need a whole number of 2097c478bd9Sstevel@tonic-gate * pages to contain those frags, but the returned 2107c478bd9Sstevel@tonic-gate * size should only allow the returned range to 2117c478bd9Sstevel@tonic-gate * extend as far as the end of the frags. 2127c478bd9Sstevel@tonic-gate */ 2137c478bd9Sstevel@tonic-gate if ((vp_off + vp_len) < (off + *lenp)) { 2147c478bd9Sstevel@tonic-gate ASSERT(vp_end > off); 2157c478bd9Sstevel@tonic-gate *lenp = vp_end - off; 2167c478bd9Sstevel@tonic-gate } 2177c478bd9Sstevel@tonic-gate } 2187c478bd9Sstevel@tonic-gate TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER, 2197c478bd9Sstevel@tonic-gate "pvn_read_kluster:seg %p addr %x isra %x", 2207c478bd9Sstevel@tonic-gate seg, addr, isra); 2217c478bd9Sstevel@tonic-gate return (plist); 2227c478bd9Sstevel@tonic-gate } 2237c478bd9Sstevel@tonic-gate 2247c478bd9Sstevel@tonic-gate /* 2257c478bd9Sstevel@tonic-gate * Handle pages for this vnode on either side of the page "pp" 2267c478bd9Sstevel@tonic-gate * which has been locked by the caller. This routine will also 2277c478bd9Sstevel@tonic-gate * do klustering in the range [vp_off, vp_off + vp_len] up 2287c478bd9Sstevel@tonic-gate * until a page which is not found. The offset and length 2297c478bd9Sstevel@tonic-gate * of pages included is returned in "*offp" and "*lenp". 2307c478bd9Sstevel@tonic-gate * 2317c478bd9Sstevel@tonic-gate * Returns a list of dirty locked pages all ready to be 2327c478bd9Sstevel@tonic-gate * written back. 2337c478bd9Sstevel@tonic-gate */ 2347c478bd9Sstevel@tonic-gate page_t * 2357c478bd9Sstevel@tonic-gate pvn_write_kluster( 2367c478bd9Sstevel@tonic-gate struct vnode *vp, 2377c478bd9Sstevel@tonic-gate page_t *pp, 2387c478bd9Sstevel@tonic-gate u_offset_t *offp, /* return values */ 2397c478bd9Sstevel@tonic-gate size_t *lenp, /* return values */ 2407c478bd9Sstevel@tonic-gate u_offset_t vp_off, 2417c478bd9Sstevel@tonic-gate size_t vp_len, 2427c478bd9Sstevel@tonic-gate int flags) 2437c478bd9Sstevel@tonic-gate { 2447c478bd9Sstevel@tonic-gate u_offset_t off; 2457c478bd9Sstevel@tonic-gate page_t *dirty; 2467c478bd9Sstevel@tonic-gate size_t deltab, deltaf; 2477c478bd9Sstevel@tonic-gate se_t se; 2487c478bd9Sstevel@tonic-gate u_offset_t vp_end; 2497c478bd9Sstevel@tonic-gate 2507c478bd9Sstevel@tonic-gate off = pp->p_offset; 2517c478bd9Sstevel@tonic-gate 2527c478bd9Sstevel@tonic-gate /* 2537c478bd9Sstevel@tonic-gate * Kustering should not be done if we are invalidating 2547c478bd9Sstevel@tonic-gate * pages since we could destroy pages that belong to 2557c478bd9Sstevel@tonic-gate * some other process if this is a swap vnode. 2567c478bd9Sstevel@tonic-gate */ 2577c478bd9Sstevel@tonic-gate if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) { 2587c478bd9Sstevel@tonic-gate *offp = off; 2597c478bd9Sstevel@tonic-gate *lenp = PAGESIZE; 2607c478bd9Sstevel@tonic-gate return (pp); 2617c478bd9Sstevel@tonic-gate } 2627c478bd9Sstevel@tonic-gate 2637c478bd9Sstevel@tonic-gate if (flags & (B_FREE | B_INVAL)) 2647c478bd9Sstevel@tonic-gate se = SE_EXCL; 2657c478bd9Sstevel@tonic-gate else 2667c478bd9Sstevel@tonic-gate se = SE_SHARED; 2677c478bd9Sstevel@tonic-gate 2687c478bd9Sstevel@tonic-gate dirty = pp; 2697c478bd9Sstevel@tonic-gate /* 2707c478bd9Sstevel@tonic-gate * Scan backwards looking for pages to kluster by incrementing 2717c478bd9Sstevel@tonic-gate * "deltab" and comparing "off" with "vp_off + deltab" to 2727c478bd9Sstevel@tonic-gate * avoid "signed" versus "unsigned" conversion problems. 2737c478bd9Sstevel@tonic-gate */ 2747c478bd9Sstevel@tonic-gate for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) { 2757c478bd9Sstevel@tonic-gate pp = page_lookup_nowait(vp, off - deltab, se); 2767c478bd9Sstevel@tonic-gate if (pp == NULL) 2777c478bd9Sstevel@tonic-gate break; /* page not found */ 2787c478bd9Sstevel@tonic-gate if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 2797c478bd9Sstevel@tonic-gate break; 2807c478bd9Sstevel@tonic-gate page_add(&dirty, pp); 2817c478bd9Sstevel@tonic-gate } 2827c478bd9Sstevel@tonic-gate deltab -= PAGESIZE; 2837c478bd9Sstevel@tonic-gate 2847c478bd9Sstevel@tonic-gate vp_end = vp_off + vp_len; 2857c478bd9Sstevel@tonic-gate /* now scan forwards looking for pages to kluster */ 2867c478bd9Sstevel@tonic-gate for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) { 2877c478bd9Sstevel@tonic-gate pp = page_lookup_nowait(vp, off + deltaf, se); 2887c478bd9Sstevel@tonic-gate if (pp == NULL) 2897c478bd9Sstevel@tonic-gate break; /* page not found */ 2907c478bd9Sstevel@tonic-gate if (pvn_getdirty(pp, flags | B_DELWRI) == 0) 2917c478bd9Sstevel@tonic-gate break; 2927c478bd9Sstevel@tonic-gate page_add(&dirty, pp); 2937c478bd9Sstevel@tonic-gate dirty = dirty->p_next; 2947c478bd9Sstevel@tonic-gate } 2957c478bd9Sstevel@tonic-gate 2967c478bd9Sstevel@tonic-gate *offp = off - deltab; 2977c478bd9Sstevel@tonic-gate *lenp = deltab + deltaf; 2987c478bd9Sstevel@tonic-gate return (dirty); 2997c478bd9Sstevel@tonic-gate } 3007c478bd9Sstevel@tonic-gate 3017c478bd9Sstevel@tonic-gate /* 3027c478bd9Sstevel@tonic-gate * Generic entry point used to release the "shared/exclusive" lock 3037c478bd9Sstevel@tonic-gate * and the "p_iolock" on pages after i/o is complete. 3047c478bd9Sstevel@tonic-gate */ 3057c478bd9Sstevel@tonic-gate void 3067c478bd9Sstevel@tonic-gate pvn_io_done(page_t *plist) 3077c478bd9Sstevel@tonic-gate { 3087c478bd9Sstevel@tonic-gate page_t *pp; 3097c478bd9Sstevel@tonic-gate 3107c478bd9Sstevel@tonic-gate while (plist != NULL) { 3117c478bd9Sstevel@tonic-gate pp = plist; 3127c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 3137c478bd9Sstevel@tonic-gate page_io_unlock(pp); 3147c478bd9Sstevel@tonic-gate page_unlock(pp); 3157c478bd9Sstevel@tonic-gate } 3167c478bd9Sstevel@tonic-gate } 3177c478bd9Sstevel@tonic-gate 3187c478bd9Sstevel@tonic-gate /* 3197c478bd9Sstevel@tonic-gate * Entry point to be used by file system getpage subr's and 3207c478bd9Sstevel@tonic-gate * other such routines which either want to unlock pages (B_ASYNC 3217c478bd9Sstevel@tonic-gate * request) or destroy a list of pages if an error occurred. 3227c478bd9Sstevel@tonic-gate */ 3237c478bd9Sstevel@tonic-gate void 3247c478bd9Sstevel@tonic-gate pvn_read_done(page_t *plist, int flags) 3257c478bd9Sstevel@tonic-gate { 3267c478bd9Sstevel@tonic-gate page_t *pp; 3277c478bd9Sstevel@tonic-gate 3287c478bd9Sstevel@tonic-gate while (plist != NULL) { 3297c478bd9Sstevel@tonic-gate pp = plist; 3307c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 3317c478bd9Sstevel@tonic-gate page_io_unlock(pp); 3327c478bd9Sstevel@tonic-gate if (flags & B_ERROR) { 3337c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 3347c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 3357c478bd9Sstevel@tonic-gate } else { 3367c478bd9Sstevel@tonic-gate (void) page_release(pp, 0); 3377c478bd9Sstevel@tonic-gate } 3387c478bd9Sstevel@tonic-gate } 3397c478bd9Sstevel@tonic-gate } 3407c478bd9Sstevel@tonic-gate 3417c478bd9Sstevel@tonic-gate /* 3427c478bd9Sstevel@tonic-gate * Automagic pageout. 3437c478bd9Sstevel@tonic-gate * When memory gets tight, start freeing pages popping out of the 3447c478bd9Sstevel@tonic-gate * write queue. 3457c478bd9Sstevel@tonic-gate */ 3467c478bd9Sstevel@tonic-gate int write_free = 1; 3477c478bd9Sstevel@tonic-gate pgcnt_t pages_before_pager = 200; /* LMXXX */ 3487c478bd9Sstevel@tonic-gate 3497c478bd9Sstevel@tonic-gate /* 3507c478bd9Sstevel@tonic-gate * Routine to be called when page-out's complete. 3517c478bd9Sstevel@tonic-gate * The caller, typically VOP_PUTPAGE, has to explicity call this routine 3527c478bd9Sstevel@tonic-gate * after waiting for i/o to complete (biowait) to free the list of 3537c478bd9Sstevel@tonic-gate * pages associated with the buffer. These pages must be locked 3547c478bd9Sstevel@tonic-gate * before i/o is initiated. 3557c478bd9Sstevel@tonic-gate * 3567c478bd9Sstevel@tonic-gate * If a write error occurs, the pages are marked as modified 3577c478bd9Sstevel@tonic-gate * so the write will be re-tried later. 3587c478bd9Sstevel@tonic-gate */ 3597c478bd9Sstevel@tonic-gate 3607c478bd9Sstevel@tonic-gate void 3617c478bd9Sstevel@tonic-gate pvn_write_done(page_t *plist, int flags) 3627c478bd9Sstevel@tonic-gate { 3637c478bd9Sstevel@tonic-gate int dfree = 0; 3647c478bd9Sstevel@tonic-gate int pgrec = 0; 3657c478bd9Sstevel@tonic-gate int pgout = 0; 3667c478bd9Sstevel@tonic-gate int pgpgout = 0; 3677c478bd9Sstevel@tonic-gate int anonpgout = 0; 3687c478bd9Sstevel@tonic-gate int anonfree = 0; 3697c478bd9Sstevel@tonic-gate int fspgout = 0; 3707c478bd9Sstevel@tonic-gate int fsfree = 0; 3717c478bd9Sstevel@tonic-gate int execpgout = 0; 3727c478bd9Sstevel@tonic-gate int execfree = 0; 3737c478bd9Sstevel@tonic-gate page_t *pp; 3747c478bd9Sstevel@tonic-gate struct cpu *cpup; 3757c478bd9Sstevel@tonic-gate struct vnode *vp = NULL; /* for probe */ 3767c478bd9Sstevel@tonic-gate uint_t ppattr; 377a71e32b6Sstans kmutex_t *vphm = NULL; 3787c478bd9Sstevel@tonic-gate 3797c478bd9Sstevel@tonic-gate ASSERT((flags & B_READ) == 0); 3807c478bd9Sstevel@tonic-gate 3817c478bd9Sstevel@tonic-gate /* 3827c478bd9Sstevel@tonic-gate * If we are about to start paging anyway, start freeing pages. 3837c478bd9Sstevel@tonic-gate */ 3847c478bd9Sstevel@tonic-gate if (write_free && freemem < lotsfree + pages_before_pager && 3857c478bd9Sstevel@tonic-gate (flags & B_ERROR) == 0) { 3867c478bd9Sstevel@tonic-gate flags |= B_FREE; 3877c478bd9Sstevel@tonic-gate } 3887c478bd9Sstevel@tonic-gate 3897c478bd9Sstevel@tonic-gate /* 3907c478bd9Sstevel@tonic-gate * Handle each page involved in the i/o operation. 3917c478bd9Sstevel@tonic-gate */ 3927c478bd9Sstevel@tonic-gate while (plist != NULL) { 3937c478bd9Sstevel@tonic-gate pp = plist; 3947c478bd9Sstevel@tonic-gate ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp)); 3957c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 3967c478bd9Sstevel@tonic-gate 3977c478bd9Sstevel@tonic-gate /* Kernel probe support */ 3987c478bd9Sstevel@tonic-gate if (vp == NULL) 3997c478bd9Sstevel@tonic-gate vp = pp->p_vnode; 4007c478bd9Sstevel@tonic-gate 401d9615970Sqiao if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) { 402a71e32b6Sstans /* 403a71e32b6Sstans * Move page to the top of the v_page list. 404a71e32b6Sstans * Skip pages modified during IO. 405a71e32b6Sstans */ 406a71e32b6Sstans vphm = page_vnode_mutex(vp); 407a71e32b6Sstans mutex_enter(vphm); 408a71e32b6Sstans if ((pp->p_vpnext != pp) && !hat_ismod(pp)) { 409a71e32b6Sstans page_vpsub(&vp->v_pages, pp); 410a71e32b6Sstans page_vpadd(&vp->v_pages, pp); 411a71e32b6Sstans } 412a71e32b6Sstans mutex_exit(vphm); 413a71e32b6Sstans } 414a71e32b6Sstans 4157c478bd9Sstevel@tonic-gate if (flags & B_ERROR) { 4167c478bd9Sstevel@tonic-gate /* 4177c478bd9Sstevel@tonic-gate * Write operation failed. We don't want 4187c478bd9Sstevel@tonic-gate * to destroy (or free) the page unless B_FORCE 4197c478bd9Sstevel@tonic-gate * is set. We set the mod bit again and release 4207c478bd9Sstevel@tonic-gate * all locks on the page so that it will get written 4217c478bd9Sstevel@tonic-gate * back again later when things are hopefully 4227c478bd9Sstevel@tonic-gate * better again. 4237c478bd9Sstevel@tonic-gate * If B_INVAL and B_FORCE is set we really have 4247c478bd9Sstevel@tonic-gate * to destroy the page. 4257c478bd9Sstevel@tonic-gate */ 4267c478bd9Sstevel@tonic-gate if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) { 4277c478bd9Sstevel@tonic-gate page_io_unlock(pp); 4287c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 4297c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 4307c478bd9Sstevel@tonic-gate } else { 431d9615970Sqiao hat_setmod_only(pp); 4327c478bd9Sstevel@tonic-gate page_io_unlock(pp); 4337c478bd9Sstevel@tonic-gate page_unlock(pp); 4347c478bd9Sstevel@tonic-gate } 4357c478bd9Sstevel@tonic-gate } else if (flags & B_INVAL) { 4367c478bd9Sstevel@tonic-gate /* 4377c478bd9Sstevel@tonic-gate * XXX - Failed writes with B_INVAL set are 4387c478bd9Sstevel@tonic-gate * not handled appropriately. 4397c478bd9Sstevel@tonic-gate */ 4407c478bd9Sstevel@tonic-gate page_io_unlock(pp); 4417c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 4427c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 4437c478bd9Sstevel@tonic-gate } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) { 4447c478bd9Sstevel@tonic-gate /* 4457c478bd9Sstevel@tonic-gate * Update statistics for pages being paged out 4467c478bd9Sstevel@tonic-gate */ 4477c478bd9Sstevel@tonic-gate if (pp->p_vnode) { 4487c478bd9Sstevel@tonic-gate if (IS_SWAPFSVP(pp->p_vnode)) { 4497c478bd9Sstevel@tonic-gate anonpgout++; 4507c478bd9Sstevel@tonic-gate } else { 4517c478bd9Sstevel@tonic-gate if (pp->p_vnode->v_flag & VVMEXEC) { 4527c478bd9Sstevel@tonic-gate execpgout++; 4537c478bd9Sstevel@tonic-gate } else { 4547c478bd9Sstevel@tonic-gate fspgout++; 4557c478bd9Sstevel@tonic-gate } 4567c478bd9Sstevel@tonic-gate } 4577c478bd9Sstevel@tonic-gate } 4587c478bd9Sstevel@tonic-gate page_io_unlock(pp); 4597c478bd9Sstevel@tonic-gate pgout = 1; 4607c478bd9Sstevel@tonic-gate pgpgout++; 4617c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT, 4627c478bd9Sstevel@tonic-gate "page_ws_out:pp %p", pp); 4637c478bd9Sstevel@tonic-gate 4647c478bd9Sstevel@tonic-gate /* 4657c478bd9Sstevel@tonic-gate * The page_struct_lock need not be acquired to 4667c478bd9Sstevel@tonic-gate * examine "p_lckcnt" and "p_cowcnt" since we'll 4677c478bd9Sstevel@tonic-gate * have an "exclusive" lock if the upgrade succeeds. 4687c478bd9Sstevel@tonic-gate */ 4697c478bd9Sstevel@tonic-gate if (page_tryupgrade(pp) && 4707c478bd9Sstevel@tonic-gate pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 4717c478bd9Sstevel@tonic-gate /* 4727c478bd9Sstevel@tonic-gate * Check if someone has reclaimed the 4737c478bd9Sstevel@tonic-gate * page. If ref and mod are not set, no 4747c478bd9Sstevel@tonic-gate * one is using it so we can free it. 4757c478bd9Sstevel@tonic-gate * The rest of the system is careful 4767c478bd9Sstevel@tonic-gate * to use the NOSYNC flag to unload 4777c478bd9Sstevel@tonic-gate * translations set up for i/o w/o 4787c478bd9Sstevel@tonic-gate * affecting ref and mod bits. 4797c478bd9Sstevel@tonic-gate * 4807c478bd9Sstevel@tonic-gate * Obtain a copy of the real hardware 4817c478bd9Sstevel@tonic-gate * mod bit using hat_pagesync(pp, HAT_DONTZERO) 4827c478bd9Sstevel@tonic-gate * to avoid having to flush the cache. 4837c478bd9Sstevel@tonic-gate */ 4847c478bd9Sstevel@tonic-gate ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 4857c478bd9Sstevel@tonic-gate HAT_SYNC_STOPON_MOD); 4867c478bd9Sstevel@tonic-gate ck_refmod: 4877c478bd9Sstevel@tonic-gate if (!(ppattr & (P_REF | P_MOD))) { 4887c478bd9Sstevel@tonic-gate if (hat_page_is_mapped(pp)) { 4897c478bd9Sstevel@tonic-gate /* 4907c478bd9Sstevel@tonic-gate * Doesn't look like the page 4917c478bd9Sstevel@tonic-gate * was modified so now we 4927c478bd9Sstevel@tonic-gate * really have to unload the 4937c478bd9Sstevel@tonic-gate * translations. Meanwhile 4947c478bd9Sstevel@tonic-gate * another CPU could've 4957c478bd9Sstevel@tonic-gate * modified it so we have to 4967c478bd9Sstevel@tonic-gate * check again. We don't loop 4977c478bd9Sstevel@tonic-gate * forever here because now 4987c478bd9Sstevel@tonic-gate * the translations are gone 4997c478bd9Sstevel@tonic-gate * and no one can get a new one 5007c478bd9Sstevel@tonic-gate * since we have the "exclusive" 5017c478bd9Sstevel@tonic-gate * lock on the page. 5027c478bd9Sstevel@tonic-gate */ 5037c478bd9Sstevel@tonic-gate (void) hat_pageunload(pp, 5047c478bd9Sstevel@tonic-gate HAT_FORCE_PGUNLOAD); 5057c478bd9Sstevel@tonic-gate ppattr = hat_page_getattr(pp, 5067c478bd9Sstevel@tonic-gate P_REF | P_MOD); 5077c478bd9Sstevel@tonic-gate goto ck_refmod; 5087c478bd9Sstevel@tonic-gate } 5097c478bd9Sstevel@tonic-gate /* 5107c478bd9Sstevel@tonic-gate * Update statistics for pages being 5117c478bd9Sstevel@tonic-gate * freed 5127c478bd9Sstevel@tonic-gate */ 5137c478bd9Sstevel@tonic-gate if (pp->p_vnode) { 5147c478bd9Sstevel@tonic-gate if (IS_SWAPFSVP(pp->p_vnode)) { 5157c478bd9Sstevel@tonic-gate anonfree++; 5167c478bd9Sstevel@tonic-gate } else { 5177c478bd9Sstevel@tonic-gate if (pp->p_vnode->v_flag 5187c478bd9Sstevel@tonic-gate & VVMEXEC) { 5197c478bd9Sstevel@tonic-gate execfree++; 5207c478bd9Sstevel@tonic-gate } else { 5217c478bd9Sstevel@tonic-gate fsfree++; 5227c478bd9Sstevel@tonic-gate } 5237c478bd9Sstevel@tonic-gate } 5247c478bd9Sstevel@tonic-gate } 5257c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional ctx*/ 5267c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_FREE, 5277c478bd9Sstevel@tonic-gate (flags & B_DONTNEED), kcred); 5287c478bd9Sstevel@tonic-gate dfree++; 5297c478bd9Sstevel@tonic-gate } else { 5307c478bd9Sstevel@tonic-gate page_unlock(pp); 5317c478bd9Sstevel@tonic-gate pgrec++; 5327c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE, 5337c478bd9Sstevel@tonic-gate "page_ws_free:pp %p", pp); 5347c478bd9Sstevel@tonic-gate } 5357c478bd9Sstevel@tonic-gate } else { 5367c478bd9Sstevel@tonic-gate /* 5377c478bd9Sstevel@tonic-gate * Page is either `locked' in memory 5387c478bd9Sstevel@tonic-gate * or was reclaimed and now has a 5397c478bd9Sstevel@tonic-gate * "shared" lock, so release it. 5407c478bd9Sstevel@tonic-gate */ 5417c478bd9Sstevel@tonic-gate page_unlock(pp); 5427c478bd9Sstevel@tonic-gate } 5437c478bd9Sstevel@tonic-gate } else { 5447c478bd9Sstevel@tonic-gate /* 5457c478bd9Sstevel@tonic-gate * Neither B_FREE nor B_INVAL nor B_ERROR. 5467c478bd9Sstevel@tonic-gate * Just release locks. 5477c478bd9Sstevel@tonic-gate */ 5487c478bd9Sstevel@tonic-gate page_io_unlock(pp); 5497c478bd9Sstevel@tonic-gate page_unlock(pp); 5507c478bd9Sstevel@tonic-gate } 5517c478bd9Sstevel@tonic-gate } 5527c478bd9Sstevel@tonic-gate 5537c478bd9Sstevel@tonic-gate CPU_STATS_ENTER_K(); 5547c478bd9Sstevel@tonic-gate cpup = CPU; /* get cpup now that CPU cannot change */ 5557c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, dfree, dfree); 5567c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec); 5577c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgout, pgout); 5587c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout); 5597c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout); 5607c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree); 5617c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout); 5627c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree); 5637c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout); 5647c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, vm, execfree, execfree); 5657c478bd9Sstevel@tonic-gate CPU_STATS_EXIT_K(); 5667c478bd9Sstevel@tonic-gate 5677c478bd9Sstevel@tonic-gate /* Kernel probe */ 5687c478bd9Sstevel@tonic-gate TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */, 5697c478bd9Sstevel@tonic-gate tnf_opaque, vnode, vp, 5707c478bd9Sstevel@tonic-gate tnf_ulong, pages_pageout, pgpgout, 5717c478bd9Sstevel@tonic-gate tnf_ulong, pages_freed, dfree, 5727c478bd9Sstevel@tonic-gate tnf_ulong, pages_reclaimed, pgrec); 5737c478bd9Sstevel@tonic-gate } 5747c478bd9Sstevel@tonic-gate 5757c478bd9Sstevel@tonic-gate /* 5767c478bd9Sstevel@tonic-gate * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, 5777c478bd9Sstevel@tonic-gate * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster 5787c478bd9Sstevel@tonic-gate * operation and is only to be considered if it doesn't involve any 5797c478bd9Sstevel@tonic-gate * waiting here. B_TRUNC indicates that the file is being truncated 5807c478bd9Sstevel@tonic-gate * and so no i/o needs to be done. B_FORCE indicates that the page 5817c478bd9Sstevel@tonic-gate * must be destroyed so don't try wrting it out. 5827c478bd9Sstevel@tonic-gate * 5837c478bd9Sstevel@tonic-gate * The caller must ensure that the page is locked. Returns 1, if 5847c478bd9Sstevel@tonic-gate * the page should be written back (the "iolock" is held in this 5857c478bd9Sstevel@tonic-gate * case), or 0 if the page has been dealt with or has been 5867c478bd9Sstevel@tonic-gate * unlocked. 5877c478bd9Sstevel@tonic-gate */ 5887c478bd9Sstevel@tonic-gate int 5897c478bd9Sstevel@tonic-gate pvn_getdirty(page_t *pp, int flags) 5907c478bd9Sstevel@tonic-gate { 5917c478bd9Sstevel@tonic-gate ASSERT((flags & (B_INVAL | B_FREE)) ? 5927c478bd9Sstevel@tonic-gate PAGE_EXCL(pp) : PAGE_SHARED(pp)); 5937c478bd9Sstevel@tonic-gate ASSERT(PP_ISFREE(pp) == 0); 5947c478bd9Sstevel@tonic-gate 5957c478bd9Sstevel@tonic-gate /* 5967c478bd9Sstevel@tonic-gate * If trying to invalidate or free a logically `locked' page, 5977c478bd9Sstevel@tonic-gate * forget it. Don't need page_struct_lock to check p_lckcnt and 5987c478bd9Sstevel@tonic-gate * p_cowcnt as the page is exclusively locked. 5997c478bd9Sstevel@tonic-gate */ 6007c478bd9Sstevel@tonic-gate if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) && 6017c478bd9Sstevel@tonic-gate (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) { 6027c478bd9Sstevel@tonic-gate page_unlock(pp); 6037c478bd9Sstevel@tonic-gate return (0); 6047c478bd9Sstevel@tonic-gate } 6057c478bd9Sstevel@tonic-gate 6067c478bd9Sstevel@tonic-gate /* 6077c478bd9Sstevel@tonic-gate * Now acquire the i/o lock so we can add it to the dirty 6087c478bd9Sstevel@tonic-gate * list (if necessary). We avoid blocking on the i/o lock 6097c478bd9Sstevel@tonic-gate * in the following cases: 6107c478bd9Sstevel@tonic-gate * 6117c478bd9Sstevel@tonic-gate * If B_DELWRI is set, which implies that this request is 6127c478bd9Sstevel@tonic-gate * due to a klustering operartion. 6137c478bd9Sstevel@tonic-gate * 6147c478bd9Sstevel@tonic-gate * If this is an async (B_ASYNC) operation and we are not doing 6157c478bd9Sstevel@tonic-gate * invalidation (B_INVAL) [The current i/o or fsflush will ensure 6167c478bd9Sstevel@tonic-gate * that the the page is written out]. 6177c478bd9Sstevel@tonic-gate */ 6187c478bd9Sstevel@tonic-gate if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) { 6197c478bd9Sstevel@tonic-gate if (!page_io_trylock(pp)) { 6207c478bd9Sstevel@tonic-gate page_unlock(pp); 6217c478bd9Sstevel@tonic-gate return (0); 6227c478bd9Sstevel@tonic-gate } 6237c478bd9Sstevel@tonic-gate } else { 6247c478bd9Sstevel@tonic-gate page_io_lock(pp); 6257c478bd9Sstevel@tonic-gate } 6267c478bd9Sstevel@tonic-gate 6277c478bd9Sstevel@tonic-gate /* 6287c478bd9Sstevel@tonic-gate * If we want to free or invalidate the page then 6297c478bd9Sstevel@tonic-gate * we need to unload it so that anyone who wants 6307c478bd9Sstevel@tonic-gate * it will have to take a minor fault to get it. 6317c478bd9Sstevel@tonic-gate * Otherwise, we're just writing the page back so we 6327c478bd9Sstevel@tonic-gate * need to sync up the hardwre and software mod bit to 6337c478bd9Sstevel@tonic-gate * detect any future modifications. We clear the 6347c478bd9Sstevel@tonic-gate * software mod bit when we put the page on the dirty 6357c478bd9Sstevel@tonic-gate * list. 6367c478bd9Sstevel@tonic-gate */ 6377c478bd9Sstevel@tonic-gate if (flags & (B_INVAL | B_FREE)) { 6387c478bd9Sstevel@tonic-gate (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 6397c478bd9Sstevel@tonic-gate } else { 6407c478bd9Sstevel@tonic-gate (void) hat_pagesync(pp, HAT_SYNC_ZERORM); 6417c478bd9Sstevel@tonic-gate } 6427c478bd9Sstevel@tonic-gate 6437c478bd9Sstevel@tonic-gate if (!hat_ismod(pp) || (flags & B_TRUNC)) { 6447c478bd9Sstevel@tonic-gate /* 6457c478bd9Sstevel@tonic-gate * Don't need to add it to the 6467c478bd9Sstevel@tonic-gate * list after all. 6477c478bd9Sstevel@tonic-gate */ 6487c478bd9Sstevel@tonic-gate page_io_unlock(pp); 6497c478bd9Sstevel@tonic-gate if (flags & B_INVAL) { 6507c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 6517c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 6527c478bd9Sstevel@tonic-gate } else if (flags & B_FREE) { 6537c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/ 6547c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred); 6557c478bd9Sstevel@tonic-gate } else { 6567c478bd9Sstevel@tonic-gate /* 6577c478bd9Sstevel@tonic-gate * This is advisory path for the callers 6587c478bd9Sstevel@tonic-gate * of VOP_PUTPAGE() who prefer freeing the 6597c478bd9Sstevel@tonic-gate * page _only_ if no one else is accessing it. 6607c478bd9Sstevel@tonic-gate * E.g. segmap_release() 6617c478bd9Sstevel@tonic-gate * 6627c478bd9Sstevel@tonic-gate * The above hat_ismod() check is useless because: 6637c478bd9Sstevel@tonic-gate * (1) we may not be holding SE_EXCL lock; 6647c478bd9Sstevel@tonic-gate * (2) we've not unloaded _all_ translations 6657c478bd9Sstevel@tonic-gate * 6667c478bd9Sstevel@tonic-gate * Let page_release() do the heavy-lifting. 6677c478bd9Sstevel@tonic-gate */ 6687c478bd9Sstevel@tonic-gate (void) page_release(pp, 1); 6697c478bd9Sstevel@tonic-gate } 6707c478bd9Sstevel@tonic-gate return (0); 6717c478bd9Sstevel@tonic-gate } 6727c478bd9Sstevel@tonic-gate 6737c478bd9Sstevel@tonic-gate /* 6747c478bd9Sstevel@tonic-gate * Page is dirty, get it ready for the write back 6757c478bd9Sstevel@tonic-gate * and add page to the dirty list. 6767c478bd9Sstevel@tonic-gate */ 6777c478bd9Sstevel@tonic-gate hat_clrrefmod(pp); 6787c478bd9Sstevel@tonic-gate 6797c478bd9Sstevel@tonic-gate /* 6807c478bd9Sstevel@tonic-gate * If we're going to free the page when we're done 6817c478bd9Sstevel@tonic-gate * then we can let others try to use it starting now. 6827c478bd9Sstevel@tonic-gate * We'll detect the fact that they used it when the 6837c478bd9Sstevel@tonic-gate * i/o is done and avoid freeing the page. 6847c478bd9Sstevel@tonic-gate */ 6857c478bd9Sstevel@tonic-gate if (flags & B_FREE) 6867c478bd9Sstevel@tonic-gate page_downgrade(pp); 6877c478bd9Sstevel@tonic-gate 6887c478bd9Sstevel@tonic-gate 6897c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp); 6907c478bd9Sstevel@tonic-gate 6917c478bd9Sstevel@tonic-gate return (1); 6927c478bd9Sstevel@tonic-gate } 6937c478bd9Sstevel@tonic-gate 6947c478bd9Sstevel@tonic-gate 6957c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 6967c478bd9Sstevel@tonic-gate static int 6977c478bd9Sstevel@tonic-gate marker_constructor(void *buf, void *cdrarg, int kmflags) 6987c478bd9Sstevel@tonic-gate { 6997c478bd9Sstevel@tonic-gate page_t *mark = buf; 7007c478bd9Sstevel@tonic-gate bzero(mark, sizeof (page_t)); 701f8bbc571SPavel Filipensky mark->p_hash = PVN_VPLIST_HASH_TAG; 7027c478bd9Sstevel@tonic-gate return (0); 7037c478bd9Sstevel@tonic-gate } 7047c478bd9Sstevel@tonic-gate 7057c478bd9Sstevel@tonic-gate void 7067c478bd9Sstevel@tonic-gate pvn_init() 7077c478bd9Sstevel@tonic-gate { 7087c478bd9Sstevel@tonic-gate if (pvn_vmodsort_disable == 0) 7097c478bd9Sstevel@tonic-gate pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL); 7107c478bd9Sstevel@tonic-gate marker_cache = kmem_cache_create("marker_cache", 7117c478bd9Sstevel@tonic-gate sizeof (page_t), 0, marker_constructor, 7127c478bd9Sstevel@tonic-gate NULL, NULL, NULL, NULL, 0); 7137c478bd9Sstevel@tonic-gate } 7147c478bd9Sstevel@tonic-gate 7157c478bd9Sstevel@tonic-gate 7167c478bd9Sstevel@tonic-gate /* 7177c478bd9Sstevel@tonic-gate * Process a vnode's page list for all pages whose offset is >= off. 7187c478bd9Sstevel@tonic-gate * Pages are to either be free'd, invalidated, or written back to disk. 7197c478bd9Sstevel@tonic-gate * 7207c478bd9Sstevel@tonic-gate * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE 7217c478bd9Sstevel@tonic-gate * is specified, otherwise they are "shared" locked. 7227c478bd9Sstevel@tonic-gate * 7237c478bd9Sstevel@tonic-gate * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC} 7247c478bd9Sstevel@tonic-gate * 7257c478bd9Sstevel@tonic-gate * Special marker page_t's are inserted in the list in order 7267c478bd9Sstevel@tonic-gate * to keep track of where we are in the list when locks are dropped. 7277c478bd9Sstevel@tonic-gate * 7287c478bd9Sstevel@tonic-gate * Note the list is circular and insertions can happen only at the 7297c478bd9Sstevel@tonic-gate * head and tail of the list. The algorithm ensures visiting all pages 7307c478bd9Sstevel@tonic-gate * on the list in the following way: 7317c478bd9Sstevel@tonic-gate * 7327c478bd9Sstevel@tonic-gate * Drop two marker pages at the end of the list. 7337c478bd9Sstevel@tonic-gate * 7347c478bd9Sstevel@tonic-gate * Move one marker page backwards towards the start of the list until 7357c478bd9Sstevel@tonic-gate * it is at the list head, processing the pages passed along the way. 7367c478bd9Sstevel@tonic-gate * 7377c478bd9Sstevel@tonic-gate * Due to race conditions when the vphm mutex is dropped, additional pages 7387c478bd9Sstevel@tonic-gate * can be added to either end of the list, so we'll continue to move 7397c478bd9Sstevel@tonic-gate * the marker and process pages until it is up against the end marker. 7407c478bd9Sstevel@tonic-gate * 7417c478bd9Sstevel@tonic-gate * There is one special exit condition. If we are processing a VMODSORT 7427c478bd9Sstevel@tonic-gate * vnode and only writing back modified pages, we can stop as soon as 7437c478bd9Sstevel@tonic-gate * we run into an unmodified page. This makes fsync(3) operations fast. 7447c478bd9Sstevel@tonic-gate */ 7457c478bd9Sstevel@tonic-gate int 7467c478bd9Sstevel@tonic-gate pvn_vplist_dirty( 7477c478bd9Sstevel@tonic-gate vnode_t *vp, 7487c478bd9Sstevel@tonic-gate u_offset_t off, 7497c478bd9Sstevel@tonic-gate int (*putapage)(vnode_t *, page_t *, u_offset_t *, 7507c478bd9Sstevel@tonic-gate size_t *, int, cred_t *), 7517c478bd9Sstevel@tonic-gate int flags, 7527c478bd9Sstevel@tonic-gate cred_t *cred) 7537c478bd9Sstevel@tonic-gate { 7547c478bd9Sstevel@tonic-gate page_t *pp; 7557c478bd9Sstevel@tonic-gate page_t *mark; /* marker page that moves toward head */ 7567c478bd9Sstevel@tonic-gate page_t *end; /* marker page at end of list */ 7577c478bd9Sstevel@tonic-gate int err = 0; 7587c478bd9Sstevel@tonic-gate int error; 7597c478bd9Sstevel@tonic-gate kmutex_t *vphm; 7607c478bd9Sstevel@tonic-gate se_t se; 7617c478bd9Sstevel@tonic-gate page_t **where_to_move; 7627c478bd9Sstevel@tonic-gate 7637c478bd9Sstevel@tonic-gate ASSERT(vp->v_type != VCHR); 7647c478bd9Sstevel@tonic-gate 7657c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) 7667c478bd9Sstevel@tonic-gate return (0); 7677c478bd9Sstevel@tonic-gate 7687c478bd9Sstevel@tonic-gate 7697c478bd9Sstevel@tonic-gate /* 7707c478bd9Sstevel@tonic-gate * Serialize vplist_dirty operations on this vnode by setting VVMLOCK. 7717c478bd9Sstevel@tonic-gate * 7727c478bd9Sstevel@tonic-gate * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync() 7737c478bd9Sstevel@tonic-gate * from getting blocked while flushing pages to a dead NFS server. 7747c478bd9Sstevel@tonic-gate */ 7757c478bd9Sstevel@tonic-gate mutex_enter(&vp->v_lock); 7767c478bd9Sstevel@tonic-gate if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) { 7777c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 7787c478bd9Sstevel@tonic-gate return (EAGAIN); 7797c478bd9Sstevel@tonic-gate } 7807c478bd9Sstevel@tonic-gate 7817c478bd9Sstevel@tonic-gate while (vp->v_flag & VVMLOCK) 7827c478bd9Sstevel@tonic-gate cv_wait(&vp->v_cv, &vp->v_lock); 7837c478bd9Sstevel@tonic-gate 7847c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) { 7857c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 7867c478bd9Sstevel@tonic-gate return (0); 7877c478bd9Sstevel@tonic-gate } 7887c478bd9Sstevel@tonic-gate 7897c478bd9Sstevel@tonic-gate vp->v_flag |= VVMLOCK; 7907c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 7917c478bd9Sstevel@tonic-gate 7927c478bd9Sstevel@tonic-gate 7937c478bd9Sstevel@tonic-gate /* 7947c478bd9Sstevel@tonic-gate * Set up the marker pages used to walk the list 7957c478bd9Sstevel@tonic-gate */ 7967c478bd9Sstevel@tonic-gate end = kmem_cache_alloc(marker_cache, KM_SLEEP); 7977c478bd9Sstevel@tonic-gate end->p_vnode = vp; 7987c478bd9Sstevel@tonic-gate end->p_offset = (u_offset_t)-2; 7997c478bd9Sstevel@tonic-gate mark = kmem_cache_alloc(marker_cache, KM_SLEEP); 8007c478bd9Sstevel@tonic-gate mark->p_vnode = vp; 8017c478bd9Sstevel@tonic-gate mark->p_offset = (u_offset_t)-1; 8027c478bd9Sstevel@tonic-gate 8037c478bd9Sstevel@tonic-gate /* 8047c478bd9Sstevel@tonic-gate * Grab the lock protecting the vnode's page list 8057c478bd9Sstevel@tonic-gate * note that this lock is dropped at times in the loop. 8067c478bd9Sstevel@tonic-gate */ 8077c478bd9Sstevel@tonic-gate vphm = page_vnode_mutex(vp); 8087c478bd9Sstevel@tonic-gate mutex_enter(vphm); 8097c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) 8107c478bd9Sstevel@tonic-gate goto leave; 8117c478bd9Sstevel@tonic-gate 8127c478bd9Sstevel@tonic-gate /* 8137c478bd9Sstevel@tonic-gate * insert the markers and loop through the list of pages 8147c478bd9Sstevel@tonic-gate */ 8157c478bd9Sstevel@tonic-gate page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark); 8167c478bd9Sstevel@tonic-gate page_vpadd(&mark->p_vpnext, end); 8177c478bd9Sstevel@tonic-gate for (;;) { 8187c478bd9Sstevel@tonic-gate 8197c478bd9Sstevel@tonic-gate /* 8207c478bd9Sstevel@tonic-gate * If only doing an async write back, then we can 8217c478bd9Sstevel@tonic-gate * stop as soon as we get to start of the list. 8227c478bd9Sstevel@tonic-gate */ 8237c478bd9Sstevel@tonic-gate if (flags == B_ASYNC && vp->v_pages == mark) 8247c478bd9Sstevel@tonic-gate break; 8257c478bd9Sstevel@tonic-gate 8267c478bd9Sstevel@tonic-gate /* 8277c478bd9Sstevel@tonic-gate * otherwise stop when we've gone through all the pages 8287c478bd9Sstevel@tonic-gate */ 8297c478bd9Sstevel@tonic-gate if (mark->p_vpprev == end) 8307c478bd9Sstevel@tonic-gate break; 8317c478bd9Sstevel@tonic-gate 8327c478bd9Sstevel@tonic-gate pp = mark->p_vpprev; 8337c478bd9Sstevel@tonic-gate if (vp->v_pages == pp) 8347c478bd9Sstevel@tonic-gate where_to_move = &vp->v_pages; 8357c478bd9Sstevel@tonic-gate else 8367c478bd9Sstevel@tonic-gate where_to_move = &pp->p_vpprev->p_vpnext; 8377c478bd9Sstevel@tonic-gate 8387c478bd9Sstevel@tonic-gate ASSERT(pp->p_vnode == vp); 8397c478bd9Sstevel@tonic-gate 8407c478bd9Sstevel@tonic-gate /* 8417c478bd9Sstevel@tonic-gate * If just flushing dirty pages to disk and this vnode 8427c478bd9Sstevel@tonic-gate * is using a sorted list of pages, we can stop processing 8437c478bd9Sstevel@tonic-gate * as soon as we find an unmodified page. Since all the 8447c478bd9Sstevel@tonic-gate * modified pages are visited first. 8457c478bd9Sstevel@tonic-gate */ 8467c478bd9Sstevel@tonic-gate if (IS_VMODSORT(vp) && 847a71e32b6Sstans !(flags & (B_INVAL | B_FREE | B_TRUNC))) { 848a71e32b6Sstans if (!hat_ismod(pp) && !page_io_locked(pp)) { 8497c478bd9Sstevel@tonic-gate #ifdef DEBUG 8507c478bd9Sstevel@tonic-gate /* 851a71e32b6Sstans * For debug kernels examine what should be 852a71e32b6Sstans * all the remaining clean pages, asserting 853a71e32b6Sstans * that they are not modified. 8547c478bd9Sstevel@tonic-gate */ 8557c478bd9Sstevel@tonic-gate page_t *chk = pp; 8567c478bd9Sstevel@tonic-gate int attr; 8577c478bd9Sstevel@tonic-gate 8587c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 8597c478bd9Sstevel@tonic-gate page_vpadd(where_to_move, mark); 8607c478bd9Sstevel@tonic-gate do { 8617c478bd9Sstevel@tonic-gate chk = chk->p_vpprev; 8627c478bd9Sstevel@tonic-gate ASSERT(chk != end); 8637c478bd9Sstevel@tonic-gate if (chk == mark) 8647c478bd9Sstevel@tonic-gate continue; 865a71e32b6Sstans attr = hat_page_getattr(chk, P_MOD | 866a71e32b6Sstans P_REF); 8677c478bd9Sstevel@tonic-gate if ((attr & P_MOD) == 0) 8687c478bd9Sstevel@tonic-gate continue; 8697c478bd9Sstevel@tonic-gate panic("v_pages list not all clean: " 8707c478bd9Sstevel@tonic-gate "page_t*=%p vnode=%p off=%lx " 8717c478bd9Sstevel@tonic-gate "attr=0x%x last clean page_t*=%p\n", 8727c478bd9Sstevel@tonic-gate (void *)chk, (void *)chk->p_vnode, 873a71e32b6Sstans (long)chk->p_offset, attr, 874a71e32b6Sstans (void *)pp); 8757c478bd9Sstevel@tonic-gate } while (chk != vp->v_pages); 8767c478bd9Sstevel@tonic-gate #endif 8777c478bd9Sstevel@tonic-gate break; 878a71e32b6Sstans } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) { 879a71e32b6Sstans /* 880a71e32b6Sstans * Couldn't get io lock, wait until IO is done. 881a71e32b6Sstans * Block only for sync IO since we don't want 882a71e32b6Sstans * to block async IO. 883a71e32b6Sstans */ 884a71e32b6Sstans mutex_exit(vphm); 885a71e32b6Sstans page_io_wait(pp); 886a71e32b6Sstans mutex_enter(vphm); 887a71e32b6Sstans continue; 888a71e32b6Sstans } 8897c478bd9Sstevel@tonic-gate } 8907c478bd9Sstevel@tonic-gate 8917c478bd9Sstevel@tonic-gate /* 892adbe22e1SPeter Telford * Skip this page if the offset is out of the desired range. 893adbe22e1SPeter Telford * Just move the marker and continue. 894adbe22e1SPeter Telford */ 895adbe22e1SPeter Telford if (pp->p_offset < off) { 896adbe22e1SPeter Telford page_vpsub(&vp->v_pages, mark); 897adbe22e1SPeter Telford page_vpadd(where_to_move, mark); 898adbe22e1SPeter Telford continue; 899adbe22e1SPeter Telford } 900adbe22e1SPeter Telford 901adbe22e1SPeter Telford /* 9027c478bd9Sstevel@tonic-gate * If we are supposed to invalidate or free this 9037c478bd9Sstevel@tonic-gate * page, then we need an exclusive lock. 9047c478bd9Sstevel@tonic-gate */ 9057c478bd9Sstevel@tonic-gate se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; 9067c478bd9Sstevel@tonic-gate 9077c478bd9Sstevel@tonic-gate /* 9087c478bd9Sstevel@tonic-gate * We must acquire the page lock for all synchronous 9097c478bd9Sstevel@tonic-gate * operations (invalidate, free and write). 9107c478bd9Sstevel@tonic-gate */ 9117c478bd9Sstevel@tonic-gate if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) { 9127c478bd9Sstevel@tonic-gate /* 9137c478bd9Sstevel@tonic-gate * If the page_lock() drops the mutex 9147c478bd9Sstevel@tonic-gate * we must retry the loop. 9157c478bd9Sstevel@tonic-gate */ 9167c478bd9Sstevel@tonic-gate if (!page_lock(pp, se, vphm, P_NO_RECLAIM)) 9177c478bd9Sstevel@tonic-gate continue; 9187c478bd9Sstevel@tonic-gate 9197c478bd9Sstevel@tonic-gate /* 9207c478bd9Sstevel@tonic-gate * It's ok to move the marker page now. 9217c478bd9Sstevel@tonic-gate */ 9227c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 9237c478bd9Sstevel@tonic-gate page_vpadd(where_to_move, mark); 9247c478bd9Sstevel@tonic-gate } else { 9257c478bd9Sstevel@tonic-gate 9267c478bd9Sstevel@tonic-gate /* 9277c478bd9Sstevel@tonic-gate * update the marker page for all remaining cases 9287c478bd9Sstevel@tonic-gate */ 9297c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 9307c478bd9Sstevel@tonic-gate page_vpadd(where_to_move, mark); 9317c478bd9Sstevel@tonic-gate 9327c478bd9Sstevel@tonic-gate /* 9337c478bd9Sstevel@tonic-gate * For write backs, If we can't lock the page, it's 9347c478bd9Sstevel@tonic-gate * invalid or in the process of being destroyed. Skip 9357c478bd9Sstevel@tonic-gate * it, assuming someone else is writing it. 9367c478bd9Sstevel@tonic-gate */ 9377c478bd9Sstevel@tonic-gate if (!page_trylock(pp, se)) 9387c478bd9Sstevel@tonic-gate continue; 9397c478bd9Sstevel@tonic-gate } 9407c478bd9Sstevel@tonic-gate 9417c478bd9Sstevel@tonic-gate ASSERT(pp->p_vnode == vp); 9427c478bd9Sstevel@tonic-gate 9437c478bd9Sstevel@tonic-gate /* 9447c478bd9Sstevel@tonic-gate * Successfully locked the page, now figure out what to 9457c478bd9Sstevel@tonic-gate * do with it. Free pages are easily dealt with, invalidate 9467c478bd9Sstevel@tonic-gate * if desired or just go on to the next page. 9477c478bd9Sstevel@tonic-gate */ 9487c478bd9Sstevel@tonic-gate if (PP_ISFREE(pp)) { 9497c478bd9Sstevel@tonic-gate if ((flags & B_INVAL) == 0) { 9507c478bd9Sstevel@tonic-gate page_unlock(pp); 9517c478bd9Sstevel@tonic-gate continue; 9527c478bd9Sstevel@tonic-gate } 9537c478bd9Sstevel@tonic-gate 9547c478bd9Sstevel@tonic-gate /* 9557c478bd9Sstevel@tonic-gate * Invalidate (destroy) the page. 9567c478bd9Sstevel@tonic-gate */ 9577c478bd9Sstevel@tonic-gate mutex_exit(vphm); 9587c478bd9Sstevel@tonic-gate page_destroy_free(pp); 9597c478bd9Sstevel@tonic-gate mutex_enter(vphm); 9607c478bd9Sstevel@tonic-gate continue; 9617c478bd9Sstevel@tonic-gate } 9627c478bd9Sstevel@tonic-gate 9637c478bd9Sstevel@tonic-gate /* 9647c478bd9Sstevel@tonic-gate * pvn_getdirty() figures out what do do with a dirty page. 9657c478bd9Sstevel@tonic-gate * If the page is dirty, the putapage() routine will write it 9667c478bd9Sstevel@tonic-gate * and will kluster any other adjacent dirty pages it can. 9677c478bd9Sstevel@tonic-gate * 9687c478bd9Sstevel@tonic-gate * pvn_getdirty() and `(*putapage)' unlock the page. 9697c478bd9Sstevel@tonic-gate */ 9707c478bd9Sstevel@tonic-gate mutex_exit(vphm); 9717c478bd9Sstevel@tonic-gate if (pvn_getdirty(pp, flags)) { 9727c478bd9Sstevel@tonic-gate error = (*putapage)(vp, pp, NULL, NULL, flags, cred); 9737c478bd9Sstevel@tonic-gate if (!err) 9747c478bd9Sstevel@tonic-gate err = error; 9757c478bd9Sstevel@tonic-gate } 9767c478bd9Sstevel@tonic-gate mutex_enter(vphm); 9777c478bd9Sstevel@tonic-gate } 9787c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, mark); 9797c478bd9Sstevel@tonic-gate page_vpsub(&vp->v_pages, end); 9807c478bd9Sstevel@tonic-gate 9817c478bd9Sstevel@tonic-gate leave: 9827c478bd9Sstevel@tonic-gate /* 9837c478bd9Sstevel@tonic-gate * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds 9847c478bd9Sstevel@tonic-gate */ 9857c478bd9Sstevel@tonic-gate mutex_exit(vphm); 9867c478bd9Sstevel@tonic-gate kmem_cache_free(marker_cache, mark); 9877c478bd9Sstevel@tonic-gate kmem_cache_free(marker_cache, end); 9887c478bd9Sstevel@tonic-gate mutex_enter(&vp->v_lock); 9897c478bd9Sstevel@tonic-gate vp->v_flag &= ~VVMLOCK; 9907c478bd9Sstevel@tonic-gate cv_broadcast(&vp->v_cv); 9917c478bd9Sstevel@tonic-gate mutex_exit(&vp->v_lock); 9927c478bd9Sstevel@tonic-gate return (err); 9937c478bd9Sstevel@tonic-gate } 9947c478bd9Sstevel@tonic-gate 9957c478bd9Sstevel@tonic-gate /* 996f8bbc571SPavel Filipensky * Walk the vp->v_pages list, for every page call the callback function 997f8bbc571SPavel Filipensky * pointed by *page_check. If page_check returns non-zero, then mark the 998f8bbc571SPavel Filipensky * page as modified and if VMODSORT is set, move it to the end of v_pages 999f8bbc571SPavel Filipensky * list. Moving makes sense only if we have at least two pages - this also 1000f8bbc571SPavel Filipensky * avoids having v_pages temporarily being NULL after calling page_vpsub() 1001f8bbc571SPavel Filipensky * if there was just one page. 1002f8bbc571SPavel Filipensky */ 1003f8bbc571SPavel Filipensky void 1004f8bbc571SPavel Filipensky pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *)) 1005f8bbc571SPavel Filipensky { 1006f8bbc571SPavel Filipensky page_t *pp, *next, *end; 1007f8bbc571SPavel Filipensky kmutex_t *vphm; 1008f8bbc571SPavel Filipensky int shuffle; 1009f8bbc571SPavel Filipensky 1010f8bbc571SPavel Filipensky vphm = page_vnode_mutex(vp); 1011f8bbc571SPavel Filipensky mutex_enter(vphm); 1012f8bbc571SPavel Filipensky 1013f8bbc571SPavel Filipensky if (vp->v_pages == NULL) { 1014f8bbc571SPavel Filipensky mutex_exit(vphm); 1015f8bbc571SPavel Filipensky return; 1016f8bbc571SPavel Filipensky } 1017f8bbc571SPavel Filipensky 1018f8bbc571SPavel Filipensky end = vp->v_pages->p_vpprev; 1019f8bbc571SPavel Filipensky shuffle = IS_VMODSORT(vp) && (vp->v_pages != end); 1020f8bbc571SPavel Filipensky pp = vp->v_pages; 1021f8bbc571SPavel Filipensky 1022f8bbc571SPavel Filipensky for (;;) { 1023f8bbc571SPavel Filipensky next = pp->p_vpnext; 1024f8bbc571SPavel Filipensky if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) { 1025f8bbc571SPavel Filipensky /* 1026f8bbc571SPavel Filipensky * hat_setmod_only() in contrast to hat_setmod() does 1027f8bbc571SPavel Filipensky * not shuffle the pages and does not grab the mutex 1028f8bbc571SPavel Filipensky * page_vnode_mutex. Exactly what we need. 1029f8bbc571SPavel Filipensky */ 1030f8bbc571SPavel Filipensky hat_setmod_only(pp); 1031f8bbc571SPavel Filipensky if (shuffle) { 1032f8bbc571SPavel Filipensky page_vpsub(&vp->v_pages, pp); 1033f8bbc571SPavel Filipensky ASSERT(vp->v_pages != NULL); 1034f8bbc571SPavel Filipensky page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, 1035f8bbc571SPavel Filipensky pp); 1036f8bbc571SPavel Filipensky } 1037f8bbc571SPavel Filipensky } 1038f8bbc571SPavel Filipensky /* Stop if we have just processed the last page. */ 1039f8bbc571SPavel Filipensky if (pp == end) 1040f8bbc571SPavel Filipensky break; 1041f8bbc571SPavel Filipensky pp = next; 1042f8bbc571SPavel Filipensky } 1043f8bbc571SPavel Filipensky 1044f8bbc571SPavel Filipensky mutex_exit(vphm); 1045f8bbc571SPavel Filipensky } 1046f8bbc571SPavel Filipensky 1047f8bbc571SPavel Filipensky /* 10487c478bd9Sstevel@tonic-gate * Zero out zbytes worth of data. Caller should be aware that this 10497c478bd9Sstevel@tonic-gate * routine may enter back into the fs layer (xxx_getpage). Locks 10507c478bd9Sstevel@tonic-gate * that the xxx_getpage routine may need should not be held while 10517c478bd9Sstevel@tonic-gate * calling this. 10527c478bd9Sstevel@tonic-gate */ 10537c478bd9Sstevel@tonic-gate void 10547c478bd9Sstevel@tonic-gate pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes) 10557c478bd9Sstevel@tonic-gate { 10567c478bd9Sstevel@tonic-gate caddr_t addr; 10577c478bd9Sstevel@tonic-gate 10587c478bd9Sstevel@tonic-gate ASSERT(vp->v_type != VCHR); 10597c478bd9Sstevel@tonic-gate 10607c478bd9Sstevel@tonic-gate if (vp->v_pages == NULL) 10617c478bd9Sstevel@tonic-gate return; 10627c478bd9Sstevel@tonic-gate 10637c478bd9Sstevel@tonic-gate /* 10647c478bd9Sstevel@tonic-gate * zbytes may be zero but there still may be some portion of 10657c478bd9Sstevel@tonic-gate * a page which needs clearing (since zbytes is a function 10667c478bd9Sstevel@tonic-gate * of filesystem block size, not pagesize.) 10677c478bd9Sstevel@tonic-gate */ 10687c478bd9Sstevel@tonic-gate if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0) 10697c478bd9Sstevel@tonic-gate return; 10707c478bd9Sstevel@tonic-gate 10717c478bd9Sstevel@tonic-gate /* 10727c478bd9Sstevel@tonic-gate * We get the last page and handle the partial 10737c478bd9Sstevel@tonic-gate * zeroing via kernel mappings. This will make the page 10747c478bd9Sstevel@tonic-gate * dirty so that we know that when this page is written 10757c478bd9Sstevel@tonic-gate * back, the zeroed information will go out with it. If 10767c478bd9Sstevel@tonic-gate * the page is not currently in memory, then the kzero 10777c478bd9Sstevel@tonic-gate * operation will cause it to be brought it. We use kzero 10787c478bd9Sstevel@tonic-gate * instead of bzero so that if the page cannot be read in 10797c478bd9Sstevel@tonic-gate * for any reason, the system will not panic. We need 10807c478bd9Sstevel@tonic-gate * to zero out a minimum of the fs given zbytes, but we 10817c478bd9Sstevel@tonic-gate * might also have to do more to get the entire last page. 10827c478bd9Sstevel@tonic-gate */ 10837c478bd9Sstevel@tonic-gate 10847c478bd9Sstevel@tonic-gate if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE) 10857c478bd9Sstevel@tonic-gate panic("pvn_vptrunc zbytes"); 10867c478bd9Sstevel@tonic-gate addr = segmap_getmapflt(segkmap, vp, vplen, 10877c478bd9Sstevel@tonic-gate MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE); 10887c478bd9Sstevel@tonic-gate (void) kzero(addr + (vplen & MAXBOFFSET), 10897c478bd9Sstevel@tonic-gate MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET))); 10907c478bd9Sstevel@tonic-gate (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC); 10917c478bd9Sstevel@tonic-gate } 10927c478bd9Sstevel@tonic-gate 10937c478bd9Sstevel@tonic-gate /* 1094*ca411232SJosef 'Jeff' Sipek * Handles common work of the VOP_GETPAGE routines by iterating page by page 1095*ca411232SJosef 'Jeff' Sipek * calling the getpage helper for each. 10967c478bd9Sstevel@tonic-gate */ 10977c478bd9Sstevel@tonic-gate int 10987c478bd9Sstevel@tonic-gate pvn_getpages( 10997c478bd9Sstevel@tonic-gate int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[], 11007c478bd9Sstevel@tonic-gate size_t, struct seg *, caddr_t, enum seg_rw, cred_t *), 11017c478bd9Sstevel@tonic-gate struct vnode *vp, 11027c478bd9Sstevel@tonic-gate u_offset_t off, 11037c478bd9Sstevel@tonic-gate size_t len, 11047c478bd9Sstevel@tonic-gate uint_t *protp, 11057c478bd9Sstevel@tonic-gate page_t *pl[], 11067c478bd9Sstevel@tonic-gate size_t plsz, 11077c478bd9Sstevel@tonic-gate struct seg *seg, 11087c478bd9Sstevel@tonic-gate caddr_t addr, 11097c478bd9Sstevel@tonic-gate enum seg_rw rw, 11107c478bd9Sstevel@tonic-gate struct cred *cred) 11117c478bd9Sstevel@tonic-gate { 11127c478bd9Sstevel@tonic-gate page_t **ppp; 11137c478bd9Sstevel@tonic-gate u_offset_t o, eoff; 11147c478bd9Sstevel@tonic-gate size_t sz, xlen; 11157c478bd9Sstevel@tonic-gate int err; 11167c478bd9Sstevel@tonic-gate 1117*ca411232SJosef 'Jeff' Sipek /* ensure that we have enough space */ 1118*ca411232SJosef 'Jeff' Sipek ASSERT(pl == NULL || plsz >= len); 11197c478bd9Sstevel@tonic-gate 11207c478bd9Sstevel@tonic-gate /* 11217c478bd9Sstevel@tonic-gate * Loop one page at a time and let getapage function fill 11227c478bd9Sstevel@tonic-gate * in the next page in array. We only allow one page to be 11237c478bd9Sstevel@tonic-gate * returned at a time (except for the last page) so that we 11247c478bd9Sstevel@tonic-gate * don't have any problems with duplicates and other such 11257c478bd9Sstevel@tonic-gate * painful problems. This is a very simple minded algorithm, 11267c478bd9Sstevel@tonic-gate * but it does the job correctly. We hope that the cost of a 11277c478bd9Sstevel@tonic-gate * getapage call for a resident page that we might have been 11287c478bd9Sstevel@tonic-gate * able to get from an earlier call doesn't cost too much. 11297c478bd9Sstevel@tonic-gate */ 11307c478bd9Sstevel@tonic-gate ppp = pl; 1131*ca411232SJosef 'Jeff' Sipek sz = (pl != NULL) ? PAGESIZE : 0; 11327c478bd9Sstevel@tonic-gate eoff = off + len; 11337c478bd9Sstevel@tonic-gate xlen = len; 11347c478bd9Sstevel@tonic-gate for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE, 11357c478bd9Sstevel@tonic-gate xlen -= PAGESIZE) { 1136*ca411232SJosef 'Jeff' Sipek if (o + PAGESIZE >= eoff && pl != NULL) { 11377c478bd9Sstevel@tonic-gate /* 11387c478bd9Sstevel@tonic-gate * Last time through - allow the all of 11397c478bd9Sstevel@tonic-gate * what's left of the pl[] array to be used. 11407c478bd9Sstevel@tonic-gate */ 11417c478bd9Sstevel@tonic-gate sz = plsz - (o - off); 11427c478bd9Sstevel@tonic-gate } 11437c478bd9Sstevel@tonic-gate err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr, 11447c478bd9Sstevel@tonic-gate rw, cred); 11457c478bd9Sstevel@tonic-gate if (err) { 11467c478bd9Sstevel@tonic-gate /* 11477c478bd9Sstevel@tonic-gate * Release any pages we already got. 11487c478bd9Sstevel@tonic-gate */ 11497c478bd9Sstevel@tonic-gate if (o > off && pl != NULL) { 11507c478bd9Sstevel@tonic-gate for (ppp = pl; *ppp != NULL; *ppp++ = NULL) 11517c478bd9Sstevel@tonic-gate (void) page_release(*ppp, 1); 11527c478bd9Sstevel@tonic-gate } 11537c478bd9Sstevel@tonic-gate break; 11547c478bd9Sstevel@tonic-gate } 11557c478bd9Sstevel@tonic-gate if (pl != NULL) 11567c478bd9Sstevel@tonic-gate ppp++; 11577c478bd9Sstevel@tonic-gate } 11587c478bd9Sstevel@tonic-gate return (err); 11597c478bd9Sstevel@tonic-gate } 11607c478bd9Sstevel@tonic-gate 11617c478bd9Sstevel@tonic-gate /* 11627c478bd9Sstevel@tonic-gate * Initialize the page list array. 11637c478bd9Sstevel@tonic-gate */ 1164081a94b0Saguzovsk /*ARGSUSED*/ 11657c478bd9Sstevel@tonic-gate void 11667c478bd9Sstevel@tonic-gate pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz, 11677c478bd9Sstevel@tonic-gate u_offset_t off, size_t io_len, enum seg_rw rw) 11687c478bd9Sstevel@tonic-gate { 11697c478bd9Sstevel@tonic-gate ssize_t sz; 11707c478bd9Sstevel@tonic-gate page_t *ppcur, **ppp; 11717c478bd9Sstevel@tonic-gate 11727c478bd9Sstevel@tonic-gate /* 11737c478bd9Sstevel@tonic-gate * Set up to load plsz worth 11747c478bd9Sstevel@tonic-gate * starting at the needed page. 11757c478bd9Sstevel@tonic-gate */ 1176081a94b0Saguzovsk while (pp != NULL && pp->p_offset != off) { 11777c478bd9Sstevel@tonic-gate /* 11787c478bd9Sstevel@tonic-gate * Remove page from the i/o list, 11797c478bd9Sstevel@tonic-gate * release the i/o and the page lock. 11807c478bd9Sstevel@tonic-gate */ 11817c478bd9Sstevel@tonic-gate ppcur = pp; 11827c478bd9Sstevel@tonic-gate page_sub(&pp, ppcur); 11837c478bd9Sstevel@tonic-gate page_io_unlock(ppcur); 11847c478bd9Sstevel@tonic-gate (void) page_release(ppcur, 1); 11857c478bd9Sstevel@tonic-gate } 1186081a94b0Saguzovsk 1187081a94b0Saguzovsk if (pp == NULL) { 1188081a94b0Saguzovsk pl[0] = NULL; 1189081a94b0Saguzovsk return; 11907c478bd9Sstevel@tonic-gate } 11917c478bd9Sstevel@tonic-gate 1192081a94b0Saguzovsk sz = plsz; 1193081a94b0Saguzovsk 11947c478bd9Sstevel@tonic-gate /* 11957c478bd9Sstevel@tonic-gate * Initialize the page list array. 11967c478bd9Sstevel@tonic-gate */ 11977c478bd9Sstevel@tonic-gate ppp = pl; 11987c478bd9Sstevel@tonic-gate do { 11997c478bd9Sstevel@tonic-gate ppcur = pp; 12007c478bd9Sstevel@tonic-gate *ppp++ = ppcur; 12017c478bd9Sstevel@tonic-gate page_sub(&pp, ppcur); 12027c478bd9Sstevel@tonic-gate page_io_unlock(ppcur); 12037c478bd9Sstevel@tonic-gate if (rw != S_CREATE) 12047c478bd9Sstevel@tonic-gate page_downgrade(ppcur); 12057c478bd9Sstevel@tonic-gate sz -= PAGESIZE; 12067c478bd9Sstevel@tonic-gate } while (sz > 0 && pp != NULL); 12077c478bd9Sstevel@tonic-gate *ppp = NULL; /* terminate list */ 12087c478bd9Sstevel@tonic-gate 12097c478bd9Sstevel@tonic-gate /* 12107c478bd9Sstevel@tonic-gate * Now free the remaining pages that weren't 12117c478bd9Sstevel@tonic-gate * loaded in the page list. 12127c478bd9Sstevel@tonic-gate */ 12137c478bd9Sstevel@tonic-gate while (pp != NULL) { 12147c478bd9Sstevel@tonic-gate ppcur = pp; 12157c478bd9Sstevel@tonic-gate page_sub(&pp, ppcur); 12167c478bd9Sstevel@tonic-gate page_io_unlock(ppcur); 12177c478bd9Sstevel@tonic-gate (void) page_release(ppcur, 1); 12187c478bd9Sstevel@tonic-gate } 12197c478bd9Sstevel@tonic-gate } 1220