/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/tuneable.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/vm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysinfo.h>
#include <sys/callb.h>
#include <sys/reboot.h>
#include <sys/time.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/anon.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

int doiflush = 1;	/* non-zero to turn inode flushing on */
int dopageflush = 1;	/* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * fsflush_iflush_delay seconds after boot.  To revert to the old behavior,
 * set fsflush_iflush_delay to 0.  This creates no new file system danger:
 * there has always been a window between fsflush's inode flush loops during
 * which the system could crash, fail to sync the file system, and require
 * fsck to recover; the delay merely widens that window.  Finally, inode
 * flushing is never delayed when booting into single user mode, where the
 * administrator may be modifying files or using fsck.  This avoids inode
 * flushes during boot whose only purpose is to update atimes on files
 * accessed during boot.
 */
int fsflush_iflush_delay = 60;

kcondvar_t fsflush_cv;
static kmutex_t	fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */

/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
	ulong_t fsf_scan;	/* number of pages scanned */
	ulong_t fsf_examined;	/* number of page_t's actually examined; can */
				/* be less than fsf_scan due to large pages */
	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
	ulong_t fsf_modified;	/* number of modified pages found */
	ulong_t fsf_coalesce;	/* number of page coalesces done */
	ulong_t fsf_time;	/* nanoseconds of run time */
	ulong_t fsf_releases;	/* number of page_release() calls done */
} fsf_stat_t;

fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */

/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages
 */
#define	MAX_PAGESIZES	32
static ulong_t		fsf_npgsz;
static pgcnt_t		fsf_pgcnt[MAX_PAGESIZES];
static pgcnt_t		fsf_mask[MAX_PAGESIZES];
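
/*
 * Illustrative example (the page sizes are platform assumptions, not
 * values from this file): on a machine with 4K base pages and 2M large
 * pages, page_get_pagesize(1) / page_get_pagesize(0) is 512, so fsflush()
 * below sets fsf_pgcnt[0] = 512 and fsf_mask[0] = 511.  A coalescing
 * candidate can then start only at a page whose page number is a multiple
 * of 512, and is promoted via page_promote_size() once 512 consecutive
 * free base pages have been seen.
 */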

/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize.  This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	int		mod;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static void	*pp_cookie = NULL;
	static page_t	*pp;

	/*
	 * Check to see if total_pages has changed.  Each invocation scans
	 * total_pages * t_fsflushr / autoup pages, so that all of memory
	 * gets examined once every autoup seconds.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr)) / v.v_autoup;
	}

	/*
	 * On first time through initialize the cookie used for page_t scans
	 */
	if (pp_cookie == NULL)
		pp = page_next_scan_init(&pp_cookie);

	pcount = 0;
	while (pcount <= nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		pp = page_next_scan_large(pp, &pcount, &pp_cookie);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);

		/*
		 * Do a bunch of dirty tests (i.e., no locking) to determine
		 * if we can quickly skip this page.  These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0)
			continue;

		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;

		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0 ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			continue;
		}

		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit, leaving the bit alone in hardware;
		 * it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);
			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred);

			VN_RELE(vp);
		} else {
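			/*
			 * The page is clean; note that page_release() drops
			 * the page lock itself, whether or not it frees the
			 * page, so an explicit page_unlock() is needed only
			 * on the path that leaves the page alone.
			 */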
			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time =
		    gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}

/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
 */
void
fsflush()
{
	struct buf *bp, *dwp;
	struct hbuf *hp;
	int autoup;
	unsigned int ix, icount, count = 0;
	callb_cpr_t cprinfo;
	uint_t		bcount;
	kmutex_t	*hmp;
	struct vfssw	*vswp;

	proc_fsflush = ttoproc(curthread);
	proc_fsflush->p_cstime = 0;
	proc_fsflush->p_stime = 0;
	proc_fsflush->p_cutime = 0;
	proc_fsflush->p_utime = 0;
	bcopy("fsflush", u.u_psargs, 8);
	bcopy("fsflush", u.u_comm, 7);

	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

	/*
	 * Setup page coalescing.
	 */
	fsf_npgsz = page_num_pagesizes();
	ASSERT(fsf_npgsz < MAX_PAGESIZES);
	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
		fsf_pgcnt[ix] =
		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
	}

	autoup = v.v_autoup * hz;
	icount = v.v_autoup / tune.t_fsflushr;

	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
	sema_v(&fsflush_sema);
	mutex_enter(&fsflush_lock);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&fsflush_cv, &fsflush_lock);	/* wait for clock */
	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
	mutex_exit(&fsflush_lock);
	sema_p(&fsflush_sema);

	/*
	 * Write back all old B_DELWRI buffers on the freelist.
	 */
	bcount = 0;
	for (ix = 0; ix < v.v_hbuf; ix++) {

		hp = &hbuf[ix];
		dwp = (struct buf *)&dwbuf[ix];

		bcount += (hp->b_length);

		if (dwp->av_forw == dwp) {
			continue;
		}

		hmp = &hbuf[ix].b_lock;
		mutex_enter(hmp);
		bp = dwp->av_forw;

		/*
		 * Go down only on the delayed write lists.
		 */
		while (bp != dwp) {

			ASSERT(bp->b_flags & B_DELWRI);

			if ((bp->b_flags & B_DELWRI) &&
			    (lbolt - bp->b_start >= autoup) &&
			    sema_tryp(&bp->b_sem)) {
				bp->b_flags |= B_ASYNC;
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				if (bp->b_vp == NULL) {
					BWRITE(bp);
				} else {
					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
					    bp);
				}
				mutex_enter(hmp);
				bp = dwp->av_forw;
			} else {
				bp = bp->av_forw;
			}
		}
		mutex_exit(hmp);
	}

	/*
	 * There is no need to wake up any thread waiting on bio_mem_cv,
	 * since brelse will wake them up as soon as IO is complete.
	 */
	bfreelist.b_bcount = bcount;

	if (dopageflush)
		fsflush_do_pages();

	if (!doiflush)
		goto loop;

	/*
	 * If the system was not booted to single user mode, skip the
	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
	 */
	if ((boothowto & RB_SINGLE) == 0 &&
	    (lbolt64 / hz) < fsflush_iflush_delay)
		goto loop;

	/*
	 * Flush cached attribute information (e.g. inodes).
	 */
	if (++count >= icount) {
		count = 0;
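
		/*
		 * icount is v_autoup / t_fsflushr, so this branch runs
		 * roughly once every autoup seconds even though fsflush
		 * itself wakes up every t_fsflushr seconds.
		 */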
		/*
		 * Sync back cached data.
		 */
		RLOCK_VFSSW();
		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
				vfs_refvfssw(vswp);
				RUNLOCK_VFSSW();
				(void) fsop_sync_by_kind(vswp - vfssw,
				    SYNC_ATTR, kcred);
				vfs_unrefvfssw(vswp);
				RLOCK_VFSSW();
			}
		}
		RUNLOCK_VFSSW();
	}
	goto loop;
}
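
/*
 * Tuning sketch (example values, not defaults from this file): the knobs
 * above are conventionally adjusted via /etc/system, e.g.:
 *
 *	set autoup=240			* flush dirty data within 240s *
 *	set tune_t_fsflushr=5		* wake fsflush every 5s *
 *	set dopageflush=0		* disable the page scan entirely *
 *	set fsflush_iflush_delay=0	* revert to undelayed inode flushing *
 *
 * Larger autoup / t_fsflushr values reduce fsflush CPU time at the cost of
 * keeping modified pages in memory longer before they are written back.
 */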