1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 22 /* All Rights Reserved */ 23 24 25 /* 26 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 #include <sys/types.h> 33 #include <sys/t_lock.h> 34 #include <sys/param.h> 35 #include <sys/tuneable.h> 36 #include <sys/inline.h> 37 #include <sys/systm.h> 38 #include <sys/proc.h> 39 #include <sys/user.h> 40 #include <sys/var.h> 41 #include <sys/buf.h> 42 #include <sys/vfs.h> 43 #include <sys/cred.h> 44 #include <sys/kmem.h> 45 #include <sys/vnode.h> 46 #include <sys/swap.h> 47 #include <sys/vm.h> 48 #include <sys/debug.h> 49 #include <sys/cmn_err.h> 50 #include <sys/sysinfo.h> 51 #include <sys/callb.h> 52 #include <sys/reboot.h> 53 #include <sys/time.h> 54 #include <sys/fs/ufs_inode.h> 55 #include <sys/fs/ufs_bio.h> 56 57 #include <vm/hat.h> 58 #include <vm/page.h> 59 #include <vm/pvn.h> 60 #include <vm/seg_kmem.h> 61 62 int doiflush = 1; /* non-zero to turn inode flushing on */ 63 int dopageflush = 1; /* non-zero to turn page flushing on */ 64 65 /* 66 * To improve boot performance, don't run the inode flushing loop until 67 * the specified number of seconds after boot. To revert to the old 68 * behavior, set fsflush_iflush_delay to 0. We have not created any new 69 * filesystem danger that did not exist previously, since there is always a 70 * window in between when fsflush does the inode flush loop during which the 71 * system could crash, fail to sync the filesystem, and fsck will be needed 72 * to recover. We have, however, widened this window. Finally, 73 * we never delay inode flushing if we're booting into single user mode, 74 * where the administrator may be modifying files or using fsck. This 75 * modification avoids inode flushes during boot whose only purpose is to 76 * update atimes on files which have been accessed during boot. 77 */ 78 int fsflush_iflush_delay = 60; 79 80 kcondvar_t fsflush_cv; 81 static kmutex_t fsflush_lock; /* just for the cv_wait */ 82 ksema_t fsflush_sema; /* to serialize with reboot */ 83 84 /* 85 * some statistics for fsflush_do_pages 86 */ 87 typedef struct { 88 ulong_t fsf_scan; /* number of pages scanned */ 89 ulong_t fsf_examined; /* number of page_t's actually examined, can */ 90 /* be less than fsf_scan due to large pages */ 91 ulong_t fsf_locked; /* pages we actually page_lock()ed */ 92 ulong_t fsf_modified; /* number of modified pages found */ 93 ulong_t fsf_coalesce; /* number of page coalesces done */ 94 ulong_t fsf_time; /* nanoseconds of run time */ 95 ulong_t fsf_releases; /* number of page_release() done */ 96 } fsf_stat_t; 97 98 fsf_stat_t fsf_recent; /* counts for most recent duty cycle */ 99 fsf_stat_t fsf_total; /* total of counts */ 100 ulong_t fsf_cycles; /* number of runs refelected in fsf_total */ 101 102 /* 103 * data used to determine when we can coalesce consecutive free pages 104 * into larger pages. 105 */ 106 #define MAX_PAGESIZES 32 107 static ulong_t fsf_npgsz; 108 static pgcnt_t fsf_pgcnt[MAX_PAGESIZES]; 109 static pgcnt_t fsf_mask[MAX_PAGESIZES]; 110 111 112 /* 113 * Scan page_t's and issue I/O's for modified pages. 114 * 115 * Also coalesces consecutive small sized free pages into the next larger 116 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time 117 * spent scanning on later passes and for anybody allocating large pages. 118 */ 119 static void 120 fsflush_do_pages() 121 { 122 vnode_t *vp; 123 ulong_t pcount; 124 hrtime_t timer = gethrtime(); 125 ulong_t releases = 0; 126 ulong_t nexamined = 0; 127 ulong_t nlocked = 0; 128 ulong_t nmodified = 0; 129 ulong_t ncoalesce = 0; 130 int mod; 131 u_offset_t offset; 132 uint_t szc; 133 134 page_t *coal_page = NULL; /* 1st page in group to coalesce */ 135 uint_t coal_szc = 0; /* size code, coal_page->p_szc */ 136 uint_t coal_cnt = 0; /* count of pages seen */ 137 138 static ulong_t nscan = 0; 139 static pgcnt_t last_total_pages = 0; 140 static void *pp_cookie = NULL; 141 static page_t *pp; 142 143 /* 144 * Check to see if total_pages has changed. 145 */ 146 if (total_pages != last_total_pages) { 147 last_total_pages = total_pages; 148 nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup; 149 } 150 151 /* 152 * On first time through initialize the cookie used for page_t scans 153 */ 154 if (pp_cookie == NULL) 155 pp = page_next_scan_init(&pp_cookie); 156 157 pcount = 0; 158 while (pcount <= nscan) { 159 160 /* 161 * move to the next page, skipping over large pages 162 * and issuing prefetches. 163 */ 164 pp = page_next_scan_large(pp, &pcount, &pp_cookie); 165 prefetch_page_r((void *)pp); 166 ASSERT(pp != NULL); 167 168 /* 169 * Do a bunch of dirty tests (ie. no locking) to determine 170 * if we can quickly skip this page. These tests are repeated 171 * after acquiring the page lock. 172 */ 173 ++nexamined; 174 if (PP_ISSWAP(pp)) { 175 coal_page = NULL; 176 continue; 177 } 178 179 /* 180 * skip free pages too, but try coalescing them into larger 181 * pagesizes 182 */ 183 if (PP_ISFREE(pp)) { 184 /* 185 * skip pages with a file system identity or that 186 * are already maximum size 187 */ 188 szc = pp->p_szc; 189 if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) { 190 coal_page = NULL; 191 continue; 192 } 193 194 /* 195 * If not in a coalescing candidate page or the size 196 * codes are different, start a new candidate. 197 */ 198 if (coal_page == NULL || coal_szc != szc) { 199 200 /* 201 * page must be properly aligned 202 */ 203 if ((page_pptonum(pp) & fsf_mask[szc]) != 0) { 204 coal_page = NULL; 205 continue; 206 } 207 coal_page = pp; 208 coal_szc = szc; 209 coal_cnt = 1; 210 continue; 211 } 212 213 /* 214 * acceptable to add this to existing candidate page 215 */ 216 ++coal_cnt; 217 if (coal_cnt < fsf_pgcnt[coal_szc]) 218 continue; 219 220 /* 221 * We've got enough pages to coalesce, so do it. 222 * After promoting, we clear coal_page, so it will 223 * take another pass to promote this to an even 224 * larger page. 225 */ 226 ++ncoalesce; 227 (void) page_promote_size(coal_page, coal_szc); 228 coal_page = NULL; 229 continue; 230 } else { 231 coal_page = NULL; 232 } 233 234 if (PP_ISKAS(pp) || 235 PAGE_LOCKED(pp) || 236 pp->p_lckcnt != 0 || 237 pp->p_cowcnt != 0) 238 continue; 239 240 241 /* 242 * Reject pages that can't be "exclusively" locked. 243 */ 244 if (!page_trylock(pp, SE_EXCL)) 245 continue; 246 ++nlocked; 247 248 249 /* 250 * After locking the page, redo the above checks. 251 * Since we locked the page, leave out the PAGE_LOCKED() test. 252 */ 253 vp = pp->p_vnode; 254 if (PP_ISSWAP(pp) || 255 PP_ISFREE(pp) || 256 vp == NULL || 257 PP_ISKAS(pp) || 258 pp->p_lckcnt != 0 || 259 pp->p_cowcnt != 0 || 260 (vp->v_flag & VISSWAP) != 0) { 261 page_unlock(pp); 262 continue; 263 } 264 265 ASSERT(vp->v_type != VCHR); 266 267 /* 268 * Check the modified bit. Leaving the bit alone in hardware. 269 * It will be cleared if we do the putpage. 270 */ 271 if (IS_VMODSORT(vp)) 272 mod = hat_ismod(pp); 273 else 274 mod = hat_pagesync(pp, 275 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD; 276 277 if (mod) { 278 ++nmodified; 279 offset = pp->p_offset; 280 281 /* 282 * Hold the vnode before releasing the page lock 283 * to prevent it from being freed and re-used by 284 * some other thread. 285 */ 286 VN_HOLD(vp); 287 288 page_unlock(pp); 289 290 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC, 291 kcred, NULL); 292 293 VN_RELE(vp); 294 } else { 295 296 /* 297 * Catch any pages which should be on the cache list, 298 * but aren't yet. 299 */ 300 if (hat_page_is_mapped(pp) == 0) { 301 ++releases; 302 (void) page_release(pp, 1); 303 } else { 304 page_unlock(pp); 305 } 306 } 307 } 308 309 /* 310 * maintain statistics 311 * reset every million wakeups, just to avoid overflow 312 */ 313 if (++fsf_cycles == 1000000) { 314 fsf_cycles = 0; 315 fsf_total.fsf_scan = 0; 316 fsf_total.fsf_examined = 0; 317 fsf_total.fsf_locked = 0; 318 fsf_total.fsf_modified = 0; 319 fsf_total.fsf_coalesce = 0; 320 fsf_total.fsf_time = 0; 321 fsf_total.fsf_releases = 0; 322 } else { 323 fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan; 324 fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined; 325 fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked; 326 fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified; 327 fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce; 328 fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer; 329 fsf_total.fsf_releases += fsf_recent.fsf_releases = releases; 330 } 331 } 332 333 /* 334 * As part of file system hardening, this daemon is awakened 335 * every second to flush cached data which includes the 336 * buffer cache, the inode cache and mapped pages. 337 */ 338 void 339 fsflush() 340 { 341 struct buf *bp, *dwp; 342 struct hbuf *hp; 343 int autoup; 344 unsigned int ix, icount, count = 0; 345 callb_cpr_t cprinfo; 346 uint_t bcount; 347 kmutex_t *hmp; 348 struct vfssw *vswp; 349 350 proc_fsflush = ttoproc(curthread); 351 proc_fsflush->p_cstime = 0; 352 proc_fsflush->p_stime = 0; 353 proc_fsflush->p_cutime = 0; 354 proc_fsflush->p_utime = 0; 355 bcopy("fsflush", curproc->p_user.u_psargs, 8); 356 bcopy("fsflush", curproc->p_user.u_comm, 7); 357 358 mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL); 359 sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL); 360 361 /* 362 * Setup page coalescing. 363 */ 364 fsf_npgsz = page_num_pagesizes(); 365 ASSERT(fsf_npgsz < MAX_PAGESIZES); 366 for (ix = 0; ix < fsf_npgsz - 1; ++ix) { 367 fsf_pgcnt[ix] = 368 page_get_pagesize(ix + 1) / page_get_pagesize(ix); 369 fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1; 370 } 371 372 autoup = v.v_autoup * hz; 373 icount = v.v_autoup / tune.t_fsflushr; 374 CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush"); 375 loop: 376 sema_v(&fsflush_sema); 377 mutex_enter(&fsflush_lock); 378 CALLB_CPR_SAFE_BEGIN(&cprinfo); 379 cv_wait(&fsflush_cv, &fsflush_lock); /* wait for clock */ 380 CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock); 381 mutex_exit(&fsflush_lock); 382 sema_p(&fsflush_sema); 383 384 /* 385 * Write back all old B_DELWRI buffers on the freelist. 386 */ 387 bcount = 0; 388 for (ix = 0; ix < v.v_hbuf; ix++) { 389 390 hp = &hbuf[ix]; 391 dwp = (struct buf *)&dwbuf[ix]; 392 393 bcount += (hp->b_length); 394 395 if (dwp->av_forw == dwp) { 396 continue; 397 } 398 399 hmp = &hbuf[ix].b_lock; 400 mutex_enter(hmp); 401 bp = dwp->av_forw; 402 403 /* 404 * Go down only on the delayed write lists. 405 */ 406 while (bp != dwp) { 407 408 ASSERT(bp->b_flags & B_DELWRI); 409 410 if ((bp->b_flags & B_DELWRI) && 411 (lbolt - bp->b_start >= autoup) && 412 sema_tryp(&bp->b_sem)) { 413 bp->b_flags |= B_ASYNC; 414 hp->b_length--; 415 notavail(bp); 416 mutex_exit(hmp); 417 if (bp->b_vp == NULL) { 418 BWRITE(bp); 419 } else { 420 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, 421 bp); 422 } 423 mutex_enter(hmp); 424 bp = dwp->av_forw; 425 } else { 426 bp = bp->av_forw; 427 } 428 } 429 mutex_exit(hmp); 430 } 431 432 /* 433 * 434 * There is no need to wakeup any thread waiting on bio_mem_cv 435 * since brelse will wake them up as soon as IO is complete. 436 */ 437 bfreelist.b_bcount = bcount; 438 439 if (dopageflush) 440 fsflush_do_pages(); 441 442 if (!doiflush) 443 goto loop; 444 445 /* 446 * If the system was not booted to single user mode, skip the 447 * inode flushing until after fsflush_iflush_delay secs have elapsed. 448 */ 449 if ((boothowto & RB_SINGLE) == 0 && 450 (lbolt64 / hz) < fsflush_iflush_delay) 451 goto loop; 452 453 /* 454 * Flush cached attribute information (e.g. inodes). 455 */ 456 if (++count >= icount) { 457 count = 0; 458 459 /* 460 * Sync back cached data. 461 */ 462 RLOCK_VFSSW(); 463 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 464 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 465 vfs_refvfssw(vswp); 466 RUNLOCK_VFSSW(); 467 (void) fsop_sync_by_kind(vswp - vfssw, 468 SYNC_ATTR, kcred); 469 vfs_unrefvfssw(vswp); 470 RLOCK_VFSSW(); 471 } 472 } 473 RUNLOCK_VFSSW(); 474 } 475 goto loop; 476 } 477