1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 22 /* All Rights Reserved */ 23 24 25 /* 26 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #include <sys/types.h> 31 #include <sys/t_lock.h> 32 #include <sys/param.h> 33 #include <sys/tuneable.h> 34 #include <sys/inline.h> 35 #include <sys/systm.h> 36 #include <sys/proc.h> 37 #include <sys/user.h> 38 #include <sys/var.h> 39 #include <sys/buf.h> 40 #include <sys/vfs.h> 41 #include <sys/cred.h> 42 #include <sys/kmem.h> 43 #include <sys/vnode.h> 44 #include <sys/swap.h> 45 #include <sys/vm.h> 46 #include <sys/debug.h> 47 #include <sys/cmn_err.h> 48 #include <sys/sysinfo.h> 49 #include <sys/callb.h> 50 #include <sys/reboot.h> 51 #include <sys/time.h> 52 #include <sys/fs/ufs_inode.h> 53 #include <sys/fs/ufs_bio.h> 54 55 #include <vm/hat.h> 56 #include <vm/page.h> 57 #include <vm/pvn.h> 58 #include <vm/seg_kmem.h> 59 60 int doiflush = 1; /* non-zero to turn inode flushing on */ 61 int dopageflush = 1; /* non-zero to turn page flushing on */ 62 63 /* 64 * To improve boot performance, don't run the inode flushing loop until 65 * the specified number of seconds after boot. To revert to the old 66 * behavior, set fsflush_iflush_delay to 0. We have not created any new 67 * filesystem danger that did not exist previously, since there is always a 68 * window in between when fsflush does the inode flush loop during which the 69 * system could crash, fail to sync the filesystem, and fsck will be needed 70 * to recover. We have, however, widened this window. Finally, 71 * we never delay inode flushing if we're booting into single user mode, 72 * where the administrator may be modifying files or using fsck. This 73 * modification avoids inode flushes during boot whose only purpose is to 74 * update atimes on files which have been accessed during boot. 75 */ 76 int fsflush_iflush_delay = 60; 77 78 kcondvar_t fsflush_cv; 79 static kmutex_t fsflush_lock; /* just for the cv_wait */ 80 ksema_t fsflush_sema; /* to serialize with reboot */ 81 82 /* 83 * some statistics for fsflush_do_pages 84 */ 85 typedef struct { 86 ulong_t fsf_scan; /* number of pages scanned */ 87 ulong_t fsf_examined; /* number of page_t's actually examined, can */ 88 /* be less than fsf_scan due to large pages */ 89 ulong_t fsf_locked; /* pages we actually page_lock()ed */ 90 ulong_t fsf_modified; /* number of modified pages found */ 91 ulong_t fsf_coalesce; /* number of page coalesces done */ 92 ulong_t fsf_time; /* nanoseconds of run time */ 93 ulong_t fsf_releases; /* number of page_release() done */ 94 } fsf_stat_t; 95 96 fsf_stat_t fsf_recent; /* counts for most recent duty cycle */ 97 fsf_stat_t fsf_total; /* total of counts */ 98 ulong_t fsf_cycles; /* number of runs refelected in fsf_total */ 99 100 /* 101 * data used to determine when we can coalesce consecutive free pages 102 * into larger pages. 103 */ 104 #define MAX_PAGESIZES 32 105 static ulong_t fsf_npgsz; 106 static pgcnt_t fsf_pgcnt[MAX_PAGESIZES]; 107 static pgcnt_t fsf_mask[MAX_PAGESIZES]; 108 109 110 /* 111 * Scan page_t's and issue I/O's for modified pages. 112 * 113 * Also coalesces consecutive small sized free pages into the next larger 114 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time 115 * spent scanning on later passes and for anybody allocating large pages. 116 */ 117 static void 118 fsflush_do_pages() 119 { 120 vnode_t *vp; 121 ulong_t pcount; 122 hrtime_t timer = gethrtime(); 123 ulong_t releases = 0; 124 ulong_t nexamined = 0; 125 ulong_t nlocked = 0; 126 ulong_t nmodified = 0; 127 ulong_t ncoalesce = 0; 128 ulong_t cnt; 129 int mod; 130 int fspage = 1; 131 u_offset_t offset; 132 uint_t szc; 133 134 page_t *coal_page = NULL; /* 1st page in group to coalesce */ 135 uint_t coal_szc = 0; /* size code, coal_page->p_szc */ 136 uint_t coal_cnt = 0; /* count of pages seen */ 137 138 static ulong_t nscan = 0; 139 static pgcnt_t last_total_pages = 0; 140 static page_t *pp = NULL; 141 142 /* 143 * Check to see if total_pages has changed. 144 */ 145 if (total_pages != last_total_pages) { 146 last_total_pages = total_pages; 147 nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup; 148 } 149 150 if (pp == NULL) 151 pp = memsegs->pages; 152 153 pcount = 0; 154 while (pcount < nscan) { 155 156 /* 157 * move to the next page, skipping over large pages 158 * and issuing prefetches. 159 */ 160 if (pp->p_szc && fspage == 0) { 161 pfn_t pfn; 162 163 pfn = page_pptonum(pp); 164 cnt = page_get_pagecnt(pp->p_szc); 165 cnt -= pfn & (cnt - 1); 166 } else 167 cnt = 1; 168 169 pp = page_nextn(pp, cnt); 170 prefetch_page_r((void *)pp); 171 ASSERT(pp != NULL); 172 pcount += cnt; 173 174 /* 175 * Do a bunch of dirty tests (ie. no locking) to determine 176 * if we can quickly skip this page. These tests are repeated 177 * after acquiring the page lock. 178 */ 179 ++nexamined; 180 if (PP_ISSWAP(pp)) { 181 fspage = 0; 182 coal_page = NULL; 183 continue; 184 } 185 186 /* 187 * skip free pages too, but try coalescing them into larger 188 * pagesizes 189 */ 190 if (PP_ISFREE(pp)) { 191 /* 192 * skip pages with a file system identity or that 193 * are already maximum size 194 */ 195 fspage = 0; 196 szc = pp->p_szc; 197 if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) { 198 coal_page = NULL; 199 continue; 200 } 201 202 /* 203 * If not in a coalescing candidate page or the size 204 * codes are different, start a new candidate. 205 */ 206 if (coal_page == NULL || coal_szc != szc) { 207 208 /* 209 * page must be properly aligned 210 */ 211 if ((page_pptonum(pp) & fsf_mask[szc]) != 0) { 212 coal_page = NULL; 213 continue; 214 } 215 coal_page = pp; 216 coal_szc = szc; 217 coal_cnt = 1; 218 continue; 219 } 220 221 /* 222 * acceptable to add this to existing candidate page 223 */ 224 ++coal_cnt; 225 if (coal_cnt < fsf_pgcnt[coal_szc]) 226 continue; 227 228 /* 229 * We've got enough pages to coalesce, so do it. 230 * After promoting, we clear coal_page, so it will 231 * take another pass to promote this to an even 232 * larger page. 233 */ 234 ++ncoalesce; 235 (void) page_promote_size(coal_page, coal_szc); 236 coal_page = NULL; 237 continue; 238 } else { 239 coal_page = NULL; 240 } 241 242 if (PP_ISKAS(pp) || 243 PAGE_LOCKED(pp) || 244 pp->p_lckcnt != 0 || 245 pp->p_cowcnt != 0) { 246 fspage = 0; 247 continue; 248 } 249 250 251 /* 252 * Reject pages that can't be "exclusively" locked. 253 */ 254 if (!page_trylock(pp, SE_EXCL)) 255 continue; 256 ++nlocked; 257 258 259 /* 260 * After locking the page, redo the above checks. 261 * Since we locked the page, leave out the PAGE_LOCKED() test. 262 */ 263 vp = pp->p_vnode; 264 if (PP_ISSWAP(pp) || 265 PP_ISFREE(pp) || 266 vp == NULL || 267 PP_ISKAS(pp) || 268 (vp->v_flag & VISSWAP) != 0) { 269 page_unlock(pp); 270 fspage = 0; 271 continue; 272 } 273 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 274 page_unlock(pp); 275 continue; 276 } 277 278 fspage = 1; 279 ASSERT(vp->v_type != VCHR); 280 281 /* 282 * Check the modified bit. Leaving the bit alone in hardware. 283 * It will be cleared if we do the putpage. 284 */ 285 if (IS_VMODSORT(vp)) 286 mod = hat_ismod(pp); 287 else 288 mod = hat_pagesync(pp, 289 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD; 290 291 if (mod) { 292 ++nmodified; 293 offset = pp->p_offset; 294 295 /* 296 * Hold the vnode before releasing the page lock 297 * to prevent it from being freed and re-used by 298 * some other thread. 299 */ 300 VN_HOLD(vp); 301 302 page_unlock(pp); 303 304 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC, 305 kcred, NULL); 306 307 VN_RELE(vp); 308 } else { 309 310 /* 311 * Catch any pages which should be on the cache list, 312 * but aren't yet. 313 */ 314 if (hat_page_is_mapped(pp) == 0) { 315 ++releases; 316 (void) page_release(pp, 1); 317 } else { 318 page_unlock(pp); 319 } 320 } 321 } 322 323 /* 324 * maintain statistics 325 * reset every million wakeups, just to avoid overflow 326 */ 327 if (++fsf_cycles == 1000000) { 328 fsf_cycles = 0; 329 fsf_total.fsf_scan = 0; 330 fsf_total.fsf_examined = 0; 331 fsf_total.fsf_locked = 0; 332 fsf_total.fsf_modified = 0; 333 fsf_total.fsf_coalesce = 0; 334 fsf_total.fsf_time = 0; 335 fsf_total.fsf_releases = 0; 336 } else { 337 fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan; 338 fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined; 339 fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked; 340 fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified; 341 fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce; 342 fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer; 343 fsf_total.fsf_releases += fsf_recent.fsf_releases = releases; 344 } 345 } 346 347 /* 348 * As part of file system hardening, this daemon is awakened 349 * every second to flush cached data which includes the 350 * buffer cache, the inode cache and mapped pages. 351 */ 352 void 353 fsflush() 354 { 355 struct buf *bp, *dwp; 356 struct hbuf *hp; 357 int autoup; 358 unsigned int ix, icount, count = 0; 359 callb_cpr_t cprinfo; 360 uint_t bcount; 361 kmutex_t *hmp; 362 struct vfssw *vswp; 363 364 proc_fsflush = ttoproc(curthread); 365 proc_fsflush->p_cstime = 0; 366 proc_fsflush->p_stime = 0; 367 proc_fsflush->p_cutime = 0; 368 proc_fsflush->p_utime = 0; 369 bcopy("fsflush", curproc->p_user.u_psargs, 8); 370 bcopy("fsflush", curproc->p_user.u_comm, 7); 371 372 mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL); 373 sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL); 374 375 /* 376 * Setup page coalescing. 377 */ 378 fsf_npgsz = page_num_pagesizes(); 379 ASSERT(fsf_npgsz < MAX_PAGESIZES); 380 for (ix = 0; ix < fsf_npgsz - 1; ++ix) { 381 fsf_pgcnt[ix] = 382 page_get_pagesize(ix + 1) / page_get_pagesize(ix); 383 fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1; 384 } 385 386 autoup = v.v_autoup * hz; 387 icount = v.v_autoup / tune.t_fsflushr; 388 CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush"); 389 loop: 390 sema_v(&fsflush_sema); 391 mutex_enter(&fsflush_lock); 392 CALLB_CPR_SAFE_BEGIN(&cprinfo); 393 cv_wait(&fsflush_cv, &fsflush_lock); /* wait for clock */ 394 CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock); 395 mutex_exit(&fsflush_lock); 396 sema_p(&fsflush_sema); 397 398 /* 399 * Write back all old B_DELWRI buffers on the freelist. 400 */ 401 bcount = 0; 402 for (ix = 0; ix < v.v_hbuf; ix++) { 403 404 hp = &hbuf[ix]; 405 dwp = (struct buf *)&dwbuf[ix]; 406 407 bcount += (hp->b_length); 408 409 if (dwp->av_forw == dwp) { 410 continue; 411 } 412 413 hmp = &hbuf[ix].b_lock; 414 mutex_enter(hmp); 415 bp = dwp->av_forw; 416 417 /* 418 * Go down only on the delayed write lists. 419 */ 420 while (bp != dwp) { 421 422 ASSERT(bp->b_flags & B_DELWRI); 423 424 if ((bp->b_flags & B_DELWRI) && 425 (ddi_get_lbolt() - bp->b_start >= autoup) && 426 sema_tryp(&bp->b_sem)) { 427 bp->b_flags |= B_ASYNC; 428 hp->b_length--; 429 notavail(bp); 430 mutex_exit(hmp); 431 if (bp->b_vp == NULL) { 432 BWRITE(bp); 433 } else { 434 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, 435 bp); 436 } 437 mutex_enter(hmp); 438 bp = dwp->av_forw; 439 } else { 440 bp = bp->av_forw; 441 } 442 } 443 mutex_exit(hmp); 444 } 445 446 /* 447 * 448 * There is no need to wakeup any thread waiting on bio_mem_cv 449 * since brelse will wake them up as soon as IO is complete. 450 */ 451 bfreelist.b_bcount = bcount; 452 453 if (dopageflush) 454 fsflush_do_pages(); 455 456 if (!doiflush) 457 goto loop; 458 459 /* 460 * If the system was not booted to single user mode, skip the 461 * inode flushing until after fsflush_iflush_delay secs have elapsed. 462 */ 463 if ((boothowto & RB_SINGLE) == 0 && 464 (ddi_get_lbolt64() / hz) < fsflush_iflush_delay) 465 goto loop; 466 467 /* 468 * Flush cached attribute information (e.g. inodes). 469 */ 470 if (++count >= icount) { 471 count = 0; 472 473 /* 474 * Sync back cached data. 475 */ 476 RLOCK_VFSSW(); 477 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 478 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 479 vfs_refvfssw(vswp); 480 RUNLOCK_VFSSW(); 481 (void) fsop_sync_by_kind(vswp - vfssw, 482 SYNC_ATTR, kcred); 483 vfs_unrefvfssw(vswp); 484 RLOCK_VFSSW(); 485 } 486 } 487 RUNLOCK_VFSSW(); 488 } 489 goto loop; 490 } 491