/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/tuneable.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/vm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysinfo.h>
#include <sys/callb.h>
#include <sys/reboot.h>
#include <sys/time.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

int doiflush = 1;	/* non-zero to turn inode flushing on */
int dopageflush = 1;	/* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * fsflush_iflush_delay seconds after boot.  To revert to the old behavior,
 * set fsflush_iflush_delay to 0.  This creates no filesystem danger that
 * did not exist previously: there has always been a window between inode
 * flush passes during which the system could crash without having synced
 * the filesystem, leaving fsck to recover.  We have, however, widened
 * that window.  Finally, we never delay inode flushing when booting into
 * single-user mode, where the administrator may be modifying files or
 * using fsck.  This change avoids inode flushes during boot whose only
 * purpose is to update atimes on files accessed during boot.
 */
int fsflush_iflush_delay = 60;
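/*
 * Because the flush controls above are plain kernel globals, they can be
 * tuned without rebuilding the kernel.  On a stock Solaris system they
 * would typically be set from /etc/system; for example (illustrative
 * settings only, annotations are not part of the syntax):
 *
 *	set doiflush=0			disable inode flushing entirely
 *	set dopageflush=0		disable page flushing entirely
 *	set fsflush_iflush_delay=0	flush inodes from boot onward
 *
 * See the Solaris Tunable Parameters Reference Manual for the supported
 * tunables and their defaults.
 */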
kcondvar_t fsflush_cv;
static kmutex_t fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */

/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
	ulong_t fsf_scan;	/* number of pages scanned */
	ulong_t fsf_examined;	/* number of page_t's actually examined; can */
				/* be less than fsf_scan due to large pages */
	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
	ulong_t fsf_modified;	/* number of modified pages found */
	ulong_t fsf_coalesce;	/* number of page coalesces done */
	ulong_t fsf_time;	/* nanoseconds of run time */
	ulong_t fsf_releases;	/* number of page_release()s done */
} fsf_stat_t;

fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */

/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages.
 */
#define	MAX_PAGESIZES	32
static ulong_t fsf_npgsz;
static pgcnt_t fsf_pgcnt[MAX_PAGESIZES];
static pgcnt_t fsf_mask[MAX_PAGESIZES];


/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small-sized free pages into the next larger
 * pagesize.  This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	int		mod;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static void	*pp_cookie = NULL;
	static page_t	*pp;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * tune.t_fsflushr) / v.v_autoup;
	}
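	/*
	 * A note on the nscan computation above: fsflush_do_pages() runs
	 * every tune.t_fsflushr seconds and must visit all of memory once
	 * every v.v_autoup seconds, so each pass covers t_fsflushr/autoup
	 * of total_pages.  With the common defaults (t_fsflushr = 1,
	 * autoup = 30) and, say, 4 GB of 4K pages (total_pages = 1048576),
	 * each wakeup scans roughly 34,952 pages.
	 */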
	/*
	 * On first time through initialize the cookie used for page_t scans
	 */
	if (pp_cookie == NULL)
		pp = page_next_scan_init(&pp_cookie);

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		pp = page_next_scan_large(pp, &pcount, &pp_cookie);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);

		/*
		 * Do a bunch of dirty tests (i.e. no locking) to determine
		 * if we can quickly skip this page.  These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0)
			continue;


		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0 ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			continue;
		}

		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit, leaving it alone in hardware;
		 * it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
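/*
 * The counters above are ordinary kernel globals, so the per-cycle and
 * cumulative statistics can be inspected on a live system with any
 * debugger that can read kernel symbols; for example (illustrative):
 *
 *	# echo "fsf_total::print fsf_stat_t" | mdb -k
 */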
/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
 */
void
fsflush()
{
	struct buf *bp, *dwp;
	struct hbuf *hp;
	int autoup;
	unsigned int ix, icount, count = 0;
	callb_cpr_t cprinfo;
	uint_t		bcount;
	kmutex_t	*hmp;
	struct vfssw	*vswp;

	proc_fsflush = ttoproc(curthread);
	proc_fsflush->p_cstime = 0;
	proc_fsflush->p_stime = 0;
	proc_fsflush->p_cutime = 0;
	proc_fsflush->p_utime = 0;
	bcopy("fsflush", curproc->p_user.u_psargs, 8);
	bcopy("fsflush", curproc->p_user.u_comm, 7);

	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

	/*
	 * Setup page coalescing.
	 */
	fsf_npgsz = page_num_pagesizes();
	ASSERT(fsf_npgsz < MAX_PAGESIZES);
	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
		fsf_pgcnt[ix] =
		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
	}

	autoup = v.v_autoup * hz;
	icount = v.v_autoup / tune.t_fsflushr;
	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
	sema_v(&fsflush_sema);
	mutex_enter(&fsflush_lock);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&fsflush_cv, &fsflush_lock);	/* wait for clock */
	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
	mutex_exit(&fsflush_lock);
	sema_p(&fsflush_sema);

	/*
	 * Write back all old B_DELWRI buffers on the freelist.
	 */
	bcount = 0;
	for (ix = 0; ix < v.v_hbuf; ix++) {

		hp = &hbuf[ix];
		dwp = (struct buf *)&dwbuf[ix];

		bcount += (hp->b_length);

		if (dwp->av_forw == dwp) {
			continue;
		}

		hmp = &hbuf[ix].b_lock;
		mutex_enter(hmp);
		bp = dwp->av_forw;

		/*
		 * Go down only on the delayed write lists.
		 */
		while (bp != dwp) {

			ASSERT(bp->b_flags & B_DELWRI);

			if ((bp->b_flags & B_DELWRI) &&
			    (lbolt - bp->b_start >= autoup) &&
			    sema_tryp(&bp->b_sem)) {
				bp->b_flags |= B_ASYNC;
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				if (bp->b_vp == NULL) {
					BWRITE(bp);
				} else {
					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
					    bp);
				}
				mutex_enter(hmp);
				bp = dwp->av_forw;
			} else {
				bp = bp->av_forw;
			}
		}
		mutex_exit(hmp);
	}

	/*
	 * There is no need to wake up any thread waiting on bio_mem_cv
	 * since brelse will wake them up as soon as IO is complete.
	 */
	bfreelist.b_bcount = bcount;

	if (dopageflush)
		fsflush_do_pages();

	if (!doiflush)
		goto loop;

	/*
	 * If the system was not booted to single user mode, skip the
	 * inode flushing until after fsflush_iflush_delay seconds have
	 * elapsed.
	 */
	if ((boothowto & RB_SINGLE) == 0 &&
	    (lbolt64 / hz) < fsflush_iflush_delay)
		goto loop;

	/*
	 * Flush cached attribute information (e.g. inodes).
	 */
	if (++count >= icount) {
		count = 0;

		/*
		 * Sync back cached data.
		 */
		RLOCK_VFSSW();
		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
				vfs_refvfssw(vswp);
				RUNLOCK_VFSSW();
				(void) fsop_sync_by_kind(vswp - vfssw,
				    SYNC_ATTR, kcred);
				vfs_unrefvfssw(vswp);
				RLOCK_VFSSW();
			}
		}
		RUNLOCK_VFSSW();
	}
	goto loop;
}
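/*
 * For reference: fsflush() itself never sleeps on a timeout; it is the
 * clock() routine (see uts/common/os/clock.c) that signals fsflush_cv
 * once every tune.t_fsflushr seconds, roughly as follows (a simplified
 * sketch, not the exact clock.c code):
 *
 *	if (--fsflushcnt <= 0) {
 *		fsflushcnt = tune.t_fsflushr;
 *		cv_signal(&fsflush_cv);
 *	}
 */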