/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/tuneable.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/vm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysinfo.h>
#include <sys/callb.h>
#include <sys/reboot.h>
#include <sys/time.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

int doiflush = 1;	/* non-zero to turn inode flushing on */
int dopageflush = 1;	/* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot.  To revert to the old
 * behavior, set fsflush_iflush_delay to 0.  We have not created any new
 * filesystem danger that did not exist previously, since there is always a
 * window in between when fsflush does the inode flush loop during which the
 * system could crash, fail to sync the filesystem, and fsck will be needed
 * to recover.  We have, however, widened this window.  Finally,
 * we never delay inode flushing if we're booting into single user mode,
 * where the administrator may be modifying files or using fsck.  This
 * modification avoids inode flushes during boot whose only purpose is to
 * update atimes on files which have been accessed during boot.
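 *
 * Note: since this is a plain kernel integer, the delay can be tuned
 * without a rebuild, e.g. with an /etc/system entry such as:
 *
 *	set fsflush_iflush_delay = 0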
 */
int fsflush_iflush_delay = 60;

kcondvar_t fsflush_cv;
static kmutex_t	fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */

/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
	ulong_t fsf_scan;	/* number of pages scanned */
	ulong_t fsf_examined;	/* number of page_t's actually examined, can */
				/* be less than fsf_scan due to large pages */
	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
	ulong_t fsf_modified;	/* number of modified pages found */
	ulong_t fsf_coalesce;	/* number of page coalesces done */
	ulong_t fsf_time;	/* nanoseconds of run time */
	ulong_t fsf_releases;	/* number of page_release() done */
} fsf_stat_t;

fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */

/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages.
 */
#define	MAX_PAGESIZES	32
static ulong_t	fsf_npgsz;
static pgcnt_t	fsf_pgcnt[MAX_PAGESIZES];
static pgcnt_t	fsf_mask[MAX_PAGESIZES];


/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize.  This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	int		mod;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static void	*pp_cookie = NULL;
	static page_t	*pp;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr)) / v.v_autoup;
	}

	/*
	 * On first time through initialize the cookie used for page_t scans
	 */
	if (pp_cookie == NULL)
		pp = page_next_scan_init(&pp_cookie);

	pcount = 0;
	while (pcount <= nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		pp = page_next_scan_large(pp, &pcount, &pp_cookie);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);

		/*
		 * Do a bunch of dirty tests (i.e. no locking) to determine
		 * if we can quickly skip this page.  These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
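			 * A candidate run must also begin on a page frame
			 * number that is naturally aligned for the next
			 * larger page size; fsf_mask[szc], set up in
			 * fsflush(), is used below to reject misaligned
			 * starts.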
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		if (pp->p_vnode == &kvp ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0)
			continue;


		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    vp == &kvp ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0 ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			continue;
		}

		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit, but leave it alone in the
		 * hardware; it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}

/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
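 *
 * More precisely, the clock signals fsflush_cv every tune.t_fsflushr
 * seconds (one second by default).  Each wakeup writes back delayed-write
 * buffers older than v.v_autoup seconds, and the attribute (inode) sync
 * below runs once every v.v_autoup / tune.t_fsflushr wakeups.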
 */
void
fsflush()
{
	struct buf *bp, *dwp;
	struct hbuf *hp;
	int autoup;
	unsigned int ix, icount, count = 0;
	callb_cpr_t cprinfo;
	uint_t		bcount;
	kmutex_t	*hmp;
	struct vfssw *vswp;

	proc_fsflush = ttoproc(curthread);
	proc_fsflush->p_cstime = 0;
	proc_fsflush->p_stime = 0;
	proc_fsflush->p_cutime = 0;
	proc_fsflush->p_utime = 0;

	/*
	 * Set the name shown by ps(1); the 8-byte copy includes the
	 * terminating NUL of "fsflush".
	 */
	bcopy("fsflush", u.u_psargs, 8);
	bcopy("fsflush", u.u_comm, 7);

	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

	/*
	 * Setup page coalescing.
	 */
	fsf_npgsz = page_num_pagesizes();
	ASSERT(fsf_npgsz < MAX_PAGESIZES);
	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
		fsf_pgcnt[ix] =
		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
	}

	autoup = v.v_autoup * hz;
	icount = v.v_autoup / tune.t_fsflushr;
	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
	sema_v(&fsflush_sema);
	mutex_enter(&fsflush_lock);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&fsflush_cv, &fsflush_lock);	/* wait for clock */
	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
	mutex_exit(&fsflush_lock);
	sema_p(&fsflush_sema);

	/*
	 * Write back all old B_DELWRI buffers on the freelist.
	 */
	bcount = 0;
	for (ix = 0; ix < v.v_hbuf; ix++) {

		hp = &hbuf[ix];
		dwp = (struct buf *)&dwbuf[ix];

		bcount += (hp->b_length);

		if (dwp->av_forw == dwp) {
			continue;
		}

		hmp = &hbuf[ix].b_lock;
		mutex_enter(hmp);
		bp = dwp->av_forw;

		/*
		 * Go down only on the delayed write lists.
		 */
		while (bp != dwp) {

			ASSERT(bp->b_flags & B_DELWRI);

			if ((bp->b_flags & B_DELWRI) &&
			    (lbolt - bp->b_start >= autoup) &&
			    sema_tryp(&bp->b_sem)) {
				bp->b_flags |= B_ASYNC;
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				if (bp->b_vp == NULL) {
					BWRITE(bp);
				} else {
					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
					    bp);
				}
				mutex_enter(hmp);
				bp = dwp->av_forw;
			} else {
				bp = bp->av_forw;
			}
		}
		mutex_exit(hmp);
	}

	/*
	 * There is no need to wakeup any thread waiting on bio_mem_cv
	 * since brelse will wake them up as soon as IO is complete.
	 */
	bfreelist.b_bcount = bcount;

	if (dopageflush)
		fsflush_do_pages();

	if (!doiflush)
		goto loop;

	/*
	 * If the system was not booted to single user mode, skip the
	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
	 */
	if ((boothowto & RB_SINGLE) == 0 &&
	    (lbolt64 / hz) < fsflush_iflush_delay)
		goto loop;

	/*
	 * Flush cached attribute information (e.g. inodes).
	 */
	if (++count >= icount) {
		count = 0;

		/*
		 * Sync back cached data.
		 */
		RLOCK_VFSSW();
		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
				vfs_refvfssw(vswp);
				RUNLOCK_VFSSW();
				(void) fsop_sync_by_kind(vswp - vfssw,
				    SYNC_ATTR, kcred);
				vfs_unrefvfssw(vswp);
				RLOCK_VFSSW();
			}
		}
		RUNLOCK_VFSSW();
	}
	goto loop;
}
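
/*
 * A quick sketch for inspecting the fsf_recent/fsf_total statistics on a
 * live system (assuming mdb(1) with kernel type information available):
 *
 *	# mdb -k
 *	> fsf_recent::print fsf_stat_t
 *	> fsf_total::print fsf_stat_t
 */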