xref: /illumos-gate/usr/src/uts/common/fs/fsflush.c (revision de81e71e031139a0a7f13b7bf64152c3faa76698)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22 /*	  All Rights Reserved  	*/
23 
24 
25 /*
26  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/tuneable.h>
34 #include <sys/inline.h>
35 #include <sys/systm.h>
36 #include <sys/proc.h>
37 #include <sys/user.h>
38 #include <sys/var.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/cred.h>
42 #include <sys/kmem.h>
43 #include <sys/vnode.h>
44 #include <sys/swap.h>
45 #include <sys/vm.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/sysinfo.h>
49 #include <sys/callb.h>
50 #include <sys/reboot.h>
51 #include <sys/time.h>
52 #include <sys/fs/ufs_inode.h>
53 #include <sys/fs/ufs_bio.h>
54 
55 #include <vm/hat.h>
56 #include <vm/page.h>
57 #include <vm/pvn.h>
58 #include <vm/seg_kmem.h>
59 
/*
 * Switches tested by fsflush() on each pass; both are enabled by default.
 */
int doiflush = 1;	/* set to zero to turn inode flushing off */
int dopageflush = 1;	/* set to zero to turn page flushing off */
62 
/*
 * Number of seconds after boot during which the inode flushing loop is
 * skipped.  Skipping it improves boot performance: the inode flushes that
 * would otherwise run during boot serve only to update atimes on files
 * which have been accessed during boot.  Set fsflush_iflush_delay to 0 to
 * revert to the old behavior.
 *
 * This does not create any filesystem danger that did not exist
 * previously.  There has always been a window between passes of the inode
 * flush loop during which the system could crash, fail to sync the
 * filesystem, and need fsck to recover; the delay only widens that window.
 *
 * Inode flushing is never delayed when booting into single user mode,
 * where the administrator may be modifying files or using fsck.
 */
int fsflush_iflush_delay = 60;
77 
78 kcondvar_t fsflush_cv;
79 static kmutex_t fsflush_lock;	/* just for the cv_wait */
80 ksema_t fsflush_sema;		/* to serialize with reboot */
81 
82 /*
83  * some statistics for fsflush_do_pages
84  */
85 typedef struct {
86 	ulong_t fsf_scan;	/* number of pages scanned */
87 	ulong_t fsf_examined;	/* number of page_t's actually examined, can */
88 				/* be less than fsf_scan due to large pages */
89 	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
90 	ulong_t fsf_modified;	/* number of modified pages found */
91 	ulong_t fsf_coalesce;	/* number of page coalesces done */
92 	ulong_t fsf_time;	/* nanoseconds of run time */
93 	ulong_t fsf_releases;	/* number of page_release() done */
94 } fsf_stat_t;
95 
96 fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
97 fsf_stat_t fsf_total;	/* total of counts */
98 ulong_t fsf_cycles;	/* number of runs refelected in fsf_total */
99 
100 /*
101  * data used to determine when we can coalesce consecutive free pages
102  * into larger pages.
103  */
104 #define	MAX_PAGESIZES	32
105 static ulong_t		fsf_npgsz;
106 static pgcnt_t		fsf_pgcnt[MAX_PAGESIZES];
107 static pgcnt_t		fsf_mask[MAX_PAGESIZES];
108 
109 
110 /*
111  * Scan page_t's and issue I/O's for modified pages.
112  *
113  * Also coalesces consecutive small sized free pages into the next larger
114  * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
115  * spent scanning on later passes and for anybody allocating large pages.
116  */
117 static void
118 fsflush_do_pages()
119 {
120 	vnode_t		*vp;
121 	ulong_t		pcount;
122 	hrtime_t	timer = gethrtime();
123 	ulong_t		releases = 0;
124 	ulong_t		nexamined = 0;
125 	ulong_t		nlocked = 0;
126 	ulong_t		nmodified = 0;
127 	ulong_t		ncoalesce = 0;
128 	int		mod;
129 	u_offset_t	offset;
130 	uint_t		szc;
131 
132 	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
133 	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
134 	uint_t		coal_cnt = 0;	    /* count of pages seen */
135 
136 	static ulong_t	nscan = 0;
137 	static pgcnt_t	last_total_pages = 0;
138 	static void	*pp_cookie = NULL;
139 	static page_t	*pp;
140 
141 	/*
142 	 * Check to see if total_pages has changed.
143 	 */
144 	if (total_pages != last_total_pages) {
145 		last_total_pages = total_pages;
146 		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
147 	}
148 
149 	/*
150 	 * On first time through initialize the cookie used for page_t scans
151 	 */
152 	if (pp_cookie == NULL)
153 		pp = page_next_scan_init(&pp_cookie);
154 
155 	pcount = 0;
156 	while (pcount < nscan) {
157 
158 		/*
159 		 * move to the next page, skipping over large pages
160 		 * and issuing prefetches.
161 		 */
162 		pp = page_next_scan_large(pp, &pcount, &pp_cookie);
163 		prefetch_page_r((void *)pp);
164 		ASSERT(pp != NULL);
165 
166 		/*
167 		 * Do a bunch of dirty tests (ie. no locking) to determine
168 		 * if we can quickly skip this page. These tests are repeated
169 		 * after acquiring the page lock.
170 		 */
171 		++nexamined;
172 		if (PP_ISSWAP(pp)) {
173 			coal_page = NULL;
174 			continue;
175 		}
176 
177 		/*
178 		 * skip free pages too, but try coalescing them into larger
179 		 * pagesizes
180 		 */
181 		if (PP_ISFREE(pp)) {
182 			/*
183 			 * skip pages with a file system identity or that
184 			 * are already maximum size
185 			 */
186 			szc = pp->p_szc;
187 			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
188 				coal_page = NULL;
189 				continue;
190 			}
191 
192 			/*
193 			 * If not in a coalescing candidate page or the size
194 			 * codes are different, start a new candidate.
195 			 */
196 			if (coal_page == NULL || coal_szc != szc) {
197 
198 				/*
199 				 * page must be properly aligned
200 				 */
201 				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
202 					coal_page = NULL;
203 					continue;
204 				}
205 				coal_page = pp;
206 				coal_szc = szc;
207 				coal_cnt = 1;
208 				continue;
209 			}
210 
211 			/*
212 			 * acceptable to add this to existing candidate page
213 			 */
214 			++coal_cnt;
215 			if (coal_cnt < fsf_pgcnt[coal_szc])
216 				continue;
217 
218 			/*
219 			 * We've got enough pages to coalesce, so do it.
220 			 * After promoting, we clear coal_page, so it will
221 			 * take another pass to promote this to an even
222 			 * larger page.
223 			 */
224 			++ncoalesce;
225 			(void) page_promote_size(coal_page, coal_szc);
226 			coal_page = NULL;
227 			continue;
228 		} else {
229 			coal_page = NULL;
230 		}
231 
232 		if (PP_ISKAS(pp) ||
233 		    PAGE_LOCKED(pp) ||
234 		    pp->p_lckcnt != 0 ||
235 		    pp->p_cowcnt != 0)
236 			continue;
237 
238 
239 		/*
240 		 * Reject pages that can't be "exclusively" locked.
241 		 */
242 		if (!page_trylock(pp, SE_EXCL))
243 			continue;
244 		++nlocked;
245 
246 
247 		/*
248 		 * After locking the page, redo the above checks.
249 		 * Since we locked the page, leave out the PAGE_LOCKED() test.
250 		 */
251 		vp = pp->p_vnode;
252 		if (PP_ISSWAP(pp) ||
253 		    PP_ISFREE(pp) ||
254 		    vp == NULL ||
255 		    PP_ISKAS(pp) ||
256 		    pp->p_lckcnt != 0 ||
257 		    pp->p_cowcnt != 0 ||
258 		    (vp->v_flag & VISSWAP) != 0) {
259 			page_unlock(pp);
260 			continue;
261 		}
262 
263 		ASSERT(vp->v_type != VCHR);
264 
265 		/*
266 		 * Check the modified bit. Leaving the bit alone in hardware.
267 		 * It will be cleared if we do the putpage.
268 		 */
269 		if (IS_VMODSORT(vp))
270 			mod = hat_ismod(pp);
271 		else
272 			mod = hat_pagesync(pp,
273 			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;
274 
275 		if (mod) {
276 			++nmodified;
277 			offset = pp->p_offset;
278 
279 			/*
280 			 * Hold the vnode before releasing the page lock
281 			 * to prevent it from being freed and re-used by
282 			 * some other thread.
283 			 */
284 			VN_HOLD(vp);
285 
286 			page_unlock(pp);
287 
288 			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
289 			    kcred, NULL);
290 
291 			VN_RELE(vp);
292 		} else {
293 
294 			/*
295 			 * Catch any pages which should be on the cache list,
296 			 * but aren't yet.
297 			 */
298 			if (hat_page_is_mapped(pp) == 0) {
299 				++releases;
300 				(void) page_release(pp, 1);
301 			} else {
302 				page_unlock(pp);
303 			}
304 		}
305 	}
306 
307 	/*
308 	 * maintain statistics
309 	 * reset every million wakeups, just to avoid overflow
310 	 */
311 	if (++fsf_cycles == 1000000) {
312 		fsf_cycles = 0;
313 		fsf_total.fsf_scan = 0;
314 		fsf_total.fsf_examined = 0;
315 		fsf_total.fsf_locked = 0;
316 		fsf_total.fsf_modified = 0;
317 		fsf_total.fsf_coalesce = 0;
318 		fsf_total.fsf_time = 0;
319 		fsf_total.fsf_releases = 0;
320 	} else {
321 		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
322 		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
323 		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
324 		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
325 		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
326 		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
327 		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
328 	}
329 }
330 
331 /*
332  * As part of file system hardening, this daemon is awakened
333  * every second to flush cached data which includes the
334  * buffer cache, the inode cache and mapped pages.
335  */
336 void
337 fsflush()
338 {
339 	struct buf *bp, *dwp;
340 	struct hbuf *hp;
341 	int autoup;
342 	unsigned int ix, icount, count = 0;
343 	callb_cpr_t cprinfo;
344 	uint_t		bcount;
345 	kmutex_t	*hmp;
346 	struct vfssw *vswp;
347 
348 	proc_fsflush = ttoproc(curthread);
349 	proc_fsflush->p_cstime = 0;
350 	proc_fsflush->p_stime =  0;
351 	proc_fsflush->p_cutime =  0;
352 	proc_fsflush->p_utime = 0;
353 	bcopy("fsflush", curproc->p_user.u_psargs, 8);
354 	bcopy("fsflush", curproc->p_user.u_comm, 7);
355 
356 	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
357 	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);
358 
359 	/*
360 	 * Setup page coalescing.
361 	 */
362 	fsf_npgsz = page_num_pagesizes();
363 	ASSERT(fsf_npgsz < MAX_PAGESIZES);
364 	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
365 		fsf_pgcnt[ix] =
366 		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
367 		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
368 	}
369 
370 	autoup = v.v_autoup * hz;
371 	icount = v.v_autoup / tune.t_fsflushr;
372 	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
373 loop:
374 	sema_v(&fsflush_sema);
375 	mutex_enter(&fsflush_lock);
376 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
377 	cv_wait(&fsflush_cv, &fsflush_lock);		/* wait for clock */
378 	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
379 	mutex_exit(&fsflush_lock);
380 	sema_p(&fsflush_sema);
381 
382 	/*
383 	 * Write back all old B_DELWRI buffers on the freelist.
384 	 */
385 	bcount = 0;
386 	for (ix = 0; ix < v.v_hbuf; ix++) {
387 
388 		hp = &hbuf[ix];
389 		dwp = (struct buf *)&dwbuf[ix];
390 
391 		bcount += (hp->b_length);
392 
393 		if (dwp->av_forw == dwp) {
394 			continue;
395 		}
396 
397 		hmp = &hbuf[ix].b_lock;
398 		mutex_enter(hmp);
399 		bp = dwp->av_forw;
400 
401 		/*
402 		 * Go down only on the delayed write lists.
403 		 */
404 		while (bp != dwp) {
405 
406 			ASSERT(bp->b_flags & B_DELWRI);
407 
408 			if ((bp->b_flags & B_DELWRI) &&
409 			    (lbolt - bp->b_start >= autoup) &&
410 			    sema_tryp(&bp->b_sem)) {
411 				bp->b_flags |= B_ASYNC;
412 				hp->b_length--;
413 				notavail(bp);
414 				mutex_exit(hmp);
415 				if (bp->b_vp == NULL) {
416 					BWRITE(bp);
417 				} else {
418 					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
419 					    bp);
420 				}
421 				mutex_enter(hmp);
422 				bp = dwp->av_forw;
423 			} else {
424 				bp = bp->av_forw;
425 			}
426 		}
427 		mutex_exit(hmp);
428 	}
429 
430 	/*
431 	 *
432 	 * There is no need to wakeup any thread waiting on bio_mem_cv
433 	 * since brelse will wake them up as soon as IO is complete.
434 	 */
435 	bfreelist.b_bcount = bcount;
436 
437 	if (dopageflush)
438 		fsflush_do_pages();
439 
440 	if (!doiflush)
441 		goto loop;
442 
443 	/*
444 	 * If the system was not booted to single user mode, skip the
445 	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
446 	 */
447 	if ((boothowto & RB_SINGLE) == 0 &&
448 	    (lbolt64 / hz) < fsflush_iflush_delay)
449 		goto loop;
450 
451 	/*
452 	 * Flush cached attribute information (e.g. inodes).
453 	 */
454 	if (++count >= icount) {
455 		count = 0;
456 
457 		/*
458 		 * Sync back cached data.
459 		 */
460 		RLOCK_VFSSW();
461 		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
462 			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
463 				vfs_refvfssw(vswp);
464 				RUNLOCK_VFSSW();
465 				(void) fsop_sync_by_kind(vswp - vfssw,
466 				    SYNC_ATTR, kcred);
467 				vfs_unrefvfssw(vswp);
468 				RLOCK_VFSSW();
469 			}
470 		}
471 		RUNLOCK_VFSSW();
472 	}
473 	goto loop;
474 }
475