/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging.
 * Currently there is only one, for rolling the log (one roll
 * thread per log).
 */

#define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
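/*
 * These are normally left at their defaults; as a sketch, they
 * could be set at boot time via /etc/system (assuming the standard
 * ufs module name), e.g.:
 *	set ufs:lufs_num_roll_bufs = 32
 */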
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
/*
 * Key for thread-specific data, used by the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
extern int		maxphys;

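/*
 * trans_roll_wait()
 *	Clear the roll state and wait for more work: reset the logmap
 *	reference bit, wake any force-roll waiters, then block
 *	(CPR-safe) for up to trans_roll_tics before marking the
 *	logmap as rolling again.
 */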
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers()
{
	/*
	 * sanity check the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
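		/*
		 * Round the delta offset down to the MAPBLOCK that
		 * contains it and convert to a disk block number.
		 */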
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a buffer
		 * that is already set up
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
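		/* only count buffers that top_read_roll left valid */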
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t *crb = rbp->rb_crb;
	buf_t *bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
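	/* a cached roll buffer is always a whole number of sectors */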
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, write through the snapshot driver */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

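	/*
	 * secmap has one bit per DEV_BSIZE sector of the MAPBLOCK
	 * (MAPBLOCKSIZE / DEV_BSIZE == 16 bits). As an illustrative
	 * example, a secmap of 0x0C07 covers sectors 0-2 and 10-11,
	 * which the loop below turns into two writes chained via
	 * b_list.
	 */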
	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, write through the snapshot driver */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK.
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{

	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by ascending blkno (a simple linked-list
	 * insertion sort) so the writes are issued to the device in
	 * roughly sequential order.
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}

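/*
 * trans_roll()
 *	The body of the roll thread (one per log). It loops forever,
 *	waiting until the logmap or the log gets full or a force roll
 *	is requested, then rolls committed deltas forward from the
 *	log to the master file system and frees up log space.
 */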
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 *	don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
599