xref: /titanic_44/usr/src/uts/common/fs/ufs/lufs_thread.c (revision b00044a2eb43864b8718585d21949611a2ee59ef)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <sys/types.h>
30 #include <sys/vnode.h>
31 #include <sys/errno.h>
32 #include <sys/sysmacros.h>
33 #include <sys/debug.h>
34 #include <sys/kmem.h>
35 #include <sys/conf.h>
36 #include <sys/proc.h>
37 #include <sys/cmn_err.h>
38 #include <sys/fssnap_if.h>
39 #include <sys/fs/ufs_inode.h>
40 #include <sys/fs/ufs_filio.h>
41 #include <sys/fs/ufs_log.h>
42 #include <sys/fs/ufs_bio.h>
43 #include <sys/inttypes.h>
44 #include <sys/callb.h>
45 #include <sys/tnf_probe.h>
46 
47 /*
48  * Kernel threads for logging
49  * Currently only one for rolling the log (one per log).
50  */
51 
/*
 * Default, minimum and maximum number of roll buffers.  These only
 * provide the initial values of the lufs_*_roll_bufs tunables below.
 */
#define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 *
 * logmap_need_roll(logmap)
 *	true once the logmap's mtm_nme count exceeds the logmap_maxnme
 *	tunable.
 * ldl_empty(ul)
 *	true when the log unit's head and tail offsets (un_head_lof and
 *	un_tail_lof) are equal.
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 *
 * lufs_num_roll_bufs
 *	requested number of roll buffers; log_roll_buffers() returns
 *	lufs_min_roll_bufs or lufs_max_roll_bufs instead when the request
 *	falls below or above those bounds.
 * logmap_maxnme
 *	threshold used by logmap_need_roll().
 * trans_roll_tics
 *	number of clock ticks added to lbolt to form the timeout used by
 *	trans_roll_wait(); when still 0, trans_roll() sets it to 5 * hz.
 * trans_roll_new_delta
 *	counter bumped by log_roll_read() each time it finds a new delta
 *	for a MAPBLOCK that already has a roll buffer set up.
 * lrr_wait
 *	counter bumped by log_roll_read() each time it has to wait on the
 *	logmap rwlock because the deltas of its first buffer were in use.
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling.  trans_roll() stores a non-NULL value
 * under this key when the thread starts.
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
extern int		maxphys;
84 
85 static void
86 trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
87 {
88 	mutex_enter(&logmap->mtm_mutex);
89 	logmap->mtm_ref = 0;
90 	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
91 		cv_broadcast(&logmap->mtm_from_roll_cv);
92 	}
93 	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
94 	CALLB_CPR_SAFE_BEGIN(cprinfop);
95 	(void) cv_timedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
96 	    lbolt + trans_roll_tics);
97 	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
98 	logmap->mtm_flags |= MTM_ROLLING;
99 	mutex_exit(&logmap->mtm_mutex);
100 }
101 
102 /*
103  * returns the number of 8K buffers to use for rolling the log
104  */
105 static uint32_t
106 log_roll_buffers()
107 {
108 	/*
109 	 * sanity validate the tunable lufs_num_roll_bufs
110 	 */
111 	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
112 		return (lufs_min_roll_bufs);
113 	}
114 	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
115 		return (lufs_max_roll_bufs);
116 	}
117 	return (lufs_num_roll_bufs);
118 }
119 
/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 *
 * Parameters:
 *	ul		log unit being rolled
 *	rbs		array of roll buffer headers; up to nmblk of them
 *			are set up, starting at rbs[0]
 *	nmblk		maximum number of MAPBLOCKs to set up in one pass
 *	roll_bufs	data area backing the roll buffers; buffer number
 *			n uses the bytes at roll_bufs + (n << MAPBLOCKSHIFT)
 *	retnbuf		out: number of buffers set up; only written when
 *			0 is returned
 *
 * Returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		/*
		 * Mask the offset with MAPBLOCKMASK (presumably rounding
		 * it down to its MAPBLOCK boundary) and convert it to a
		 * disk block number.
		 */
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a set up buffer
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 *
		 * The scan above left rbp at rbs[nbuf], the first roll
		 * buffer not yet in use.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		/*
		 * A buffer that comes back flagged B_INVAL is not counted,
		 * so its slot is reused for the next MAPBLOCK.
		 */
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 *
	 * An error does not end the loop early: every buffer still gets
	 * its logmap_list_put_roll() call, and the failure is reported
	 * once the logmap lock has been dropped.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}
251 
252 /*
253  * Write out a cached roll buffer
254  */
255 void
256 log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
257 {
258 	crb_t *crb = rbp->rb_crb;
259 	buf_t *bp = &rbp->rb_bh;
260 
261 	bp->b_blkno = lbtodb(crb->c_mof);
262 	bp->b_un.b_addr = crb->c_buf;
263 	bp->b_bcount = crb->c_nb;
264 	bp->b_bufsize = crb->c_nb;
265 	ASSERT((crb->c_nb & DEV_BMASK) == 0);
266 	bp->b_flags = B_WRITE;
267 	logstats.ls_rwrites.value.ui64++;
268 
269 	/* if snapshots are enabled, call it */
270 	if (ufsvfsp->vfs_snapshot) {
271 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
272 	} else {
273 		(void) bdev_strategy(bp);
274 	}
275 }
276 
277 /*
278  * Write out a set of non cached roll buffers
279  */
280 void
281 log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
282 {
283 	buf_t		*bp = &rbp->rb_bh;
284 	buf_t		*bp2;
285 	rbsecmap_t	secmap = rbp->rb_secmap;
286 	int		j, k;
287 
288 	ASSERT(secmap);
289 	ASSERT((bp->b_flags & B_INVAL) == 0);
290 
291 	do { /* for each contiguous block of sectors */
292 		/* find start of next sector to write */
293 		for (j = 0; j < 16; ++j) {
294 			if (secmap & UINT16_C(1))
295 				break;
296 			secmap >>= 1;
297 		}
298 		bp->b_un.b_addr += (j << DEV_BSHIFT);
299 		bp->b_blkno += j;
300 
301 		/* calculate number of sectors */
302 		secmap >>= 1;
303 		j++;
304 		for (k = 1; j < 16; ++j) {
305 			if ((secmap & UINT16_C(1)) == 0)
306 				break;
307 			secmap >>= 1;
308 			k++;
309 		}
310 		bp->b_bcount = k << DEV_BSHIFT;
311 		bp->b_flags = B_WRITE;
312 		logstats.ls_rwrites.value.ui64++;
313 
314 		/* if snapshots are enabled, call it */
315 		if (ufsvfsp->vfs_snapshot)
316 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
317 		else
318 			(void) bdev_strategy(bp);
319 		if (secmap) {
320 			/*
321 			 * Allocate another buf_t to handle
322 			 * the next write in this MAPBLOCK
323 			 * Chain them via b_list.
324 			 */
325 			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
326 			bp->b_list = bp2;
327 			bioinit(bp2);
328 			bp2->b_iodone = trans_not_done;
329 			bp2->b_bufsize = MAPBLOCKSIZE;
330 			bp2->b_edev = bp->b_edev;
331 			bp2->b_un.b_addr =
332 			    bp->b_un.b_addr + bp->b_bcount;
333 			bp2->b_blkno = bp->b_blkno + k;
334 			bp = bp2;
335 		}
336 	} while (secmap);
337 }
338 
339 /*
340  * Asynchronously roll the deltas, using the sector map
341  * in each rollbuf_t.
342  */
343 int
344 log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
345 {
346 
347 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
348 	rollbuf_t	*rbp;
349 	buf_t		*bp, *bp2;
350 	rollbuf_t	*head, *prev, *rbp2;
351 
352 	/*
353 	 * Order the buffers by blkno
354 	 */
355 	ASSERT(nbuf > 0);
356 #ifdef lint
357 	prev = rbs;
358 #endif
359 	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
360 		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
361 			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
362 				if (rbp2 == head) {
363 					rbp->rb_next = head;
364 					head = rbp;
365 				} else {
366 					prev->rb_next = rbp;
367 					rbp->rb_next = rbp2;
368 				}
369 				break;
370 			}
371 		}
372 		if (rbp2 == NULL) {
373 			prev->rb_next = rbp;
374 			rbp->rb_next = NULL;
375 		}
376 	}
377 
378 	/*
379 	 * issue the in-order writes
380 	 */
381 	for (rbp = head; rbp; rbp = rbp2) {
382 		if (rbp->rb_crb) {
383 			log_roll_write_crb(ufsvfsp, rbp);
384 		} else {
385 			log_roll_write_bufs(ufsvfsp, rbp);
386 		}
387 		/* null out the rb_next link for next set of rolling */
388 		rbp2 = rbp->rb_next;
389 		rbp->rb_next = NULL;
390 	}
391 
392 	/*
393 	 * wait for all the writes to finish
394 	 */
395 	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
396 		bp = &rbp->rb_bh;
397 		if (trans_not_wait(bp)) {
398 			ldl_seterror(ul,
399 			    "Error writing master during ufs log roll");
400 		}
401 
402 		/*
403 		 * Now wait for all the "cloned" buffer writes (if any)
404 		 * and free those headers
405 		 */
406 		bp2 = bp->b_list;
407 		bp->b_list = NULL;
408 		while (bp2) {
409 			if (trans_not_wait(bp2)) {
410 				ldl_seterror(ul,
411 				    "Error writing master during ufs log roll");
412 			}
413 			bp = bp2;
414 			bp2 = bp2->b_list;
415 			kmem_free(bp, sizeof (buf_t));
416 		}
417 	}
418 
419 	if (ul->un_flags & LDL_ERROR)
420 		return (1);
421 	return (0);
422 }
423 
/*
 * Body of the roll thread for the log unit ul.
 *
 * After allocating its roll buffers the thread loops forever (via the
 * "again" label): it decides whether there is anything worth rolling,
 * sleeps in trans_roll_wait() when there is not, and otherwise reads
 * the deltas (log_roll_read), writes them to the master
 * (log_roll_write), removes them from the logmap and moves the log
 * head.
 *
 * This function does not return.  The thread frees its buffers and
 * calls thread_exit() once the log unit is flagged LDL_ERROR or
 * MTM_ROLL_EXIT is set in the logmap flags.
 */
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void*)1);

	/*
	 * setup some roll parameters
	 *
	 * The default sleep time is applied to the global tunable the
	 * first time any roll thread finds it still at 0.
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 *
	 * roll_bufs provides one MAPBLOCKSIZE data area per header in rbs.
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 *
	 * rb_next has to start out NULL: log_roll_write() builds its
	 * sorted list through these links and follows them until it
	 * reaches NULL.
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 *
	 * Both buffers are released and all the roll related flags are
	 * cleared before anybody waiting on mtm_from_roll_cv is woken.
	 * NOTE(review): mtm_mutex is not dropped explicitly on this
	 * path; presumably CALLB_CPR_EXIT() releases it - confirm.
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 *	don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wakeup any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something; then do it
	 *
	 * From here on the logmap and log unit state is examined
	 * without holding mtm_mutex.
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 *
	 * Return 1 (nothing to roll) sleeps before starting over,
	 * return 2 (error) starts over at once; nbuf is only valid
	 * after a return of 0.
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 *
	 * On failure the deltas are left in the logmap and the loop
	 * starts over.
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space; if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
601