xref: /titanic_44/usr/src/uts/common/fs/ufs/lufs_thread.c (revision d3d50737e566cade9a08d73d2af95105ac7cd960)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
580d34432Sfrankho  * Common Development and Distribution License (the "License").
680d34432Sfrankho  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22*d3d50737SRafael Vanoni  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate #include <sys/systm.h>
277c478bd9Sstevel@tonic-gate #include <sys/types.h>
287c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
297c478bd9Sstevel@tonic-gate #include <sys/errno.h>
307c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
317c478bd9Sstevel@tonic-gate #include <sys/debug.h>
327c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
337c478bd9Sstevel@tonic-gate #include <sys/conf.h>
347c478bd9Sstevel@tonic-gate #include <sys/proc.h>
357c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
367c478bd9Sstevel@tonic-gate #include <sys/fssnap_if.h>
377c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
387c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_filio.h>
397c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_log.h>
407c478bd9Sstevel@tonic-gate #include <sys/fs/ufs_bio.h>
417c478bd9Sstevel@tonic-gate #include <sys/inttypes.h>
427c478bd9Sstevel@tonic-gate #include <sys/callb.h>
437c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
447c478bd9Sstevel@tonic-gate 
457c478bd9Sstevel@tonic-gate /*
467c478bd9Sstevel@tonic-gate  * Kernel threads for logging
477c478bd9Sstevel@tonic-gate  * Currently only one for rolling the log (one per log).
487c478bd9Sstevel@tonic-gate  */
497c478bd9Sstevel@tonic-gate 
507c478bd9Sstevel@tonic-gate #define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
517c478bd9Sstevel@tonic-gate #define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
527c478bd9Sstevel@tonic-gate #define	LUFS_DEFAULT_MAX_ROLL_BUFS 64
537c478bd9Sstevel@tonic-gate 
547c478bd9Sstevel@tonic-gate /*
557c478bd9Sstevel@tonic-gate  * Macros
567c478bd9Sstevel@tonic-gate  */
577c478bd9Sstevel@tonic-gate #define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
587c478bd9Sstevel@tonic-gate #define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)
597c478bd9Sstevel@tonic-gate 
607c478bd9Sstevel@tonic-gate /*
617c478bd9Sstevel@tonic-gate  * Tunables
627c478bd9Sstevel@tonic-gate  */
637c478bd9Sstevel@tonic-gate uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
647c478bd9Sstevel@tonic-gate uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
657c478bd9Sstevel@tonic-gate uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
667c478bd9Sstevel@tonic-gate long logmap_maxnme = 1536;
677c478bd9Sstevel@tonic-gate int trans_roll_tics = 0;
687c478bd9Sstevel@tonic-gate uint64_t trans_roll_new_delta = 0;
697c478bd9Sstevel@tonic-gate uint64_t lrr_wait = 0;
707c478bd9Sstevel@tonic-gate /*
717c478bd9Sstevel@tonic-gate  * Key for thread specific data for the roll thread to
727c478bd9Sstevel@tonic-gate  * bypass snapshot throttling
737c478bd9Sstevel@tonic-gate  */
747c478bd9Sstevel@tonic-gate uint_t bypass_snapshot_throttle_key;
757c478bd9Sstevel@tonic-gate 
767c478bd9Sstevel@tonic-gate /*
777c478bd9Sstevel@tonic-gate  * externs
787c478bd9Sstevel@tonic-gate  */
797c478bd9Sstevel@tonic-gate extern kmutex_t		ml_scan;
807c478bd9Sstevel@tonic-gate extern kcondvar_t	ml_scan_cv;
817c478bd9Sstevel@tonic-gate extern int		maxphys;
827c478bd9Sstevel@tonic-gate 
/*
 * Park the roll thread until it is kicked (mtm_to_roll_cv) or the
 * trans_roll_tics timeout expires.  On entry the logmap reference and
 * rolling state are cleared; on return MTM_ROLLING is re-asserted so
 * callers can tell a roll pass is in progress.
 */
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	/* the map has not been referenced since we last looked */
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		/* wake any threads waiting on a forced roll to complete */
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	/* mark this thread CPR-safe while blocked so suspend can proceed */
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}
997c478bd9Sstevel@tonic-gate 
1007c478bd9Sstevel@tonic-gate /*
1017c478bd9Sstevel@tonic-gate  * returns the number of 8K buffers to use for rolling the log
1027c478bd9Sstevel@tonic-gate  */
1037c478bd9Sstevel@tonic-gate static uint32_t
log_roll_buffers()1047c478bd9Sstevel@tonic-gate log_roll_buffers()
1057c478bd9Sstevel@tonic-gate {
1067c478bd9Sstevel@tonic-gate 	/*
1077c478bd9Sstevel@tonic-gate 	 * sanity validate the tunable lufs_num_roll_bufs
1087c478bd9Sstevel@tonic-gate 	 */
1097c478bd9Sstevel@tonic-gate 	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
1107c478bd9Sstevel@tonic-gate 		return (lufs_min_roll_bufs);
1117c478bd9Sstevel@tonic-gate 	}
1127c478bd9Sstevel@tonic-gate 	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
1137c478bd9Sstevel@tonic-gate 		return (lufs_max_roll_bufs);
1147c478bd9Sstevel@tonic-gate 	}
1157c478bd9Sstevel@tonic-gate 	return (lufs_num_roll_bufs);
1167c478bd9Sstevel@tonic-gate }
1177c478bd9Sstevel@tonic-gate 
/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns;
 * 	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 *
 * On success *retnbuf is set to the number of rollbuf_t entries in rbs
 * that were prepared (at most nmblk).
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;		/* master offset being rolled */
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;		/* disk block of current MAPBLOCK */
	int		i;
	int		error;
	int		nbuf;		/* roll buffers set up so far */

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		/* round the offset down to the start of its MAPBLOCK */
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a set up buffer
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		/* B_INVAL means this MAPBLOCK yielded nothing to roll */
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			/* wait for the async master read issued above */
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}
2497c478bd9Sstevel@tonic-gate 
2507c478bd9Sstevel@tonic-gate /*
2517c478bd9Sstevel@tonic-gate  * Write out a cached roll buffer
2527c478bd9Sstevel@tonic-gate  */
2537c478bd9Sstevel@tonic-gate void
log_roll_write_crb(ufsvfs_t * ufsvfsp,rollbuf_t * rbp)2547c478bd9Sstevel@tonic-gate log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
2557c478bd9Sstevel@tonic-gate {
2567c478bd9Sstevel@tonic-gate 	crb_t *crb = rbp->rb_crb;
2577c478bd9Sstevel@tonic-gate 	buf_t *bp = &rbp->rb_bh;
2587c478bd9Sstevel@tonic-gate 
2597c478bd9Sstevel@tonic-gate 	bp->b_blkno = lbtodb(crb->c_mof);
2607c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = crb->c_buf;
2617c478bd9Sstevel@tonic-gate 	bp->b_bcount = crb->c_nb;
2627c478bd9Sstevel@tonic-gate 	bp->b_bufsize = crb->c_nb;
2637c478bd9Sstevel@tonic-gate 	ASSERT((crb->c_nb & DEV_BMASK) == 0);
2647c478bd9Sstevel@tonic-gate 	bp->b_flags = B_WRITE;
2657c478bd9Sstevel@tonic-gate 	logstats.ls_rwrites.value.ui64++;
2667c478bd9Sstevel@tonic-gate 
2677c478bd9Sstevel@tonic-gate 	/* if snapshots are enabled, call it */
2687c478bd9Sstevel@tonic-gate 	if (ufsvfsp->vfs_snapshot) {
2697c478bd9Sstevel@tonic-gate 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
2707c478bd9Sstevel@tonic-gate 	} else {
2717c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
2727c478bd9Sstevel@tonic-gate 	}
2737c478bd9Sstevel@tonic-gate }
2747c478bd9Sstevel@tonic-gate 
/*
 * Write out a set of non cached roll buffers.
 *
 * rb_secmap is a 16-bit map with one bit per sector of the MAPBLOCK;
 * each run of consecutive set bits becomes one contiguous write.  Extra
 * buf headers for second and later runs are kmem_alloc'd here, chained
 * via b_list, and freed by log_roll_write() after trans_not_wait().
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;	/* j: sector index; k: sectors in this run */

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		/* advance the buffer to the first dirty sector of the run */
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, call it */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}
3367c478bd9Sstevel@tonic-gate 
/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 *
 * Issues all the writes in ascending b_blkno order, then waits for
 * every one (including the cloned buf headers chained on b_list by
 * log_roll_write_bufs()) to complete.  Returns 0 on success, 1 if the
 * log went into the LDL_ERROR state.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{

	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno
	 * (insertion sort into a singly linked list threaded on rb_next)
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		/* ran off the end of the list: append as the new tail */
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}
4217c478bd9Sstevel@tonic-gate 
/*
 * Main loop of the log-roll kernel thread (one per log; see the file
 * header comment).  Loops forever: waits for work, reads master blocks
 * and overlays deltas (log_roll_read), writes them back to the master
 * file system (log_roll_write), then trims the rolled deltas out of the
 * logmap.  Exits via thread_exit() on LDL_ERROR or MTM_ROLL_EXIT.
 */
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;		/* array of nmblk roll buf headers */
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;	/* nmblk contiguous MAPBLOCKs */
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void*)1);

	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		/* CALLB_CPR_EXIT drops mtm_mutex */
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 *	don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wakeup any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something; then do it
	 * NOTE(review): mtm_flags is read here without mtm_mutex —
	 * presumably a stale read is acceptable since the flag is
	 * re-examined on the next pass; confirm against logmap locking
	 * rules.
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if its getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);	/* nothing to roll */
		/* FALLTHROUGH */
	case 2: goto again;				/* error */
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space; if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
599