/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019 Joyent, Inc.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * More UFS logging eccentricities...
 *
 * Required since "#pragma weak ..." doesn't work in reverse order:
 * genunix (bio.c) is loaded before the ufs modules, so the pointers
 * to the ufs routines don't get plugged into bio.c's calls here.
 * Instead, these pointers are initialized when the "lufsops" table
 * is set up in lufs.c:_init().
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io, is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */
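/*
 * As a rough illustration of the interface described above (a sketch
 * only; "dev", "blkno" and "bsize" stand in for values a real caller
 * would already have), a metadata read-modify-write typically looks
 * like:
 *
 *	struct buf *bp;
 *
 *	bp = bread(dev, blkno, bsize);		-- locked via b_sem
 *	if (bp->b_flags & B_ERROR) {
 *		error = geterror(bp);
 *		brelse(bp);			-- give it back, no I/O
 *		return (error);
 *	}
 *	... modify bp->b_un.b_addr ...
 *	bwrite(bp);				-- write, wait, release
 *
 * A caller expecting more updates to the same block soon would use
 * bdwrite(bp) instead of bwrite(bp), and a caller that only wants to
 * drop its hold uses brelse(bp).
 */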

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}
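/*
 * For example (a sketch; "dev", "blkno" and "bsize" are whatever the
 * caller is already working with), a sequential scan would prefetch
 * the next block while consuming the current one:
 *
 *	bp = breada(dev, blkno, blkno + btodb(bsize), bsize);
 *	... use bp->b_un.b_addr ...
 *	brelse(bp);
 *
 * The read-ahead buffer either completes asynchronously or is released
 * internally, so only the block actually requested comes back locked
 * to the caller.
 */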

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
				int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
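/*
 * The wrappers below (bwrite(), bwrite2(), bawrite()) are expected to
 * funnel into this routine via the BWRITE()/BWRITE2() macros.  As a
 * sketch of what the parameters mean (the call shown is illustrative,
 * not a real caller):
 *
 *	bwrite_common(ufsvfsp, bp, force_wait, do_relse, clear_flags);
 *
 * force_wait forces a biowait() even when B_ASYNC is set, do_relse
 * controls whether the buffer is brelse()'d after a waited-for write,
 * and clear_flags is simply stripped from b_flags before the I/O is
 * started.
 */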

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}
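/*
 * A typical delayed-write sequence (a sketch; sizes and offsets are
 * placeholders) fills part of a block and lets fsflush push it out
 * later, so that several partial updates to the same block coalesce
 * into a single physical write:
 *
 *	bp = bread(dev, blkno, bsize);
 *	bcopy(fragment, bp->b_un.b_addr + off, len);
 *	bdwrite(bp);		-- marked B_DELWRI, written later
 */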

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list is
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

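	/*
	 * Insert on the chosen list: B_AGE buffers go at the head
	 * (just after dp) so they are reclaimed first, everything
	 * else goes at the tail to approximate LRU ordering.
	 */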
	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system.
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * This interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void *arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone.  Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery.  So better bail out from
			 * here since sema_p() won't block.  If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly, never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have been entered into the hash during
		 * the lock upgrade, so free the new buffer we allocated
		 * and return the found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer.  Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error.  At this point we
	 * couldn't get a buffer, but we have to return one.  Hence we
	 * allocate some kind of error reply buffer on the fly.  This
	 * buffer is marked as B_NOCACHE | B_AGE | B_ERROR | B_DONE to
	 * ensure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause this buffer to be freed in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}
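/*
 * When a block is about to be completely overwritten there is no need
 * to read the old contents first; getblk() plus clrbuf() is enough
 * (again only a sketch, with placeholder names):
 *
 *	bp = getblk(dev, blkno, bsize);
 *	clrbuf(bp);			-- zero b_un.b_addr, b_resid = 0
 *	bcopy(data, bp->b_un.b_addr, bsize);
 *	bwrite(bp);
 */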

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * The interface of geteblk() is kept intact to maintain driver
 * compatibility.  Use ngeteblk() to allocate a block size other
 * than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}
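/*
 * Since these buffers carry B_NOCACHE, a plain brelse() tears them
 * down rather than caching them, so a scratch buffer for a one-off
 * transfer can be used roughly like this (sketch only; the target
 * device and block are placeholders):
 *
 *	bp = ngeteblk(bsize);
 *	bp->b_edev = dev;
 *	bp->b_blkno = blkno;
 *	bp->b_flags |= B_READ;
 *	(void) bdev_strategy(bp);
 *	(void) biowait(bp);
 *	... inspect bp->b_un.b_addr ...
 *	brelse(bp);		-- B_NOCACHE: header and data are freed
 */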

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}
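/*
 * Callers that merely want to take advantage of a cached copy, and can
 * fall back to a real read, would use it roughly as follows (sketch):
 *
 *	if ((bp = trygetblk(dev, blkno)) != NULL) {
 *		-- cached, valid (B_DONE) copy, locked via b_sem
 *		... use bp ...
 *		brelse(bp);
 *	} else {
 *		bp = bread(dev, blkno, bsize);	-- may sleep
 *		... use bp ...
 *		brelse(bp);
 *	}
 */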

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffers for the device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding the hash lock.  So transfer all the
	 * candidates onto the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
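/*
 * bflush(NODEV) pushes every delayed-write buffer in the cache (this is
 * what bio_busy() above does when asked to clean); passing a specific
 * dev limits the flush to that device's blocks, e.g. (sketch):
 *
 *	bflush(dev);	-- start async writes of dev's B_DELWRI buffers
 */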

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}
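/*
 * A caller that staged an update with bdwrite() but needs that one
 * block durable before proceeding can force it out explicitly
 * (sketch; the names are placeholders):
 *
 *	bp = bread(dev, blkno, bsize);
 *	... update the critical record in bp->b_un.b_addr ...
 *	bdwrite(bp);
 *	blkflush(dev, blkno);	-- write it now if still delayed
 */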
10107c478bd9Sstevel@tonic-gate 
10117c478bd9Sstevel@tonic-gate /*
10127c478bd9Sstevel@tonic-gate  * Same as binval, except can force-invalidate delayed-write buffers
10137c478bd9Sstevel@tonic-gate  * (which are not be already flushed because of device errors).  Also
10147c478bd9Sstevel@tonic-gate  * makes sure that the retry write flag is cleared.
10157c478bd9Sstevel@tonic-gate  */
10167c478bd9Sstevel@tonic-gate int
10177c478bd9Sstevel@tonic-gate bfinval(dev_t dev, int force)
10187c478bd9Sstevel@tonic-gate {
10197c478bd9Sstevel@tonic-gate 	struct buf *dp;
10207c478bd9Sstevel@tonic-gate 	struct buf *bp;
10217c478bd9Sstevel@tonic-gate 	struct buf *binval_list = EMPTY_LIST;
10227c478bd9Sstevel@tonic-gate 	int i, error = 0;
10237c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
10247c478bd9Sstevel@tonic-gate 	uint_t index;
10257c478bd9Sstevel@tonic-gate 	struct buf **backp;
10267c478bd9Sstevel@tonic-gate 
10277c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
10287c478bd9Sstevel@tonic-gate 	/*
10297c478bd9Sstevel@tonic-gate 	 * Wait for any flushes ahead of us to finish; it's OK to
10307c478bd9Sstevel@tonic-gate 	 * do invalidates in parallel.
10317c478bd9Sstevel@tonic-gate 	 */
10327c478bd9Sstevel@tonic-gate 	while (bio_doingflush) {
10337c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 1;
10347c478bd9Sstevel@tonic-gate 		cv_wait(&bio_flushinval_cv, &blist_lock);
10357c478bd9Sstevel@tonic-gate 	}
10367c478bd9Sstevel@tonic-gate 	bio_doinginval++;
10377c478bd9Sstevel@tonic-gate 
10387c478bd9Sstevel@tonic-gate 	/* Gather bp's */
10397c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
10407c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
10417c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
10427c478bd9Sstevel@tonic-gate 
10437c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
10447c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
10457c478bd9Sstevel@tonic-gate 			if (bp->b_edev == dev) {
10467c478bd9Sstevel@tonic-gate 				if (bp->b_list == NULL) {
10477c478bd9Sstevel@tonic-gate 					bp->b_list = binval_list;
10487c478bd9Sstevel@tonic-gate 					binval_list = bp;
10497c478bd9Sstevel@tonic-gate 				}
10507c478bd9Sstevel@tonic-gate 			}
10517c478bd9Sstevel@tonic-gate 		}
10527c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
10537c478bd9Sstevel@tonic-gate 	}
10547c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
10557c478bd9Sstevel@tonic-gate 
10567c478bd9Sstevel@tonic-gate 	/* Invalidate all bp's found */
10577c478bd9Sstevel@tonic-gate 	while (binval_list != EMPTY_LIST) {
10587c478bd9Sstevel@tonic-gate 		bp = binval_list;
10597c478bd9Sstevel@tonic-gate 
10607c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_sem);
10617c478bd9Sstevel@tonic-gate 		if (bp->b_edev == dev) {
10627c478bd9Sstevel@tonic-gate 			if (force && (bp->b_flags & B_DELWRI)) {
10637c478bd9Sstevel@tonic-gate 				/* clear B_DELWRI, move to non-dw freelist */
10647c478bd9Sstevel@tonic-gate 				index = bio_bhash(bp->b_edev, bp->b_blkno);
10657c478bd9Sstevel@tonic-gate 				hmp = &hbuf[index].b_lock;
10667c478bd9Sstevel@tonic-gate 				dp = (struct buf *)&hbuf[index];
10677c478bd9Sstevel@tonic-gate 				mutex_enter(hmp);
10687c478bd9Sstevel@tonic-gate 
10697c478bd9Sstevel@tonic-gate 				/* remove from delayed write freelist */
10707c478bd9Sstevel@tonic-gate 				notavail(bp);
10717c478bd9Sstevel@tonic-gate 
10727c478bd9Sstevel@tonic-gate 				/* add to B_AGE side of non-dw freelist */
10737c478bd9Sstevel@tonic-gate 				backp = &dp->av_forw;
10747c478bd9Sstevel@tonic-gate 				(*backp)->av_back = bp;
10757c478bd9Sstevel@tonic-gate 				bp->av_forw = *backp;
10767c478bd9Sstevel@tonic-gate 				*backp = bp;
10777c478bd9Sstevel@tonic-gate 				bp->av_back = dp;
10787c478bd9Sstevel@tonic-gate 
10797c478bd9Sstevel@tonic-gate 				/*
10807c478bd9Sstevel@tonic-gate 				 * make sure write retries and busy are cleared
10817c478bd9Sstevel@tonic-gate 				 */
10827c478bd9Sstevel@tonic-gate 				bp->b_flags &=
10837c478bd9Sstevel@tonic-gate 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
10847c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
10857c478bd9Sstevel@tonic-gate 			}
10867c478bd9Sstevel@tonic-gate 			if ((bp->b_flags & B_DELWRI) == 0)
10877c478bd9Sstevel@tonic-gate 				bp->b_flags |= B_STALE|B_AGE;
10887c478bd9Sstevel@tonic-gate 			else
10897c478bd9Sstevel@tonic-gate 				error = EIO;
10907c478bd9Sstevel@tonic-gate 		}
10917c478bd9Sstevel@tonic-gate 		sema_v(&bp->b_sem);
10927c478bd9Sstevel@tonic-gate 		binval_list = bp->b_list;
10937c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
10947c478bd9Sstevel@tonic-gate 	}
10957c478bd9Sstevel@tonic-gate 	mutex_enter(&blist_lock);
10967c478bd9Sstevel@tonic-gate 	bio_doinginval--;
10977c478bd9Sstevel@tonic-gate 	if (bio_flinv_cv_wanted) {
10987c478bd9Sstevel@tonic-gate 		cv_broadcast(&bio_flushinval_cv);
10997c478bd9Sstevel@tonic-gate 		bio_flinv_cv_wanted = 0;
11007c478bd9Sstevel@tonic-gate 	}
11017c478bd9Sstevel@tonic-gate 	mutex_exit(&blist_lock);
11027c478bd9Sstevel@tonic-gate 	return (error);
11037c478bd9Sstevel@tonic-gate }
11047c478bd9Sstevel@tonic-gate 
11057c478bd9Sstevel@tonic-gate /*
11067c478bd9Sstevel@tonic-gate  * If possible, invalidate blocks for a dev on demand
11077c478bd9Sstevel@tonic-gate  */
11087c478bd9Sstevel@tonic-gate void
11097c478bd9Sstevel@tonic-gate binval(dev_t dev)
11107c478bd9Sstevel@tonic-gate {
11117c478bd9Sstevel@tonic-gate 	(void) bfinval(dev, 0);
11127c478bd9Sstevel@tonic-gate }
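
/*
 * Example (illustrative sketch, not from the original source): a filesystem
 * teardown path might use these interfaces roughly as follows.  A normal
 * unmount uses bfinval(dev, 0) (binval() is the same call with the return
 * value discarded), which leaves delayed-write buffers alone and reports
 * EIO if any remain; a forced unmount after device errors passes a nonzero
 * force argument to discard them.  The function name and arguments below
 * are hypothetical.
 *
 *	static int
 *	examplefs_invalidate_bufs(dev_t dev, boolean_t forced)
 *	{
 *		if (forced)
 *			return (bfinval(dev, 1));
 *		return (bfinval(dev, 0));
 *	}
 */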
11137c478bd9Sstevel@tonic-gate 
11147c478bd9Sstevel@tonic-gate /*
11157c478bd9Sstevel@tonic-gate  * Initialize the buffer I/O system by freeing
11167c478bd9Sstevel@tonic-gate  * all buffers and setting all device hash buffer lists to empty.
11177c478bd9Sstevel@tonic-gate  */
11187c478bd9Sstevel@tonic-gate void
11197c478bd9Sstevel@tonic-gate binit(void)
11207c478bd9Sstevel@tonic-gate {
11217c478bd9Sstevel@tonic-gate 	struct buf *bp;
11227c478bd9Sstevel@tonic-gate 	unsigned int i, pct;
11237c478bd9Sstevel@tonic-gate 	ulong_t	bio_max_hwm, bio_default_hwm;
11247c478bd9Sstevel@tonic-gate 
11257c478bd9Sstevel@tonic-gate 	/*
11267c478bd9Sstevel@tonic-gate 	 * Maximum/Default values for bufhwm are set to the smallest of:
11277c478bd9Sstevel@tonic-gate 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
11287c478bd9Sstevel@tonic-gate 	 *	- 1/4 of kernel virtual memory
11297c478bd9Sstevel@tonic-gate 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
11307c478bd9Sstevel@tonic-gate 	 * Additionally, in order to allow simple tuning by percentage of
11317c478bd9Sstevel@tonic-gate 	 * physical memory, bufhwm_pct is used to calculate the default if
11327c478bd9Sstevel@tonic-gate 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
11337c478bd9Sstevel@tonic-gate 	 *
11347c478bd9Sstevel@tonic-gate 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
11357c478bd9Sstevel@tonic-gate 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
11367c478bd9Sstevel@tonic-gate 	 */
11377c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
11387c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
11397c478bd9Sstevel@tonic-gate 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
11407c478bd9Sstevel@tonic-gate 
11417c478bd9Sstevel@tonic-gate 	pct = BIO_BUF_PERCENT;
11427c478bd9Sstevel@tonic-gate 	if (bufhwm_pct != 0 &&
11437c478bd9Sstevel@tonic-gate 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
11447c478bd9Sstevel@tonic-gate 		pct = BIO_BUF_PERCENT;
11457c478bd9Sstevel@tonic-gate 		/*
11467c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
11477c478bd9Sstevel@tonic-gate 		 */
11487c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of "
11497c478bd9Sstevel@tonic-gate 		    "range(1..%d). Using %d as default.",
11507c478bd9Sstevel@tonic-gate 		    bufhwm_pct,
11517c478bd9Sstevel@tonic-gate 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
11527c478bd9Sstevel@tonic-gate 	}
11537c478bd9Sstevel@tonic-gate 
11547c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(physmem / pct,
11557c478bd9Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
11567c478bd9Sstevel@tonic-gate 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
11577c478bd9Sstevel@tonic-gate 
11587c478bd9Sstevel@tonic-gate 	if ((v.v_bufhwm = bufhwm) == 0)
11597c478bd9Sstevel@tonic-gate 		v.v_bufhwm = bio_default_hwm;
11607c478bd9Sstevel@tonic-gate 
11617c478bd9Sstevel@tonic-gate 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
11627c478bd9Sstevel@tonic-gate 		v.v_bufhwm = (int)bio_max_hwm;
11637c478bd9Sstevel@tonic-gate 		/*
11647c478bd9Sstevel@tonic-gate 		 * Invalid user specified value, emit a warning.
11657c478bd9Sstevel@tonic-gate 		 */
11667c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
11677c478bd9Sstevel@tonic-gate 		    "binit: bufhwm(%d) out "
11687c478bd9Sstevel@tonic-gate 		    "of range(%d..%lu). Using %lu as default",
11697c478bd9Sstevel@tonic-gate 		    bufhwm,
11707c478bd9Sstevel@tonic-gate 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
11717c478bd9Sstevel@tonic-gate 	}
11727c478bd9Sstevel@tonic-gate 
11737c478bd9Sstevel@tonic-gate 	/*
11747c478bd9Sstevel@tonic-gate 	 * Determine the number of hash buckets. Default is to
11757c478bd9Sstevel@tonic-gate 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
11767c478bd9Sstevel@tonic-gate 	 * Round up number to the next power of 2.
11777c478bd9Sstevel@tonic-gate 	 */
11787c478bd9Sstevel@tonic-gate 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
11797c478bd9Sstevel@tonic-gate 	    BIO_HASHLEN);
11807c478bd9Sstevel@tonic-gate 	v.v_hmask = v.v_hbuf - 1;
11817c478bd9Sstevel@tonic-gate 	v.v_buf = BIO_BHDR_POOL;
11827c478bd9Sstevel@tonic-gate 
11837c478bd9Sstevel@tonic-gate 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
11847c478bd9Sstevel@tonic-gate 
11857c478bd9Sstevel@tonic-gate 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
11867c478bd9Sstevel@tonic-gate 
11877c478bd9Sstevel@tonic-gate 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
11887c478bd9Sstevel@tonic-gate 	bp = &bfreelist;
11897c478bd9Sstevel@tonic-gate 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
11907c478bd9Sstevel@tonic-gate 
11917c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
11927c478bd9Sstevel@tonic-gate 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
11937c478bd9Sstevel@tonic-gate 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
11947c478bd9Sstevel@tonic-gate 
11957c478bd9Sstevel@tonic-gate 		/*
11967c478bd9Sstevel@tonic-gate 		 * Initialize the delayed write buffer list.
11977c478bd9Sstevel@tonic-gate 		 */
11987c478bd9Sstevel@tonic-gate 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
11997c478bd9Sstevel@tonic-gate 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
12007c478bd9Sstevel@tonic-gate 	}
12017c478bd9Sstevel@tonic-gate }
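
/*
 * Worked example (illustrative, with assumed values): on a machine with
 * 4 GB of physical memory (physmem == 1048576 pages of 4 KB) and ample
 * free kernel heap, and assuming for illustration that pct keeps its
 * default (BIO_BUF_PERCENT) of 50, i.e. 2% of memory, the default high
 * water mark computed above is roughly
 *
 *	bio_default_hwm = (physmem / pct) * (PAGESIZE / 1024)
 *			= (1048576 / 50) * 4 KB =~ 80 MB
 *
 * clamped to the range [BIO_MIN_HWM, bio_max_hwm].  Administrators can
 * override the default via the bufhwm (kilobytes) or bufhwm_pct tunables,
 * e.g. with an /etc/system entry such as (value for illustration only):
 *
 *	set bufhwm=8000
 */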
12027c478bd9Sstevel@tonic-gate 
12037c478bd9Sstevel@tonic-gate /*
12047c478bd9Sstevel@tonic-gate  * Wait for I/O completion on the buffer; return error code.
12057c478bd9Sstevel@tonic-gate  * If bp was for synchronous I/O, bp is invalid and associated
12067c478bd9Sstevel@tonic-gate  * resources are freed on return.
12077c478bd9Sstevel@tonic-gate  */
12087c478bd9Sstevel@tonic-gate int
12097c478bd9Sstevel@tonic-gate biowait(struct buf *bp)
12107c478bd9Sstevel@tonic-gate {
12117c478bd9Sstevel@tonic-gate 	int error = 0;
12127c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
12137c478bd9Sstevel@tonic-gate 
12147c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
12157c478bd9Sstevel@tonic-gate 
12167c478bd9Sstevel@tonic-gate 	cpup = CPU;
12171a5e258fSJosef 'Jeff' Sipek 	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
12187c478bd9Sstevel@tonic-gate 	DTRACE_IO1(wait__start, struct buf *, bp);
12197c478bd9Sstevel@tonic-gate 
12207c478bd9Sstevel@tonic-gate 	/*
12217c478bd9Sstevel@tonic-gate 	 * In case of panic, busy wait for completion
12227c478bd9Sstevel@tonic-gate 	 */
12237c478bd9Sstevel@tonic-gate 	if (panicstr) {
12247c478bd9Sstevel@tonic-gate 		while ((bp->b_flags & B_DONE) == 0)
12257c478bd9Sstevel@tonic-gate 			drv_usecwait(10);
12267c478bd9Sstevel@tonic-gate 	} else
12277c478bd9Sstevel@tonic-gate 		sema_p(&bp->b_io);
12287c478bd9Sstevel@tonic-gate 
12297c478bd9Sstevel@tonic-gate 	DTRACE_IO1(wait__done, struct buf *, bp);
12301a5e258fSJosef 'Jeff' Sipek 	atomic_dec_64(&cpup->cpu_stats.sys.iowait);
12317c478bd9Sstevel@tonic-gate 
12327c478bd9Sstevel@tonic-gate 	error = geterror(bp);
12337c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_ASYNC) == 0) {
12347c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
12357c478bd9Sstevel@tonic-gate 			bp_mapout(bp);
12367c478bd9Sstevel@tonic-gate 	}
12377c478bd9Sstevel@tonic-gate 	return (error);
12387c478bd9Sstevel@tonic-gate }
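
/*
 * Example (illustrative sketch): the classic synchronous-I/O pattern built
 * on biowait().  A caller fills in a buf, hands it to the device strategy
 * routine, and blocks in biowait() until the driver calls biodone(); the
 * return value is the error collected by geterror().  The variables dev,
 * blkno and kaddr are assumed to be set up by the caller.
 *
 *	struct buf *bp;
 *	int err;
 *
 *	bp = getrbuf(KM_SLEEP);
 *	bp->b_flags = B_READ | B_BUSY;
 *	bp->b_edev = dev;
 *	bp->b_lblkno = blkno;
 *	bp->b_bcount = DEV_BSIZE;
 *	bp->b_un.b_addr = kaddr;
 *	(void) bdev_strategy(bp);
 *	err = biowait(bp);
 *	freerbuf(bp);
 */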
12397c478bd9Sstevel@tonic-gate 
12407c478bd9Sstevel@tonic-gate static void
12417c478bd9Sstevel@tonic-gate biodone_tnf_probe(struct buf *bp)
12427c478bd9Sstevel@tonic-gate {
12437c478bd9Sstevel@tonic-gate 	/* Kernel probe */
12447c478bd9Sstevel@tonic-gate 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
12457c478bd9Sstevel@tonic-gate 	    tnf_device,		device,		bp->b_edev,
12467c478bd9Sstevel@tonic-gate 	    tnf_diskaddr,	block,		bp->b_lblkno,
12477c478bd9Sstevel@tonic-gate 	    tnf_opaque,		buf,		bp);
12487c478bd9Sstevel@tonic-gate }
12497c478bd9Sstevel@tonic-gate 
12507c478bd9Sstevel@tonic-gate /*
12517c478bd9Sstevel@tonic-gate  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
12527c478bd9Sstevel@tonic-gate  * and wake up anyone waiting for it.
12537c478bd9Sstevel@tonic-gate  */
12547c478bd9Sstevel@tonic-gate void
12557c478bd9Sstevel@tonic-gate biodone(struct buf *bp)
12567c478bd9Sstevel@tonic-gate {
12577c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_STARTED) {
12587c478bd9Sstevel@tonic-gate 		DTRACE_IO1(done, struct buf *, bp);
12597c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_STARTED;
12607c478bd9Sstevel@tonic-gate 	}
12617c478bd9Sstevel@tonic-gate 
12627c478bd9Sstevel@tonic-gate 	/*
12637c478bd9Sstevel@tonic-gate 	 * Call the TNF probe here instead of the inline code
12647c478bd9Sstevel@tonic-gate 	 * to force our compiler to use the tail call optimization.
12657c478bd9Sstevel@tonic-gate 	 */
12667c478bd9Sstevel@tonic-gate 	biodone_tnf_probe(bp);
12677c478bd9Sstevel@tonic-gate 
12687c478bd9Sstevel@tonic-gate 	if (bp->b_iodone != NULL) {
12697c478bd9Sstevel@tonic-gate 		(*(bp->b_iodone))(bp);
12707c478bd9Sstevel@tonic-gate 		return;
12717c478bd9Sstevel@tonic-gate 	}
12727c478bd9Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_DONE) == 0);
12737c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
12747c478bd9Sstevel@tonic-gate 	bp->b_flags |= B_DONE;
12757c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_ASYNC) {
12767c478bd9Sstevel@tonic-gate 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
12777c478bd9Sstevel@tonic-gate 			bio_pageio_done(bp);
12787c478bd9Sstevel@tonic-gate 		else
12797c478bd9Sstevel@tonic-gate 			brelse(bp);	/* release bp to freelist */
12807c478bd9Sstevel@tonic-gate 	} else {
12817c478bd9Sstevel@tonic-gate 		sema_v(&bp->b_io);
12827c478bd9Sstevel@tonic-gate 	}
12837c478bd9Sstevel@tonic-gate }
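
/*
 * Example (illustrative sketch): how a block driver's completion path
 * typically reports status.  On failure it marks the buffer with
 * bioerror() and records the untransferred byte count before calling
 * biodone() exactly once; biodone() then runs b_iodone, releases an
 * asynchronous buffer, or wakes a thread blocked in biowait().  The
 * function and argument names are hypothetical.
 *
 *	static void
 *	exdrv_io_complete(struct buf *bp, int failed, size_t resid)
 *	{
 *		if (failed)
 *			bioerror(bp, EIO);
 *		bp->b_resid = resid;
 *		biodone(bp);
 *	}
 */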
12847c478bd9Sstevel@tonic-gate 
12857c478bd9Sstevel@tonic-gate /*
12867c478bd9Sstevel@tonic-gate  * Pick up the device's error number and pass it to the user;
12877c478bd9Sstevel@tonic-gate  * if there is an error but the number is 0 set a generalized code.
12887c478bd9Sstevel@tonic-gate  * if there is an error but the number is 0, return the generic code EIO.
12897c478bd9Sstevel@tonic-gate int
12907c478bd9Sstevel@tonic-gate geterror(struct buf *bp)
12917c478bd9Sstevel@tonic-gate {
12927c478bd9Sstevel@tonic-gate 	int error = 0;
12937c478bd9Sstevel@tonic-gate 
12947c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
12957c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_ERROR) {
12967c478bd9Sstevel@tonic-gate 		error = bp->b_error;
12977c478bd9Sstevel@tonic-gate 		if (!error)
12987c478bd9Sstevel@tonic-gate 			error = EIO;
12997c478bd9Sstevel@tonic-gate 	}
13007c478bd9Sstevel@tonic-gate 	return (error);
13017c478bd9Sstevel@tonic-gate }
13027c478bd9Sstevel@tonic-gate 
13037c478bd9Sstevel@tonic-gate /*
13047c478bd9Sstevel@tonic-gate  * Support for pageio buffers.
13057c478bd9Sstevel@tonic-gate  *
13067c478bd9Sstevel@tonic-gate  * This should be generalized to provide a general-purpose bp
13077c478bd9Sstevel@tonic-gate  * header facility that can be used for things other than pageio.
13087c478bd9Sstevel@tonic-gate  */
13097c478bd9Sstevel@tonic-gate 
13107c478bd9Sstevel@tonic-gate /*
13117c478bd9Sstevel@tonic-gate  * Allocate and initialize a buf struct for use with pageio.
13127c478bd9Sstevel@tonic-gate  */
13137c478bd9Sstevel@tonic-gate struct buf *
13147c478bd9Sstevel@tonic-gate pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
13157c478bd9Sstevel@tonic-gate {
13167c478bd9Sstevel@tonic-gate 	struct buf *bp;
13177c478bd9Sstevel@tonic-gate 	struct cpu *cpup;
13187c478bd9Sstevel@tonic-gate 
13197c478bd9Sstevel@tonic-gate 	if (flags & B_READ) {
13207c478bd9Sstevel@tonic-gate 		CPU_STATS_ENTER_K();
13217c478bd9Sstevel@tonic-gate 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
13227c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
13237c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
13247c478bd9Sstevel@tonic-gate 		if ((flags & B_ASYNC) == 0) {
13257c478bd9Sstevel@tonic-gate 			klwp_t *lwp = ttolwp(curthread);
13267c478bd9Sstevel@tonic-gate 			if (lwp != NULL)
13277c478bd9Sstevel@tonic-gate 				lwp->lwp_ru.majflt++;
13287c478bd9Sstevel@tonic-gate 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
13297c478bd9Sstevel@tonic-gate 			/* Kernel probe */
13307c478bd9Sstevel@tonic-gate 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
13317c478bd9Sstevel@tonic-gate 			    tnf_opaque,		vnode,		pp->p_vnode,
13327c478bd9Sstevel@tonic-gate 			    tnf_offset,		offset,		pp->p_offset);
13337c478bd9Sstevel@tonic-gate 		}
13347c478bd9Sstevel@tonic-gate 		/*
13357c478bd9Sstevel@tonic-gate 		 * Update statistics for pages being paged in
13367c478bd9Sstevel@tonic-gate 		 */
13377c478bd9Sstevel@tonic-gate 		if (pp != NULL && pp->p_vnode != NULL) {
13387c478bd9Sstevel@tonic-gate 			if (IS_SWAPFSVP(pp->p_vnode)) {
1339d3d50737SRafael Vanoni 				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
13407c478bd9Sstevel@tonic-gate 			} else {
13417c478bd9Sstevel@tonic-gate 				if (pp->p_vnode->v_flag & VVMEXEC) {
13427c478bd9Sstevel@tonic-gate 					CPU_STATS_ADDQ(cpup, vm, execpgin,
13437c478bd9Sstevel@tonic-gate 					    btopr(len));
13447c478bd9Sstevel@tonic-gate 				} else {
13457c478bd9Sstevel@tonic-gate 					CPU_STATS_ADDQ(cpup, vm, fspgin,
13467c478bd9Sstevel@tonic-gate 					    btopr(len));
13477c478bd9Sstevel@tonic-gate 				}
13487c478bd9Sstevel@tonic-gate 			}
13497c478bd9Sstevel@tonic-gate 		}
13507c478bd9Sstevel@tonic-gate 		CPU_STATS_EXIT_K();
13517c478bd9Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
13527c478bd9Sstevel@tonic-gate 		    "page_ws_in:pp %p", pp);
13537c478bd9Sstevel@tonic-gate 		/* Kernel probe */
13547c478bd9Sstevel@tonic-gate 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
13557c478bd9Sstevel@tonic-gate 		    tnf_opaque,	vnode,	pp->p_vnode,
13567c478bd9Sstevel@tonic-gate 		    tnf_offset,	offset,	pp->p_offset,
13577c478bd9Sstevel@tonic-gate 		    tnf_size,	size,	len);
13587c478bd9Sstevel@tonic-gate 	}
13597c478bd9Sstevel@tonic-gate 
13607c478bd9Sstevel@tonic-gate 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
13617c478bd9Sstevel@tonic-gate 	bp->b_bcount = len;
13627c478bd9Sstevel@tonic-gate 	bp->b_bufsize = len;
13637c478bd9Sstevel@tonic-gate 	bp->b_pages = pp;
13647c478bd9Sstevel@tonic-gate 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
13657c478bd9Sstevel@tonic-gate 	bp->b_offset = -1;
13667c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
13677c478bd9Sstevel@tonic-gate 
13687c478bd9Sstevel@tonic-gate 	/* Initialize bp->b_sem in "locked" state */
13697c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
13707c478bd9Sstevel@tonic-gate 
13717c478bd9Sstevel@tonic-gate 	VN_HOLD(vp);
13727c478bd9Sstevel@tonic-gate 	bp->b_vp = vp;
13737c478bd9Sstevel@tonic-gate 
13747c478bd9Sstevel@tonic-gate 	/*
13757c478bd9Sstevel@tonic-gate 	 * Caller sets dev & blkno and can adjust
13767c478bd9Sstevel@tonic-gate 	 * b_addr for page offset and can use bp_mapin
13777c478bd9Sstevel@tonic-gate 	 * to make pages kernel addressable.
13787c478bd9Sstevel@tonic-gate 	 */
13797c478bd9Sstevel@tonic-gate 	return (bp);
13807c478bd9Sstevel@tonic-gate }
13817c478bd9Sstevel@tonic-gate 
13827c478bd9Sstevel@tonic-gate void
13837c478bd9Sstevel@tonic-gate pageio_done(struct buf *bp)
13847c478bd9Sstevel@tonic-gate {
13857c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
13867c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_REMAPPED)
13877c478bd9Sstevel@tonic-gate 		bp_mapout(bp);
13887c478bd9Sstevel@tonic-gate 	VN_RELE(bp->b_vp);
13897c478bd9Sstevel@tonic-gate 	bp->b_vp = NULL;
13907c478bd9Sstevel@tonic-gate 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
13917c478bd9Sstevel@tonic-gate 
13927c478bd9Sstevel@tonic-gate 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
13937c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_sem);
13947c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_io);
13957c478bd9Sstevel@tonic-gate 	kmem_free(bp, sizeof (struct buf));
13967c478bd9Sstevel@tonic-gate }
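
/*
 * Example (illustrative sketch): how a pager-style caller uses the
 * pageio_setup()/pageio_done() pair for a synchronous page-in, following
 * the "caller sets dev & blkno" convention noted above.  pp, len, vp, dev
 * and blkno are assumed to have been resolved by the caller (e.g. by the
 * filesystem's bmap code).
 *
 *	struct buf *bp;
 *	int err;
 *
 *	bp = pageio_setup(pp, len, vp, B_READ);
 *	bp->b_edev = dev;
 *	bp->b_dev = cmpdev(dev);
 *	bp->b_blkno = blkno;
 *	bp->b_un.b_addr = (caddr_t)0;
 *	(void) bdev_strategy(bp);
 *	err = biowait(bp);
 *	pageio_done(bp);
 */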
13977c478bd9Sstevel@tonic-gate 
13987c478bd9Sstevel@tonic-gate /*
13997c478bd9Sstevel@tonic-gate  * Check whether any buffers associated with the device, other than the
14007c478bd9Sstevel@tonic-gate  * one pointed to by sbp, are busy.
14017c478bd9Sstevel@tonic-gate  * NOTE: This expensive operation should be improved together with ufs_icheck().
14027c478bd9Sstevel@tonic-gate  */
14037c478bd9Sstevel@tonic-gate int
14047c478bd9Sstevel@tonic-gate bcheck(dev_t dev, struct buf *sbp)
14057c478bd9Sstevel@tonic-gate {
14067c478bd9Sstevel@tonic-gate 	struct buf	*bp;
14077c478bd9Sstevel@tonic-gate 	struct buf	*dp;
14087c478bd9Sstevel@tonic-gate 	int i;
14097c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
14107c478bd9Sstevel@tonic-gate 
14117c478bd9Sstevel@tonic-gate 	/*
14127c478bd9Sstevel@tonic-gate 	 * check for busy bufs for this filesystem
14137c478bd9Sstevel@tonic-gate 	 */
14147c478bd9Sstevel@tonic-gate 	for (i = 0; i < v.v_hbuf; i++) {
14157c478bd9Sstevel@tonic-gate 		dp = (struct buf *)&hbuf[i];
14167c478bd9Sstevel@tonic-gate 		hmp = &hbuf[i].b_lock;
14177c478bd9Sstevel@tonic-gate 
14187c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
14197c478bd9Sstevel@tonic-gate 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
14207c478bd9Sstevel@tonic-gate 			/*
14217c478bd9Sstevel@tonic-gate 			 * if buf is busy or dirty, then filesystem is busy
14227c478bd9Sstevel@tonic-gate 			 */
14237c478bd9Sstevel@tonic-gate 			if ((bp->b_edev == dev) &&
14247c478bd9Sstevel@tonic-gate 			    ((bp->b_flags & B_STALE) == 0) &&
14257c478bd9Sstevel@tonic-gate 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
14267c478bd9Sstevel@tonic-gate 			    (bp != sbp)) {
14277c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
14287c478bd9Sstevel@tonic-gate 				return (1);
14297c478bd9Sstevel@tonic-gate 			}
14307c478bd9Sstevel@tonic-gate 		}
14317c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
14327c478bd9Sstevel@tonic-gate 	}
14337c478bd9Sstevel@tonic-gate 	return (0);
14347c478bd9Sstevel@tonic-gate }
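
/*
 * Example (illustrative sketch): a filesystem can use bcheck() to refuse a
 * state change while cached buffers for its device are still busy or dirty,
 * passing its own held buffer (e.g. the superblock buf) as sbp so that it
 * is not counted.  The variables are assumed to be set up by the caller.
 *
 *	if (bcheck(dev, sbp))
 *		return (EBUSY);
 */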
14357c478bd9Sstevel@tonic-gate 
14367c478bd9Sstevel@tonic-gate /*
14377c478bd9Sstevel@tonic-gate  * Hash two 32 bit entities.
14387c478bd9Sstevel@tonic-gate  */
14397c478bd9Sstevel@tonic-gate int
14407c478bd9Sstevel@tonic-gate hash2ints(int x, int y)
14417c478bd9Sstevel@tonic-gate {
14427c478bd9Sstevel@tonic-gate 	int hash = 0;
14437c478bd9Sstevel@tonic-gate 
14447c478bd9Sstevel@tonic-gate 	hash = x - 1;
14457c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (x >> 8)) - 1;
14467c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (x >> 16)) - 1;
14477c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (x >> 24)) - 1;
14487c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + y) - 1;
14497c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (y >> 8)) - 1;
14507c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (y >> 16)) - 1;
14517c478bd9Sstevel@tonic-gate 	hash = ((hash * 7) + (y >> 24)) - 1;
14527c478bd9Sstevel@tonic-gate 
14537c478bd9Sstevel@tonic-gate 	return (hash);
14547c478bd9Sstevel@tonic-gate }
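
/*
 * Example (illustrative sketch): the buffer cache reduces this hash to a
 * bucket index by masking with v.v_hmask, which works because v.v_hbuf is
 * rounded up to a power of two in binit().  The bio_bhash() macro used
 * throughout this file does essentially the following (its actual
 * definition lives elsewhere; this is for illustration only):
 *
 *	index = hash2ints((int)dev, (int)blkno) & v.v_hmask;
 *	hp = &hbuf[index];
 */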
14557c478bd9Sstevel@tonic-gate 
14567c478bd9Sstevel@tonic-gate 
14577c478bd9Sstevel@tonic-gate /*
14587c478bd9Sstevel@tonic-gate  * Return a new buffer struct.
14597c478bd9Sstevel@tonic-gate  *	Create a new buffer if we haven't gone over our high water
14607c478bd9Sstevel@tonic-gate  *	mark for memory; otherwise try to get one off the freelist.
14617c478bd9Sstevel@tonic-gate  *
14627c478bd9Sstevel@tonic-gate  * Returns a locked buf that has no id and is not on any hash or free
14637c478bd9Sstevel@tonic-gate  * list.
14647c478bd9Sstevel@tonic-gate  */
14657c478bd9Sstevel@tonic-gate static struct buf *
14667c478bd9Sstevel@tonic-gate bio_getfreeblk(long bsize)
14677c478bd9Sstevel@tonic-gate {
14687c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp;
14697c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
14707c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
14717c478bd9Sstevel@tonic-gate 	uint_t		start, end;
14727c478bd9Sstevel@tonic-gate 
14737c478bd9Sstevel@tonic-gate 	/*
14747c478bd9Sstevel@tonic-gate 	 * bfreelist.b_bufsize represents the amount of memory we are
14757c478bd9Sstevel@tonic-gate 	 * allowed to allocate in the cache before we hit our hwm.
14767c478bd9Sstevel@tonic-gate 	 * References to bfreelist are protected by bfree_lock
14777c478bd9Sstevel@tonic-gate 	 * (mutex_enter(&bfree_lock) / mutex_exit(&bfree_lock)).
14787c478bd9Sstevel@tonic-gate 	 */
14797c478bd9Sstevel@tonic-gate 	bio_mem_get(bsize);	/* Account for our memory request */
14807c478bd9Sstevel@tonic-gate 
14817c478bd9Sstevel@tonic-gate again:
14827c478bd9Sstevel@tonic-gate 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
14837c478bd9Sstevel@tonic-gate 	sema_p(&bp->b_sem);	/* Should never fail */
14847c478bd9Sstevel@tonic-gate 
14857c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr == NULL);
14867c478bd9Sstevel@tonic-gate 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
14877c478bd9Sstevel@tonic-gate 	if (bp->b_un.b_addr != NULL) {
14887c478bd9Sstevel@tonic-gate 		/*
14897c478bd9Sstevel@tonic-gate 		 * Make the common path short
14907c478bd9Sstevel@tonic-gate 		 */
14917c478bd9Sstevel@tonic-gate 		bp->b_bufsize = bsize;
14927c478bd9Sstevel@tonic-gate 		ASSERT(SEMA_HELD(&bp->b_sem));
14937c478bd9Sstevel@tonic-gate 		return (bp);
14947c478bd9Sstevel@tonic-gate 	} else {
14957c478bd9Sstevel@tonic-gate 		struct buf *save;
14967c478bd9Sstevel@tonic-gate 
14977c478bd9Sstevel@tonic-gate 		save = bp;	/* Save bp we allocated */
14987c478bd9Sstevel@tonic-gate 		start = end = lastindex;
14997c478bd9Sstevel@tonic-gate 
15007c478bd9Sstevel@tonic-gate 		biostats.bio_bufwant.value.ui32++;
15017c478bd9Sstevel@tonic-gate 
15027c478bd9Sstevel@tonic-gate 		/*
15037c478bd9Sstevel@tonic-gate 		 * Memory isn't available from the system now. Scan
15047c478bd9Sstevel@tonic-gate 		 * the hash buckets till enough space is found.
15057c478bd9Sstevel@tonic-gate 		 */
15067c478bd9Sstevel@tonic-gate 		do {
15077c478bd9Sstevel@tonic-gate 			hp = &hbuf[start];
15087c478bd9Sstevel@tonic-gate 			hmp = &hp->b_lock;
15097c478bd9Sstevel@tonic-gate 			dp = (struct buf *)hp;
15107c478bd9Sstevel@tonic-gate 
15117c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
15127c478bd9Sstevel@tonic-gate 			bp = dp->av_forw;
15137c478bd9Sstevel@tonic-gate 
15147c478bd9Sstevel@tonic-gate 			while (bp != dp) {
15157c478bd9Sstevel@tonic-gate 
15167c478bd9Sstevel@tonic-gate 				ASSERT(bp != NULL);
15177c478bd9Sstevel@tonic-gate 
15187c478bd9Sstevel@tonic-gate 				if (!sema_tryp(&bp->b_sem)) {
15197c478bd9Sstevel@tonic-gate 					bp = bp->av_forw;
15207c478bd9Sstevel@tonic-gate 					continue;
15217c478bd9Sstevel@tonic-gate 				}
15227c478bd9Sstevel@tonic-gate 
15237c478bd9Sstevel@tonic-gate 				/*
15247c478bd9Sstevel@tonic-gate 				 * Since we are going down the freelist
15257c478bd9Sstevel@tonic-gate 				 * associated with this hash bucket the
15267c478bd9Sstevel@tonic-gate 				 * B_DELWRI flag should not be set.
15277c478bd9Sstevel@tonic-gate 				 */
15287c478bd9Sstevel@tonic-gate 				ASSERT(!(bp->b_flags & B_DELWRI));
15297c478bd9Sstevel@tonic-gate 
15307c478bd9Sstevel@tonic-gate 				if (bp->b_bufsize == bsize) {
15317c478bd9Sstevel@tonic-gate 					hp->b_length--;
15327c478bd9Sstevel@tonic-gate 					notavail(bp);
15337c478bd9Sstevel@tonic-gate 					bremhash(bp);
15347c478bd9Sstevel@tonic-gate 					mutex_exit(hmp);
15357c478bd9Sstevel@tonic-gate 
15367c478bd9Sstevel@tonic-gate 					/*
15377c478bd9Sstevel@tonic-gate 					 * We didn't kmem_alloc any more memory,
15387c478bd9Sstevel@tonic-gate 					 * so don't account for it twice.
15397c478bd9Sstevel@tonic-gate 					 */
15407c478bd9Sstevel@tonic-gate 					mutex_enter(&bfree_lock);
15417c478bd9Sstevel@tonic-gate 					bfreelist.b_bufsize += bsize;
15427c478bd9Sstevel@tonic-gate 					mutex_exit(&bfree_lock);
15437c478bd9Sstevel@tonic-gate 
15447c478bd9Sstevel@tonic-gate 					/*
15457c478bd9Sstevel@tonic-gate 					 * Update the lastindex value.
15467c478bd9Sstevel@tonic-gate 					 */
15477c478bd9Sstevel@tonic-gate 					lastindex = start;
15487c478bd9Sstevel@tonic-gate 
15497c478bd9Sstevel@tonic-gate 					/*
15507c478bd9Sstevel@tonic-gate 					 * Put our saved bp back on the list
15517c478bd9Sstevel@tonic-gate 					 */
15527c478bd9Sstevel@tonic-gate 					sema_v(&save->b_sem);
15537c478bd9Sstevel@tonic-gate 					bio_bhdr_free(save);
15547c478bd9Sstevel@tonic-gate 					ASSERT(SEMA_HELD(&bp->b_sem));
15557c478bd9Sstevel@tonic-gate 					return (bp);
15567c478bd9Sstevel@tonic-gate 				}
15577c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
15587c478bd9Sstevel@tonic-gate 				bp = bp->av_forw;
15597c478bd9Sstevel@tonic-gate 			}
15607c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
15617c478bd9Sstevel@tonic-gate 			start = ((start + 1) % v.v_hbuf);
15627c478bd9Sstevel@tonic-gate 		} while (start != end);
15637c478bd9Sstevel@tonic-gate 
15647c478bd9Sstevel@tonic-gate 		biostats.bio_bufwait.value.ui32++;
15657c478bd9Sstevel@tonic-gate 		bp = save;		/* Use original bp */
15667c478bd9Sstevel@tonic-gate 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
15677c478bd9Sstevel@tonic-gate 	}
15687c478bd9Sstevel@tonic-gate 
15697c478bd9Sstevel@tonic-gate 	bp->b_bufsize = bsize;
15707c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
15717c478bd9Sstevel@tonic-gate 	return (bp);
15727c478bd9Sstevel@tonic-gate }
15737c478bd9Sstevel@tonic-gate 
15747c478bd9Sstevel@tonic-gate /*
15757c478bd9Sstevel@tonic-gate  * Allocate a buffer header. If none currently available, allocate
15767c478bd9Sstevel@tonic-gate  * a new pool.
15777c478bd9Sstevel@tonic-gate  */
15787c478bd9Sstevel@tonic-gate static struct buf *
15797c478bd9Sstevel@tonic-gate bio_bhdr_alloc(void)
15807c478bd9Sstevel@tonic-gate {
15817c478bd9Sstevel@tonic-gate 	struct buf *dp, *sdp;
15827c478bd9Sstevel@tonic-gate 	struct buf *bp;
15837c478bd9Sstevel@tonic-gate 	int i;
15847c478bd9Sstevel@tonic-gate 
15857c478bd9Sstevel@tonic-gate 	for (;;) {
15867c478bd9Sstevel@tonic-gate 		mutex_enter(&bhdr_lock);
15877c478bd9Sstevel@tonic-gate 		if (bhdrlist != NULL) {
15887c478bd9Sstevel@tonic-gate 			bp = bhdrlist;
15897c478bd9Sstevel@tonic-gate 			bhdrlist = bp->av_forw;
15907c478bd9Sstevel@tonic-gate 			mutex_exit(&bhdr_lock);
15917c478bd9Sstevel@tonic-gate 			bp->av_forw = NULL;
15927c478bd9Sstevel@tonic-gate 			return (bp);
15937c478bd9Sstevel@tonic-gate 		}
15947c478bd9Sstevel@tonic-gate 		mutex_exit(&bhdr_lock);
15957c478bd9Sstevel@tonic-gate 
15967c478bd9Sstevel@tonic-gate 		/*
15977c478bd9Sstevel@tonic-gate 		 * Need to allocate a new pool. If the system is currently
15987c478bd9Sstevel@tonic-gate 		 * out of memory, then try freeing things on the freelist.
15997c478bd9Sstevel@tonic-gate 		 */
16007c478bd9Sstevel@tonic-gate 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
16017c478bd9Sstevel@tonic-gate 		if (dp == NULL) {
16027c478bd9Sstevel@tonic-gate 			/*
16037c478bd9Sstevel@tonic-gate 			 * System can't give us a pool of headers, try
16047c478bd9Sstevel@tonic-gate 			 * recycling from the free lists.
16057c478bd9Sstevel@tonic-gate 			 */
16067c478bd9Sstevel@tonic-gate 			bio_recycle(BIO_HEADER, 0);
16077c478bd9Sstevel@tonic-gate 		} else {
16087c478bd9Sstevel@tonic-gate 			sdp = dp;
16097c478bd9Sstevel@tonic-gate 			for (i = 0; i < v.v_buf; i++, dp++) {
16107c478bd9Sstevel@tonic-gate 				/*
16117c478bd9Sstevel@tonic-gate 				 * The next two lines are needed since NODEV
16127c478bd9Sstevel@tonic-gate 				 * is -1 and not NULL
16137c478bd9Sstevel@tonic-gate 				 */
16147c478bd9Sstevel@tonic-gate 				dp->b_dev = (o_dev_t)NODEV;
16157c478bd9Sstevel@tonic-gate 				dp->b_edev = NODEV;
16167c478bd9Sstevel@tonic-gate 				dp->av_forw = dp + 1;
16177c478bd9Sstevel@tonic-gate 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
16187c478bd9Sstevel@tonic-gate 				    NULL);
16197c478bd9Sstevel@tonic-gate 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
16207c478bd9Sstevel@tonic-gate 				    NULL);
16217c478bd9Sstevel@tonic-gate 				dp->b_offset = -1;
16227c478bd9Sstevel@tonic-gate 			}
16237c478bd9Sstevel@tonic-gate 			mutex_enter(&bhdr_lock);
16247c478bd9Sstevel@tonic-gate 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
16257c478bd9Sstevel@tonic-gate 			bhdrlist = sdp;
16267c478bd9Sstevel@tonic-gate 			nbuf += v.v_buf;
16277c478bd9Sstevel@tonic-gate 			bp = bhdrlist;
16287c478bd9Sstevel@tonic-gate 			bhdrlist = bp->av_forw;
16297c478bd9Sstevel@tonic-gate 			mutex_exit(&bhdr_lock);
16307c478bd9Sstevel@tonic-gate 
16317c478bd9Sstevel@tonic-gate 			bp->av_forw = NULL;
16327c478bd9Sstevel@tonic-gate 			return (bp);
16337c478bd9Sstevel@tonic-gate 		}
16347c478bd9Sstevel@tonic-gate 	}
16357c478bd9Sstevel@tonic-gate }
16367c478bd9Sstevel@tonic-gate 
16377c478bd9Sstevel@tonic-gate static  void
16387c478bd9Sstevel@tonic-gate bio_bhdr_free(struct buf *bp)
16397c478bd9Sstevel@tonic-gate {
16407c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_back == NULL);
16417c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_forw == NULL);
16427c478bd9Sstevel@tonic-gate 	ASSERT(bp->av_back == NULL);
16437c478bd9Sstevel@tonic-gate 	ASSERT(bp->av_forw == NULL);
16447c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr == NULL);
16457c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
16467c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_edev == NODEV);
16477c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_flags == 0);
16487c478bd9Sstevel@tonic-gate 
16497c478bd9Sstevel@tonic-gate 	mutex_enter(&bhdr_lock);
16507c478bd9Sstevel@tonic-gate 	bp->av_forw = bhdrlist;
16517c478bd9Sstevel@tonic-gate 	bhdrlist = bp;
16527c478bd9Sstevel@tonic-gate 	mutex_exit(&bhdr_lock);
16537c478bd9Sstevel@tonic-gate }
16547c478bd9Sstevel@tonic-gate 
16557c478bd9Sstevel@tonic-gate /*
16567c478bd9Sstevel@tonic-gate  * If we haven't gone over the high water mark, it's OK to
16577c478bd9Sstevel@tonic-gate  * allocate more buffer space; otherwise recycle buffers
16587c478bd9Sstevel@tonic-gate  * from the freelist until enough memory is free for a bsize request.
16597c478bd9Sstevel@tonic-gate  *
16607c478bd9Sstevel@tonic-gate  * We account for this memory, even though
16617c478bd9Sstevel@tonic-gate  * we don't allocate it here.
16627c478bd9Sstevel@tonic-gate  */
16637c478bd9Sstevel@tonic-gate static void
16647c478bd9Sstevel@tonic-gate bio_mem_get(long bsize)
16657c478bd9Sstevel@tonic-gate {
16667c478bd9Sstevel@tonic-gate 	mutex_enter(&bfree_lock);
16677c478bd9Sstevel@tonic-gate 	if (bfreelist.b_bufsize > bsize) {
16687c478bd9Sstevel@tonic-gate 		bfreelist.b_bufsize -= bsize;
16697c478bd9Sstevel@tonic-gate 		mutex_exit(&bfree_lock);
16707c478bd9Sstevel@tonic-gate 		return;
16717c478bd9Sstevel@tonic-gate 	}
16727c478bd9Sstevel@tonic-gate 	mutex_exit(&bfree_lock);
16737c478bd9Sstevel@tonic-gate 	bio_recycle(BIO_MEM, bsize);
16747c478bd9Sstevel@tonic-gate }
16757c478bd9Sstevel@tonic-gate 
16767c478bd9Sstevel@tonic-gate /*
16777c478bd9Sstevel@tonic-gate  * Flush a list of delayed write buffers.
16787c478bd9Sstevel@tonic-gate  * (Currently used only by bio_recycle() below.)
16797c478bd9Sstevel@tonic-gate  */
16807c478bd9Sstevel@tonic-gate static void
16817c478bd9Sstevel@tonic-gate bio_flushlist(struct buf *delwri_list)
16827c478bd9Sstevel@tonic-gate {
16837c478bd9Sstevel@tonic-gate 	struct buf *bp;
16847c478bd9Sstevel@tonic-gate 
16857c478bd9Sstevel@tonic-gate 	while (delwri_list != EMPTY_LIST) {
16867c478bd9Sstevel@tonic-gate 		bp = delwri_list;
16877c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_AGE | B_ASYNC;
16887c478bd9Sstevel@tonic-gate 		if (bp->b_vp == NULL) {		/* !ufs */
16897c478bd9Sstevel@tonic-gate 			BWRITE(bp);
16907c478bd9Sstevel@tonic-gate 		} else {			/* ufs */
16917c478bd9Sstevel@tonic-gate 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
16927c478bd9Sstevel@tonic-gate 		}
16937c478bd9Sstevel@tonic-gate 		delwri_list = bp->b_list;
16947c478bd9Sstevel@tonic-gate 		bp->b_list = NULL;
16957c478bd9Sstevel@tonic-gate 	}
16967c478bd9Sstevel@tonic-gate }
16977c478bd9Sstevel@tonic-gate 
16987c478bd9Sstevel@tonic-gate /*
16997c478bd9Sstevel@tonic-gate  * Start recycling buffers on the freelist for one of 2 reasons:
17007c478bd9Sstevel@tonic-gate  *	- we need a buffer header
17017c478bd9Sstevel@tonic-gate  *	- we need to free up memory
17027c478bd9Sstevel@tonic-gate  * Once started we continue to recycle buffers until the B_AGE
17037c478bd9Sstevel@tonic-gate  * buffers are gone.
17047c478bd9Sstevel@tonic-gate  */
17057c478bd9Sstevel@tonic-gate static void
17067c478bd9Sstevel@tonic-gate bio_recycle(int want, long bsize)
17077c478bd9Sstevel@tonic-gate {
17087c478bd9Sstevel@tonic-gate 	struct buf *bp, *dp, *dwp, *nbp;
17097c478bd9Sstevel@tonic-gate 	struct hbuf *hp;
17107c478bd9Sstevel@tonic-gate 	int	found = 0;
17117c478bd9Sstevel@tonic-gate 	kmutex_t	*hmp;
17127c478bd9Sstevel@tonic-gate 	int		start, end;
17137c478bd9Sstevel@tonic-gate 	struct buf *delwri_list = EMPTY_LIST;
17147c478bd9Sstevel@tonic-gate 
17157c478bd9Sstevel@tonic-gate 	/*
17167c478bd9Sstevel@tonic-gate 	 * Recycle buffers.
17177c478bd9Sstevel@tonic-gate 	 */
17187c478bd9Sstevel@tonic-gate top:
17197c478bd9Sstevel@tonic-gate 	start = end = lastindex;
17207c478bd9Sstevel@tonic-gate 	do {
17217c478bd9Sstevel@tonic-gate 		hp = &hbuf[start];
17227c478bd9Sstevel@tonic-gate 		hmp = &hp->b_lock;
17237c478bd9Sstevel@tonic-gate 		dp = (struct buf *)hp;
17247c478bd9Sstevel@tonic-gate 
17257c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
17267c478bd9Sstevel@tonic-gate 		bp = dp->av_forw;
17277c478bd9Sstevel@tonic-gate 
17287c478bd9Sstevel@tonic-gate 		while (bp != dp) {
17297c478bd9Sstevel@tonic-gate 
17307c478bd9Sstevel@tonic-gate 			ASSERT(bp != NULL);
17317c478bd9Sstevel@tonic-gate 
17327c478bd9Sstevel@tonic-gate 			if (!sema_tryp(&bp->b_sem)) {
17337c478bd9Sstevel@tonic-gate 				bp = bp->av_forw;
17347c478bd9Sstevel@tonic-gate 				continue;
17357c478bd9Sstevel@tonic-gate 			}
17367c478bd9Sstevel@tonic-gate 			/*
17377c478bd9Sstevel@tonic-gate 			 * Do we really want to nuke all of the B_AGE stuff??
17387c478bd9Sstevel@tonic-gate 			 */
17397c478bd9Sstevel@tonic-gate 			if ((bp->b_flags & B_AGE) == 0 && found) {
17407c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
17417c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
17427c478bd9Sstevel@tonic-gate 				lastindex = start;
17437c478bd9Sstevel@tonic-gate 				return;	/* All done */
17447c478bd9Sstevel@tonic-gate 			}
17457c478bd9Sstevel@tonic-gate 
17467c478bd9Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(&hp->b_lock));
17477c478bd9Sstevel@tonic-gate 			ASSERT(!(bp->b_flags & B_DELWRI));
17487c478bd9Sstevel@tonic-gate 			hp->b_length--;
17497c478bd9Sstevel@tonic-gate 			notavail(bp);
17507c478bd9Sstevel@tonic-gate 
17517c478bd9Sstevel@tonic-gate 			/*
17527c478bd9Sstevel@tonic-gate 			 * Remove bhdr from cache, free up memory,
17537c478bd9Sstevel@tonic-gate 			 * and add the hdr to the freelist.
17547c478bd9Sstevel@tonic-gate 			 */
17557c478bd9Sstevel@tonic-gate 			bremhash(bp);
17567c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
17577c478bd9Sstevel@tonic-gate 
17587c478bd9Sstevel@tonic-gate 			if (bp->b_bufsize) {
17597c478bd9Sstevel@tonic-gate 				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
17607c478bd9Sstevel@tonic-gate 				bp->b_un.b_addr = NULL;
17617c478bd9Sstevel@tonic-gate 				mutex_enter(&bfree_lock);
17627c478bd9Sstevel@tonic-gate 				bfreelist.b_bufsize += bp->b_bufsize;
17637c478bd9Sstevel@tonic-gate 				mutex_exit(&bfree_lock);
17647c478bd9Sstevel@tonic-gate 			}
17657c478bd9Sstevel@tonic-gate 
17667c478bd9Sstevel@tonic-gate 			bp->b_dev = (o_dev_t)NODEV;
17677c478bd9Sstevel@tonic-gate 			bp->b_edev = NODEV;
17687c478bd9Sstevel@tonic-gate 			bp->b_flags = 0;
17697c478bd9Sstevel@tonic-gate 			sema_v(&bp->b_sem);
17707c478bd9Sstevel@tonic-gate 			bio_bhdr_free(bp);
17717c478bd9Sstevel@tonic-gate 			if (want == BIO_HEADER) {
17727c478bd9Sstevel@tonic-gate 				found = 1;
17737c478bd9Sstevel@tonic-gate 			} else {
17747c478bd9Sstevel@tonic-gate 				ASSERT(want == BIO_MEM);
17757c478bd9Sstevel@tonic-gate 				if (!found && bfreelist.b_bufsize >= bsize) {
17767c478bd9Sstevel@tonic-gate 					/* Account for the memory we want */
17777c478bd9Sstevel@tonic-gate 					mutex_enter(&bfree_lock);
17787c478bd9Sstevel@tonic-gate 					if (bfreelist.b_bufsize >= bsize) {
17797c478bd9Sstevel@tonic-gate 						bfreelist.b_bufsize -= bsize;
17807c478bd9Sstevel@tonic-gate 						found = 1;
17817c478bd9Sstevel@tonic-gate 					}
17827c478bd9Sstevel@tonic-gate 					mutex_exit(&bfree_lock);
17837c478bd9Sstevel@tonic-gate 				}
17847c478bd9Sstevel@tonic-gate 			}
17857c478bd9Sstevel@tonic-gate 
17867c478bd9Sstevel@tonic-gate 			/*
17877c478bd9Sstevel@tonic-gate 			 * Since we dropped hmp, start again from the
17887c478bd9Sstevel@tonic-gate 			 * beginning.
17897c478bd9Sstevel@tonic-gate 			 */
17907c478bd9Sstevel@tonic-gate 			mutex_enter(hmp);
17917c478bd9Sstevel@tonic-gate 			bp = dp->av_forw;
17927c478bd9Sstevel@tonic-gate 		}
17937c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
17947c478bd9Sstevel@tonic-gate 
17957c478bd9Sstevel@tonic-gate 		/*
17967c478bd9Sstevel@tonic-gate 		 * Look at the delayed write list.
17977c478bd9Sstevel@tonic-gate 		 * First gather into a private list, then write them.
17987c478bd9Sstevel@tonic-gate 		 */
17997c478bd9Sstevel@tonic-gate 		dwp = (struct buf *)&dwbuf[start];
18007c478bd9Sstevel@tonic-gate 		mutex_enter(&blist_lock);
18017c478bd9Sstevel@tonic-gate 		bio_doingflush++;
18027c478bd9Sstevel@tonic-gate 		mutex_enter(hmp);
18037c478bd9Sstevel@tonic-gate 		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
18047c478bd9Sstevel@tonic-gate 
18057c478bd9Sstevel@tonic-gate 			ASSERT(bp != NULL);
18067c478bd9Sstevel@tonic-gate 			nbp = bp->av_forw;
18077c478bd9Sstevel@tonic-gate 
18087c478bd9Sstevel@tonic-gate 			if (!sema_tryp(&bp->b_sem))
18097c478bd9Sstevel@tonic-gate 				continue;
18107c478bd9Sstevel@tonic-gate 			ASSERT(bp->b_flags & B_DELWRI);
18117c478bd9Sstevel@tonic-gate 			/*
18127c478bd9Sstevel@tonic-gate 			 * Do we really want to nuke all of the B_AGE stuff??
18137c478bd9Sstevel@tonic-gate 			 */
18147c478bd9Sstevel@tonic-gate 
18157c478bd9Sstevel@tonic-gate 			if ((bp->b_flags & B_AGE) == 0 && found) {
18167c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
18177c478bd9Sstevel@tonic-gate 				mutex_exit(hmp);
18187c478bd9Sstevel@tonic-gate 				lastindex = start;
18197c478bd9Sstevel@tonic-gate 				mutex_exit(&blist_lock);
18207c478bd9Sstevel@tonic-gate 				bio_flushlist(delwri_list);
18217c478bd9Sstevel@tonic-gate 				mutex_enter(&blist_lock);
18227c478bd9Sstevel@tonic-gate 				bio_doingflush--;
18237c478bd9Sstevel@tonic-gate 				if (bio_flinv_cv_wanted) {
18247c478bd9Sstevel@tonic-gate 					bio_flinv_cv_wanted = 0;
18257c478bd9Sstevel@tonic-gate 					cv_broadcast(&bio_flushinval_cv);
18267c478bd9Sstevel@tonic-gate 				}
18277c478bd9Sstevel@tonic-gate 				mutex_exit(&blist_lock);
18287c478bd9Sstevel@tonic-gate 				return; /* All done */
18297c478bd9Sstevel@tonic-gate 			}
18307c478bd9Sstevel@tonic-gate 
18317c478bd9Sstevel@tonic-gate 			/*
18327c478bd9Sstevel@tonic-gate 			 * If the buffer is already on a flush or
18337c478bd9Sstevel@tonic-gate 			 * invalidate list then just skip it.
18347c478bd9Sstevel@tonic-gate 			 */
18357c478bd9Sstevel@tonic-gate 			if (bp->b_list != NULL) {
18367c478bd9Sstevel@tonic-gate 				sema_v(&bp->b_sem);
18377c478bd9Sstevel@tonic-gate 				continue;
18387c478bd9Sstevel@tonic-gate 			}
18397c478bd9Sstevel@tonic-gate 			/*
18407c478bd9Sstevel@tonic-gate 			 * We are still on the same bucket.
18417c478bd9Sstevel@tonic-gate 			 */
18427c478bd9Sstevel@tonic-gate 			hp->b_length--;
18437c478bd9Sstevel@tonic-gate 			notavail(bp);
18447c478bd9Sstevel@tonic-gate 			bp->b_list = delwri_list;
18457c478bd9Sstevel@tonic-gate 			delwri_list = bp;
18467c478bd9Sstevel@tonic-gate 		}
18477c478bd9Sstevel@tonic-gate 		mutex_exit(hmp);
18487c478bd9Sstevel@tonic-gate 		mutex_exit(&blist_lock);
18497c478bd9Sstevel@tonic-gate 		bio_flushlist(delwri_list);
18507c478bd9Sstevel@tonic-gate 		delwri_list = EMPTY_LIST;
18517c478bd9Sstevel@tonic-gate 		mutex_enter(&blist_lock);
18527c478bd9Sstevel@tonic-gate 		bio_doingflush--;
18537c478bd9Sstevel@tonic-gate 		if (bio_flinv_cv_wanted) {
18547c478bd9Sstevel@tonic-gate 			bio_flinv_cv_wanted = 0;
18557c478bd9Sstevel@tonic-gate 			cv_broadcast(&bio_flushinval_cv);
18567c478bd9Sstevel@tonic-gate 		}
18577c478bd9Sstevel@tonic-gate 		mutex_exit(&blist_lock);
18587c478bd9Sstevel@tonic-gate 		start = (start + 1) % v.v_hbuf;
18597c478bd9Sstevel@tonic-gate 
18607c478bd9Sstevel@tonic-gate 	} while (start != end);
18617c478bd9Sstevel@tonic-gate 
18627c478bd9Sstevel@tonic-gate 	if (found)
18637c478bd9Sstevel@tonic-gate 		return;
18647c478bd9Sstevel@tonic-gate 
18657c478bd9Sstevel@tonic-gate 	/*
18667c478bd9Sstevel@tonic-gate 	 * Free lists are exhausted and we haven't satisfied the request.
18677c478bd9Sstevel@tonic-gate 	 * Wait here for more entries to be added to the freelist.
18687c478bd9Sstevel@tonic-gate 	 * Because that might have just happened, make the wait timed.
18697c478bd9Sstevel@tonic-gate 	 */
18707c478bd9Sstevel@tonic-gate 	mutex_enter(&bfree_lock);
18717c478bd9Sstevel@tonic-gate 	bfreelist.b_flags |= B_WANTED;
1872d3d50737SRafael Vanoni 	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
18737c478bd9Sstevel@tonic-gate 	mutex_exit(&bfree_lock);
18747c478bd9Sstevel@tonic-gate 	goto top;
18757c478bd9Sstevel@tonic-gate }
18767c478bd9Sstevel@tonic-gate 
18777c478bd9Sstevel@tonic-gate /*
18787c478bd9Sstevel@tonic-gate  * See if the block is associated with some buffer
18797c478bd9Sstevel@tonic-gate  * (mainly to avoid getting hung up on a wait in breada).
18807c478bd9Sstevel@tonic-gate  */
18817c478bd9Sstevel@tonic-gate static int
18827c478bd9Sstevel@tonic-gate bio_incore(dev_t dev, daddr_t blkno)
18837c478bd9Sstevel@tonic-gate {
18847c478bd9Sstevel@tonic-gate 	struct buf *bp;
18857c478bd9Sstevel@tonic-gate 	struct buf *dp;
18867c478bd9Sstevel@tonic-gate 	uint_t index;
18877c478bd9Sstevel@tonic-gate 	kmutex_t *hmp;
18887c478bd9Sstevel@tonic-gate 
18897c478bd9Sstevel@tonic-gate 	index = bio_bhash(dev, blkno);
18907c478bd9Sstevel@tonic-gate 	dp = (struct buf *)&hbuf[index];
18917c478bd9Sstevel@tonic-gate 	hmp = &hbuf[index].b_lock;
18927c478bd9Sstevel@tonic-gate 
18937c478bd9Sstevel@tonic-gate 	mutex_enter(hmp);
18947c478bd9Sstevel@tonic-gate 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
18957c478bd9Sstevel@tonic-gate 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
18967c478bd9Sstevel@tonic-gate 		    (bp->b_flags & B_STALE) == 0) {
18977c478bd9Sstevel@tonic-gate 			mutex_exit(hmp);
18987c478bd9Sstevel@tonic-gate 			return (1);
18997c478bd9Sstevel@tonic-gate 		}
19007c478bd9Sstevel@tonic-gate 	}
19017c478bd9Sstevel@tonic-gate 	mutex_exit(hmp);
19027c478bd9Sstevel@tonic-gate 	return (0);
19037c478bd9Sstevel@tonic-gate }
19047c478bd9Sstevel@tonic-gate 
19057c478bd9Sstevel@tonic-gate static void
19067c478bd9Sstevel@tonic-gate bio_pageio_done(struct buf *bp)
19077c478bd9Sstevel@tonic-gate {
19087c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_PAGEIO) {
19097c478bd9Sstevel@tonic-gate 
19107c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_REMAPPED)
19117c478bd9Sstevel@tonic-gate 			bp_mapout(bp);
19127c478bd9Sstevel@tonic-gate 
19137c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_READ)
19147c478bd9Sstevel@tonic-gate 			pvn_read_done(bp->b_pages, bp->b_flags);
19157c478bd9Sstevel@tonic-gate 		else
19167c478bd9Sstevel@tonic-gate 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
19177c478bd9Sstevel@tonic-gate 		pageio_done(bp);
19187c478bd9Sstevel@tonic-gate 	} else {
19197c478bd9Sstevel@tonic-gate 		ASSERT(bp->b_flags & B_REMAPPED);
19207c478bd9Sstevel@tonic-gate 		bp_mapout(bp);
19217c478bd9Sstevel@tonic-gate 		brelse(bp);
19227c478bd9Sstevel@tonic-gate 	}
19237c478bd9Sstevel@tonic-gate }
19247c478bd9Sstevel@tonic-gate 
19257c478bd9Sstevel@tonic-gate /*
19267c478bd9Sstevel@tonic-gate  * bioerror(9F) - indicate error in buffer header
19277c478bd9Sstevel@tonic-gate  * If 'error' is zero, remove the error indication.
19287c478bd9Sstevel@tonic-gate  */
19297c478bd9Sstevel@tonic-gate void
19307c478bd9Sstevel@tonic-gate bioerror(struct buf *bp, int error)
19317c478bd9Sstevel@tonic-gate {
19327c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
19337c478bd9Sstevel@tonic-gate 	ASSERT(error >= 0);
19347c478bd9Sstevel@tonic-gate 	ASSERT(SEMA_HELD(&bp->b_sem));
19357c478bd9Sstevel@tonic-gate 
19367c478bd9Sstevel@tonic-gate 	if (error != 0) {
19377c478bd9Sstevel@tonic-gate 		bp->b_flags |= B_ERROR;
19387c478bd9Sstevel@tonic-gate 	} else {
19397c478bd9Sstevel@tonic-gate 		bp->b_flags &= ~B_ERROR;
19407c478bd9Sstevel@tonic-gate 	}
19417c478bd9Sstevel@tonic-gate 	bp->b_error = error;
19427c478bd9Sstevel@tonic-gate }
19437c478bd9Sstevel@tonic-gate 
19447c478bd9Sstevel@tonic-gate /*
19457c478bd9Sstevel@tonic-gate  * bioreset(9F) - reuse a private buffer header after I/O is complete
19467c478bd9Sstevel@tonic-gate  */
19477c478bd9Sstevel@tonic-gate void
19487c478bd9Sstevel@tonic-gate bioreset(struct buf *bp)
19497c478bd9Sstevel@tonic-gate {
19507c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
19517c478bd9Sstevel@tonic-gate 
19527c478bd9Sstevel@tonic-gate 	biofini(bp);
19537c478bd9Sstevel@tonic-gate 	bioinit(bp);
19547c478bd9Sstevel@tonic-gate }
19557c478bd9Sstevel@tonic-gate 
19567c478bd9Sstevel@tonic-gate /*
19577c478bd9Sstevel@tonic-gate  * biosize(9F) - return size of a buffer header
19587c478bd9Sstevel@tonic-gate  */
19597c478bd9Sstevel@tonic-gate size_t
19607c478bd9Sstevel@tonic-gate biosize(void)
19617c478bd9Sstevel@tonic-gate {
19627c478bd9Sstevel@tonic-gate 	return (sizeof (struct buf));
19637c478bd9Sstevel@tonic-gate }
19647c478bd9Sstevel@tonic-gate 
19657c478bd9Sstevel@tonic-gate /*
19667c478bd9Sstevel@tonic-gate  * biomodified(9F) - check if buffer is modified
19677c478bd9Sstevel@tonic-gate  */
19687c478bd9Sstevel@tonic-gate int
19697c478bd9Sstevel@tonic-gate biomodified(struct buf *bp)
19707c478bd9Sstevel@tonic-gate {
19717c478bd9Sstevel@tonic-gate 	int npf;
19727c478bd9Sstevel@tonic-gate 	int ppattr;
19737c478bd9Sstevel@tonic-gate 	struct page *pp;
19747c478bd9Sstevel@tonic-gate 
19757c478bd9Sstevel@tonic-gate 	ASSERT(bp != NULL);
19767c478bd9Sstevel@tonic-gate 
19777c478bd9Sstevel@tonic-gate 	if ((bp->b_flags & B_PAGEIO) == 0) {
19787c478bd9Sstevel@tonic-gate 		return (-1);
19797c478bd9Sstevel@tonic-gate 	}
19807c478bd9Sstevel@tonic-gate 	pp = bp->b_pages;
19817c478bd9Sstevel@tonic-gate 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
19827c478bd9Sstevel@tonic-gate 
19837c478bd9Sstevel@tonic-gate 	while (npf > 0) {
19847c478bd9Sstevel@tonic-gate 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
19857c478bd9Sstevel@tonic-gate 		    HAT_SYNC_STOPON_MOD);
19867c478bd9Sstevel@tonic-gate 		if (ppattr & P_MOD)
19877c478bd9Sstevel@tonic-gate 			return (1);
19887c478bd9Sstevel@tonic-gate 		pp = pp->p_next;
19897c478bd9Sstevel@tonic-gate 		npf--;
19907c478bd9Sstevel@tonic-gate 	}
19917c478bd9Sstevel@tonic-gate 
19927c478bd9Sstevel@tonic-gate 	return (0);
19937c478bd9Sstevel@tonic-gate }
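
/*
 * Example (illustrative sketch): interpreting biomodified()'s three return
 * values.  It only makes sense for page-backed (B_PAGEIO) buffers; -1 means
 * the question does not apply, 1 means at least one underlying page has
 * been modified, and 0 means none have.
 *
 *	int mod;
 *
 *	mod = biomodified(bp);
 *	if (mod == -1)
 *		cmn_err(CE_NOTE, "bp is not a paged buffer");
 *	else if (mod == 1)
 *		cmn_err(CE_NOTE, "pages were dirtied under this buffer");
 */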
19947c478bd9Sstevel@tonic-gate 
19957c478bd9Sstevel@tonic-gate /*
19967c478bd9Sstevel@tonic-gate  * bioinit(9F) - initialize a buffer structure
19977c478bd9Sstevel@tonic-gate  */
19987c478bd9Sstevel@tonic-gate void
19997c478bd9Sstevel@tonic-gate bioinit(struct buf *bp)
20007c478bd9Sstevel@tonic-gate {
20017c478bd9Sstevel@tonic-gate 	bzero(bp, sizeof (struct buf));
20027c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
20037c478bd9Sstevel@tonic-gate 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
20047c478bd9Sstevel@tonic-gate 	bp->b_offset = -1;
20057c478bd9Sstevel@tonic-gate }
20067c478bd9Sstevel@tonic-gate 
20077c478bd9Sstevel@tonic-gate /*
20087c478bd9Sstevel@tonic-gate  * biofini(9F) - uninitialize a buffer structure
20097c478bd9Sstevel@tonic-gate  */
20107c478bd9Sstevel@tonic-gate void
20117c478bd9Sstevel@tonic-gate biofini(struct buf *bp)
20127c478bd9Sstevel@tonic-gate {
20137c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_io);
20147c478bd9Sstevel@tonic-gate 	sema_destroy(&bp->b_sem);
20157c478bd9Sstevel@tonic-gate }
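
/*
 * Example (illustrative sketch): the lifecycle of a driver-private buf
 * built from biosize()/bioinit()/biofini(), which is essentially what
 * getrbuf(9F)/freerbuf(9F) do on the caller's behalf.  A buffer that will
 * be reused for several transfers can call bioreset() between them instead
 * of tearing it down.  dev, blkno and kaddr are assumed to be provided by
 * the caller.
 *
 *	struct buf *bp;
 *
 *	bp = kmem_alloc(biosize(), KM_SLEEP);
 *	bioinit(bp);
 *	bp->b_flags = B_WRITE | B_BUSY;
 *	bp->b_edev = dev;
 *	bp->b_lblkno = blkno;
 *	bp->b_bcount = DEV_BSIZE;
 *	bp->b_un.b_addr = kaddr;
 *	(void) bdev_strategy(bp);
 *	(void) biowait(bp);
 *	biofini(bp);
 *	kmem_free(bp, biosize());
 */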
20167c478bd9Sstevel@tonic-gate 
20177c478bd9Sstevel@tonic-gate /*
20187c478bd9Sstevel@tonic-gate  * bioclone(9F) - clone a buffer
20197c478bd9Sstevel@tonic-gate  */
20207c478bd9Sstevel@tonic-gate struct buf *
20217c478bd9Sstevel@tonic-gate bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
20227c478bd9Sstevel@tonic-gate     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
20237c478bd9Sstevel@tonic-gate {
20247c478bd9Sstevel@tonic-gate 	struct buf *bufp;
20257c478bd9Sstevel@tonic-gate 
20267c478bd9Sstevel@tonic-gate 	ASSERT(bp);
20277c478bd9Sstevel@tonic-gate 	if (bp_mem == NULL) {
20287c478bd9Sstevel@tonic-gate 		bufp = kmem_alloc(sizeof (struct buf), sleep);
20297c478bd9Sstevel@tonic-gate 		if (bufp == NULL) {
20307c478bd9Sstevel@tonic-gate 			return (NULL);
20317c478bd9Sstevel@tonic-gate 		}
20327c478bd9Sstevel@tonic-gate 		bioinit(bufp);
20337c478bd9Sstevel@tonic-gate 	} else {
20347c478bd9Sstevel@tonic-gate 		bufp = bp_mem;
20357c478bd9Sstevel@tonic-gate 		bioreset(bufp);
20367c478bd9Sstevel@tonic-gate 	}
20377c478bd9Sstevel@tonic-gate 
20387c478bd9Sstevel@tonic-gate #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
20397c478bd9Sstevel@tonic-gate 	B_ABRWRITE)
20407c478bd9Sstevel@tonic-gate 
20417c478bd9Sstevel@tonic-gate 	/*
20426f84fed5Scth 	 * The cloned buffer does not inherit the B_REMAPPED flag.
20437c478bd9Sstevel@tonic-gate 	 */
20447c478bd9Sstevel@tonic-gate 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
20457c478bd9Sstevel@tonic-gate 	bufp->b_bcount = len;
20467c478bd9Sstevel@tonic-gate 	bufp->b_blkno = blkno;
20477c478bd9Sstevel@tonic-gate 	bufp->b_iodone = iodone;
20487c478bd9Sstevel@tonic-gate 	bufp->b_proc = bp->b_proc;
20497c478bd9Sstevel@tonic-gate 	bufp->b_edev = dev;
20507c478bd9Sstevel@tonic-gate 	bufp->b_file = bp->b_file;
20517c478bd9Sstevel@tonic-gate 	bufp->b_offset = bp->b_offset;
20527c478bd9Sstevel@tonic-gate 
20537c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_SHADOW) {
20547c478bd9Sstevel@tonic-gate 		ASSERT(bp->b_shadow);
20557c478bd9Sstevel@tonic-gate 		ASSERT(bp->b_flags & B_PHYS);
20567c478bd9Sstevel@tonic-gate 
20577c478bd9Sstevel@tonic-gate 		bufp->b_shadow = bp->b_shadow +
20587c478bd9Sstevel@tonic-gate 		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
20597c478bd9Sstevel@tonic-gate 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
20606f84fed5Scth 		if (bp->b_flags & B_REMAPPED)
20616f84fed5Scth 			bufp->b_proc = NULL;
20627c478bd9Sstevel@tonic-gate 	} else {
20637c478bd9Sstevel@tonic-gate 		if (bp->b_flags & B_PAGEIO) {
20647c478bd9Sstevel@tonic-gate 			struct page *pp;
20657c478bd9Sstevel@tonic-gate 			off_t o;
20667c478bd9Sstevel@tonic-gate 			int i;
20677c478bd9Sstevel@tonic-gate 
20687c478bd9Sstevel@tonic-gate 			pp = bp->b_pages;
20697c478bd9Sstevel@tonic-gate 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
20707c478bd9Sstevel@tonic-gate 			for (i = btop(o); i > 0; i--) {
20717c478bd9Sstevel@tonic-gate 				pp = pp->p_next;
20727c478bd9Sstevel@tonic-gate 			}
20737c478bd9Sstevel@tonic-gate 			bufp->b_pages = pp;
20747c478bd9Sstevel@tonic-gate 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
20757c478bd9Sstevel@tonic-gate 		} else {
20767c478bd9Sstevel@tonic-gate 			bufp->b_un.b_addr =
20777c478bd9Sstevel@tonic-gate 			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
20787c478bd9Sstevel@tonic-gate 			if (bp->b_flags & B_REMAPPED)
20797c478bd9Sstevel@tonic-gate 				bufp->b_proc = NULL;
20807c478bd9Sstevel@tonic-gate 		}
20817c478bd9Sstevel@tonic-gate 	}
20827c478bd9Sstevel@tonic-gate 	return (bufp);
20837c478bd9Sstevel@tonic-gate }
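
/*
 * Example (illustrative sketch): using bioclone() to carve a child request
 * out of a parent buffer, e.g. to redirect the first half of pbp to a
 * different device/offset.  For simplicity the child is issued
 * synchronously with a NULL iodone; a striping driver would instead pass
 * an iodone routine that completes the parent once all children are done.
 * pbp, dev and blkno are assumed to be provided by the caller.
 *
 *	struct buf *cbp;
 *	int err;
 *
 *	cbp = bioclone(pbp, 0, pbp->b_bcount / 2, dev, blkno,
 *	    NULL, NULL, KM_SLEEP);
 *	(void) bdev_strategy(cbp);
 *	err = biowait(cbp);
 *	biofini(cbp);
 *	kmem_free(cbp, biosize());
 */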
2084