xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2019 Joyent, Inc.
25  */
26 
27 /*
28  * Copyright (c) 2016 by Delphix. All rights reserved.
29  */
30 
31 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
32 /*	  All Rights Reserved	*/
33 
34 /*
35  * University Copyright- Copyright (c) 1982, 1986, 1988
36  * The Regents of the University of California
37  * All Rights Reserved
38  *
39  * University Acknowledgment- Portions of this document are derived from
40  * software developed by the University of California, Berkeley, and its
41  * contributors.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
47 #include <sys/conf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/buf.h>
52 #include <sys/var.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kmem.h>
57 #include <sys/vmem.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <sys/vtrace.h>
63 #include <sys/fs/ufs_inode.h>
64 #include <sys/fs/ufs_bio.h>
65 #include <sys/fs/ufs_log.h>
66 #include <sys/systm.h>
67 #include <sys/vfs.h>
68 #include <sys/sdt.h>
69 
/*
 * Locks
 *
 * blist_lock is held by bflush() and bfinval() while they chain candidate
 * buffers together through b_list, and while they test or update the
 * bio_doingflush / bio_doinginval / bio_flinv_cv_wanted state below.
 * bfree_lock is held around updates to bfreelist (e.g. clearing B_WANTED
 * in brelse() and adjusting bfreelist.b_bufsize in getblk_common()).
 */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

/*
 * hbuf[] and dwbuf[] are parallel arrays indexed by bio_bhash(): for a
 * given index, hbuf[index] heads the hash chain and the free list, while
 * dwbuf[index] heads the delayed-write list.  Both lists of one index are
 * manipulated under hbuf[index].b_lock.
 */
struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/*
 * Map a (device, block number) pair to a bucket index by hashing the two
 * values and masking the result with v.v_hmask.
 */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/*
 * Terminator for the private b_list chains built by bflush() and
 * bfinval().  It is deliberately not NULL: a NULL b_list is what those
 * routines test for to decide that a buffer is not yet on any chain.
 */
#define	EMPTY_LIST	((struct buf *)-1)

/*
 * bio_mem_cv is broadcast from brelse() when bfreelist has B_WANTED set.
 * bio_flushinval_cv is waited on (under blist_lock) by bflush() and
 * bfinval() and is broadcast when one of them finishes.
 */
static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
91 
/*
 * Statistics on the buffer cache
 *
 * Each entry is a named 32-bit kstat counter.  getblk_common() bumps the
 * bio_lookup, bio_hit, bio_bufbusy and bio_bufdup members without taking
 * a lock specifically for the counter.
 * NOTE(review): the pairing of member names with the strings below is
 * determined by the struct biostats declaration, which is not in this
 * file -- confirm the order there before adding or reordering entries.
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};
103 
/*
 * kstat data
 *
 * biostats_ptr exposes the biostats structure as a flat array of
 * kstat_named_t, and biostats_ndata is the number of elements in that
 * array, derived from the structure size so that it tracks any entries
 * added to biostats.
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));
110 
/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 *
 * Within this file only two of these are updated: ub_breads is bumped by
 * bread_common() and ub_bwrites by bwrite_common(), in both cases only on
 * the plain-UFS path (a ufsvfs was supplied but neither the logging nor
 * the snapshot strategy routine was used).
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
125 
/*
 * More UFS logging eccentricities...
 *
 * These hooks are needed because "#pragma weak ..." doesn't work in
 * reverse order, i.e. genunix (bio.c) is loaded before the ufs modules,
 * so pointers to ufs routines don't get plugged into the calls made from
 * bio.c.  Instead the pointers are initialized when the "lufsops" table
 * is set up in "lufs.c:_init()".
 *
 * Both pointers may therefore be NULL; bread_common() and bwrite_common()
 * check for that before calling through them and otherwise fall back to
 * bdev_strategy().
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);
137 
138 
139 /* Private routines */
140 static struct buf	*bio_getfreeblk(long);
141 static void		bio_mem_get(long);
142 static void		bio_bhdr_free(struct buf *);
143 static struct buf	*bio_bhdr_alloc(void);
144 static void		bio_recycle(int, long);
145 static void		bio_pageio_done(struct buf *);
146 static int		bio_incore(dev_t, daddr_t);
147 
/*
 * Buffer cache constants
 *
 * Despite their names, BIO_BUF_PERCENT and BIO_MAX_PERCENT are divisors,
 * not percentages: binit() computes physmem / BIO_MAX_PERCENT and
 * physmem / pct (where pct defaults to BIO_BUF_PERCENT).  Hence
 * (100/2) == 50 yields 1/50th (2%) of memory and (100/20) == 5 yields
 * 1/5th (20%).
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
/*
 * Space for BIO_MIN_HDR buffers of MAXBSIZE bytes each, scaled down by
 * 1024 (presumably kilobytes, to match bufhwm -- TODO confirm at the
 * point of use).
 */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle(): which resource the caller is short of */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02
162 
163 extern	int bufhwm;		/* User tunable - high water mark for mem  */
164 extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
165 
166 /*
167  * The following routines allocate and free
168  * buffers with various side effects.  In general the
169  * arguments to an allocate routine are a device and
170  * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
172  * binary semaphore so that no one else can touch it. If the block was
173  * already in core, no I/O need be done; if it is
174  * already locked, the process waits until it becomes free.
175  * The following routines allocate a buffer:
176  *	getblk
177  *	bread/BREAD
178  *	breada
179  * Eventually the buffer must be released, possibly with the
180  * side effect of writing it out, by using one of
181  *	bwrite/BWRITE/brwrite
182  *	bdwrite/bdrwrite
183  *	bawrite
184  *	brelse
185  *
186  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
187  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
188  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
189  * B_DONE is still used to denote a buffer with I/O complete on it.
190  *
 * The bfreelist.b_bcount field is recomputed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
194  */
195 
196 /*
197  * Read in (if necessary) the block and return a buffer pointer.
198  *
199  * This interface is provided for binary compatibility.  Using
200  * BREAD() directly avoids the extra function call overhead invoked
201  * by calling this routine.
202  */
203 struct buf *
204 bread(dev_t dev, daddr_t blkno, long bsize)
205 {
206 	return (BREAD(dev, blkno, bsize));
207 }
208 
209 /*
210  * Common code for reading a buffer with various options
211  *
212  * Read in (if necessary) the block and return a buffer pointer.
213  */
214 struct buf *
215 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
216 {
217 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
218 	struct buf *bp;
219 	klwp_t *lwp = ttolwp(curthread);
220 
221 	CPU_STATS_ADD_K(sys, lread, 1);
222 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
223 	if (bp->b_flags & B_DONE)
224 		return (bp);
225 	bp->b_flags |= B_READ;
226 	ASSERT(bp->b_bcount == bsize);
227 	if (ufsvfsp == NULL) {					/* !ufs */
228 		(void) bdev_strategy(bp);
229 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
230 							/* ufs && logging */
231 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
232 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
233 							/* ufs && snapshots */
234 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
235 	} else {
236 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
237 		ub.ub_breads.value.ul++;		/* ufs && !logging */
238 		(void) bdev_strategy(bp);
239 	}
240 	if (lwp != NULL)
241 		lwp->lwp_ru.inblock++;
242 	CPU_STATS_ADD_K(sys, bread, 1);
243 	(void) biowait(bp);
244 	return (bp);
245 }
246 
247 /*
248  * Read in the block, like bread, but also start I/O on the
249  * read-ahead block (which is not allocated to the caller).
250  */
251 struct buf *
252 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
253 {
254 	struct buf *bp, *rabp;
255 	klwp_t *lwp = ttolwp(curthread);
256 
257 	bp = NULL;
258 	if (!bio_incore(dev, blkno)) {
259 		CPU_STATS_ADD_K(sys, lread, 1);
260 		bp = GETBLK(dev, blkno, bsize);
261 		if ((bp->b_flags & B_DONE) == 0) {
262 			bp->b_flags |= B_READ;
263 			bp->b_bcount = bsize;
264 			(void) bdev_strategy(bp);
265 			if (lwp != NULL)
266 				lwp->lwp_ru.inblock++;
267 			CPU_STATS_ADD_K(sys, bread, 1);
268 		}
269 	}
270 	if (rablkno && bfreelist.b_bcount > 1 &&
271 	    !bio_incore(dev, rablkno)) {
272 		rabp = GETBLK(dev, rablkno, bsize);
273 		if (rabp->b_flags & B_DONE)
274 			brelse(rabp);
275 		else {
276 			rabp->b_flags |= B_READ|B_ASYNC;
277 			rabp->b_bcount = bsize;
278 			(void) bdev_strategy(rabp);
279 			if (lwp != NULL)
280 				lwp->lwp_ru.inblock++;
281 			CPU_STATS_ADD_K(sys, bread, 1);
282 		}
283 	}
284 	if (bp == NULL)
285 		return (BREAD(dev, blkno, bsize));
286 	(void) biowait(bp);
287 	return (bp);
288 }
289 
290 /*
291  * Common code for writing a buffer with various options.
292  *
293  * force_wait  - wait for write completion regardless of B_ASYNC flag
294  * do_relse    - release the buffer when we are done
295  * clear_flags - flags to clear from the buffer
296  */
297 void
298 bwrite_common(void *arg, struct buf *bp, int force_wait,
299     int do_relse, int clear_flags)
300 {
301 	register int do_wait;
302 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
303 	int flag;
304 	klwp_t *lwp = ttolwp(curthread);
305 	struct cpu *cpup;
306 
307 	ASSERT(SEMA_HELD(&bp->b_sem));
308 	flag = bp->b_flags;
309 	bp->b_flags &= ~clear_flags;
310 	if (lwp != NULL)
311 		lwp->lwp_ru.oublock++;
312 	CPU_STATS_ENTER_K();
313 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
314 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
315 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
316 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
317 	if (do_wait == 0)
318 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
319 	CPU_STATS_EXIT_K();
320 	if (ufsvfsp == NULL) {
321 		(void) bdev_strategy(bp);
322 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
323 							/* ufs && logging */
324 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
325 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
326 							/* ufs && snapshots */
327 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
328 	} else {
329 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
330 		(void) bdev_strategy(bp);
331 	}
332 	if (do_wait) {
333 		(void) biowait(bp);
334 		if (do_relse) {
335 			brelse(bp);
336 		}
337 	}
338 }
339 
/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 *
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 *
 * NOTE(review): BWRITE() is defined outside this file; the behavior
 * described above is that macro's contract -- presumably it expands to a
 * bwrite_common() call, in which case the caller must hold bp->b_sem.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
352 
/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 *
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 *
 * NOTE(review): BWRITE2() is defined outside this file; the behavior
 * described above is that macro's contract.  Since the buffer is not
 * released here, the caller remains responsible for releasing it.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
364 
365 /*
366  * Release the buffer, marking it so that if it is grabbed
367  * for another purpose it will be written out before being
368  * given up (e.g. when writing a partial block where it is
369  * assumed that another write for the same block will soon follow).
370  * Also save the time that the block is first marked as delayed
371  * so that it will be written in a reasonable time.
372  */
373 void
374 bdwrite(struct buf *bp)
375 {
376 	ASSERT(SEMA_HELD(&bp->b_sem));
377 	CPU_STATS_ADD_K(sys, lwrite, 1);
378 	if ((bp->b_flags & B_DELWRI) == 0)
379 		bp->b_start = ddi_get_lbolt();
380 	/*
381 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
382 	 * buffer to be written before being reused, and setting b_resid
383 	 * to zero says the buffer is complete.
384 	 */
385 	bp->b_flags |= B_DELWRI | B_DONE;
386 	bp->b_resid = 0;
387 	brelse(bp);
388 }
389 
390 /*
391  * Release the buffer, start I/O on it, but don't wait for completion.
392  */
393 void
394 bawrite(struct buf *bp)
395 {
396 	ASSERT(SEMA_HELD(&bp->b_sem));
397 
398 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
399 	if (bfreelist.b_bcount > 4)
400 		bp->b_flags |= B_ASYNC;
401 	BWRITE(bp);
402 }
403 
/*
 * Release the buffer, with no I/O implied.
 *
 * The caller must hold bp->b_sem; it is dropped here as the very last
 * step.  Depending on the buffer's flags the buffer is either:
 *	- destroyed outright (B_NOCACHE), in which case b_sem is destroyed
 *	  rather than released;
 *	- placed on the delayed-write list of its hash bucket (B_DELWRI); or
 *	- placed on the free list of its hash bucket.
 * In the latter two cases B_AGE selects insertion at the front of the
 * list instead of the back.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/*
			 * Don't add to the freelist. Destroy it now:
			 * free the data area, tear down both semaphores
			 * and free the header itself.
			 */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 * Any other failure makes the contents untrustworthy, so
		 * the buffer is marked stale and aged for early reuse.
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		/* the error has been dealt with; don't report it again */
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;		/* default: head of the free list */

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	/*
	 * b_length lives in the hbuf bucket and is bumped whether the
	 * buffer ends up on the free list or the delayed-write list.
	 */
	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		/* insert right after the list head (front of the list) */
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		/* insert right before the list head (back of the list) */
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	/*
	 * Wake anyone waiting on bio_mem_cv.  B_WANTED is first tested
	 * without the lock to keep the common path cheap, then tested
	 * again under bfree_lock before it is cleared.
	 */
	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
519 
520 /*
521  * Return a count of the number of B_BUSY buffers in the system
522  * Can only be used as a good estimate.  If 'cleanit' is set,
523  * try to flush all bufs.
524  */
525 int
526 bio_busy(int cleanit)
527 {
528 	struct buf *bp, *dp;
529 	int busy = 0;
530 	int i;
531 	kmutex_t *hmp;
532 
533 	for (i = 0; i < v.v_hbuf; i++) {
534 		dp = (struct buf *)&hbuf[i];
535 		hmp = &hbuf[i].b_lock;
536 
537 		mutex_enter(hmp);
538 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
539 			if (bp->b_flags & B_BUSY)
540 				busy++;
541 		}
542 		mutex_exit(hmp);
543 	}
544 
545 	if (cleanit && busy != 0) {
546 		bflush(NODEV);
547 	}
548 
549 	return (busy);
550 }
551 
552 /*
553  * this interface is provided for binary compatibility.
554  *
555  * Assign a buffer for the given block.  If the appropriate
556  * block is already associated, return it; otherwise search
557  * for the oldest non-busy buffer and reassign it.
558  */
559 struct buf *
560 getblk(dev_t dev, daddr_t blkno, long bsize)
561 {
562 	return (getblk_common(/* ufsvfsp */ NULL, dev,
563 	    blkno, bsize, /* errflg */ 0));
564 }
565 
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * arg    - NULL, or the struct ufsvfs on whose behalf the buffer is used
 * dev    - device the block belongs to
 * blkno  - block number on that device
 * bsize  - size of the block in bytes
 * errflg - non-zero if the caller can cope with an error-reply buffer;
 *	    only consulted when the system is panicking and the wanted
 *	    buffer is held by someone else
 *
 * The returned buffer has its b_sem held on behalf of the caller and is
 * not on the free or delayed-write list.  Panics if dev's major number
 * is out of range.
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;		/* freshly allocated buffer, if any */
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	mutex_enter(hmp);
loop:
	/* Search the hash chain; the hash lock is held at this point. */
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone. Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			/*
			 * The buffer may have been reassigned while we
			 * slept; re-validate its identity.
			 */
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * Not in the cache.  bio_getfreeblk may block so check the hash
	 * chain again once the new buffer is in hand; the second pass
	 * arrives below with nbp set.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error. At this point we couldn't
	 * get a buffer, but we have to return one. Hence we allocate some
	 * kind of error reply buffer on the fly. This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 * B_NOCACHE and B_AGE are set by ngeteblk() (via geteblk()); only
	 * B_ERROR and B_DONE are added here, and B_BUSY is cleared.
	 * This path is reached only while panicking, where (per the comment
	 * at the goto) sema_p() does not block.
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}
747 
748 /*
749  * Get an empty block, not assigned to any particular device.
750  * Returns a locked buffer that is not on any hash or free list.
751  */
752 struct buf *
753 ngeteblk(long bsize)
754 {
755 	struct buf *bp;
756 
757 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
758 	bioinit(bp);
759 	bp->av_forw = bp->av_back = NULL;
760 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
761 	bp->b_bufsize = bsize;
762 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
763 	bp->b_dev = (o_dev_t)NODEV;
764 	bp->b_edev = NODEV;
765 	bp->b_lblkno = 0;
766 	bp->b_bcount = bsize;
767 	bp->b_iodone = NULL;
768 	return (bp);
769 }
770 
/*
 * Allocate an empty 1 KB buffer.
 *
 * The argument-less interface is retained for driver compatibility;
 * callers that need any other block size should use ngeteblk().
 */
struct buf *
geteblk(void)
{
	long bsize = 1024;

	return (ngeteblk(bsize));
}
780 
781 /*
782  * Return a buffer w/o sleeping
783  */
784 struct buf *
785 trygetblk(dev_t dev, daddr_t blkno)
786 {
787 	struct buf	*bp;
788 	struct buf	*dp;
789 	struct hbuf	*hp;
790 	kmutex_t	*hmp;
791 	uint_t		index;
792 
793 	index = bio_bhash(dev, blkno);
794 	hp = &hbuf[index];
795 	hmp = &hp->b_lock;
796 
797 	if (!mutex_tryenter(hmp))
798 		return (NULL);
799 
800 	dp = (struct buf *)hp;
801 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
802 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
803 		    (bp->b_flags & B_STALE))
804 			continue;
805 		/*
806 		 * Get access to a valid buffer without sleeping
807 		 */
808 		if (sema_tryp(&bp->b_sem)) {
809 			if (bp->b_flags & B_DONE) {
810 				hp->b_length--;
811 				notavail(bp);
812 				mutex_exit(hmp);
813 				return (bp);
814 			} else {
815 				sema_v(&bp->b_sem);
816 				break;
817 			}
818 		}
819 		break;
820 	}
821 	mutex_exit(hmp);
822 	return (NULL);
823 }
824 
825 /*
826  * Wait for I/O completion on the buffer; return errors
827  * to the user.
828  */
829 int
830 iowait(struct buf *bp)
831 {
832 	ASSERT(SEMA_HELD(&bp->b_sem));
833 	return (biowait(bp));
834 }
835 
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 *
 * This is a thin wrapper: the work described above is carried out by
 * biodone(), whose return value is deliberately ignored.  The buffer's
 * b_sem must be held when this is called (asserted).
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}
846 
847 /*
848  * Zero the core associated with a buffer.
849  */
850 void
851 clrbuf(struct buf *bp)
852 {
853 	ASSERT(SEMA_HELD(&bp->b_sem));
854 	bzero(bp->b_un.b_addr, bp->b_bcount);
855 	bp->b_resid = 0;
856 }
857 
858 
/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 *
 * The work is done in two phases.  First, with blist_lock held, every
 * matching buffer on the delayed-write lists is chained onto a private
 * list through b_list.  Second, with no list locks held, each chained
 * buffer is locked, re-validated and written.  The writes are issued
 * with B_ASYNC set, so this routine does not itself wait for them.
 *
 * Only one flush runs at a time, and a flush does not start while any
 * invalidate (bfinval()) is in progress.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				/*
				 * A non-NULL b_list means the buffer is
				 * already chained; leave it alone.
				 */
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		/*
		 * The buffer may have been reassigned to another device
		 * while we waited for it.  When panicking, buffers that
		 * are marked busy are skipped as well.
		 */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			/* take it off the delayed-write list */
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			/* someone else already wrote it */
			sema_v(&bp->b_sem);
		}
		/* advance and unchain the buffer we just handled */
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	/* Done: let any waiting flush or invalidate proceed. */
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
952 
/*
 * Ensure that a specified block is up-to-date on disk.
 *
 * If a non-stale buffer for (dev, blkno) is in the cache and is still a
 * delayed write once it has been locked, it is removed from its list and
 * written.  Otherwise nothing is done.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI.
	 * The hash lock was dropped before sleeping on b_sem, so the
	 * buffer's identity may have changed in the meantime.
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		/* not (or no longer) a delayed write for this block */
		sema_v(&sbp->b_sem);
	}
}
1010 
/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which have not already been flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 *
 * dev   - device whose cached blocks are to be invalidated
 * force - if non-zero, delayed-write buffers are stripped of B_DELWRI
 *	   (their contents are never written) and invalidated too
 *
 * Returns 0, or EIO if at least one buffer for dev was still a delayed
 * write and therefore could not be invalidated.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/*
	 * Gather bp's: chain every buffer for dev found on the hash
	 * chains onto a private list through b_list.  A buffer whose
	 * b_list is already non-NULL is on some other chain and is skipped.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		/* re-check: the buffer may have been reassigned meanwhile */
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			/*
			 * Anything still marked delayed-write at this
			 * point cannot be invalidated; report it.
			 */
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		/* advance and unchain the buffer we just handled */
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	/* Done: let any waiting flush proceed. */
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}
1104 
1105 /*
1106  * If possible, invalidate blocks for a dev on demand
1107  */
1108 void
1109 binval(dev_t dev)
1110 {
1111 	(void) bfinval(dev, 0);
1112 }
1113 
1114 /*
1115  * Initialize the buffer I/O system by freeing
1116  * all buffers and setting all device hash buffer lists to empty.
1117  */
1118 void
1119 binit(void)
1120 {
1121 	struct buf *bp;
1122 	unsigned int i, pct;
1123 	ulong_t	bio_max_hwm, bio_default_hwm;
1124 
1125 	/*
1126 	 * Maximum/Default values for bufhwm are set to the smallest of:
1127 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1128 	 *	- 1/4 of kernel virtual memory
1129 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1130 	 * Additionally, in order to allow simple tuning by percentage of
1131 	 * physical memory, bufhwm_pct is used to calculate the default if
1132 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1133 	 *
1134 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1135 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1136 	 */
1137 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1138 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1139 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1140 
1141 	pct = BIO_BUF_PERCENT;
1142 	if (bufhwm_pct != 0 &&
1143 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1144 		pct = BIO_BUF_PERCENT;
1145 		/*
1146 		 * Invalid user specified value, emit a warning.
1147 		 */
1148 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1149 		    range(1..%d). Using %d as default.",
1150 		    bufhwm_pct,
1151 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1152 	}
1153 
1154 	bio_default_hwm = MIN(physmem / pct,
1155 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1156 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1157 
1158 	if ((v.v_bufhwm = bufhwm) == 0)
1159 		v.v_bufhwm = bio_default_hwm;
1160 
1161 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1162 		v.v_bufhwm = (int)bio_max_hwm;
1163 		/*
1164 		 * Invalid user specified value, emit a warning.
1165 		 */
1166 		cmn_err(CE_WARN,
1167 		    "binit: bufhwm(%d) out \
1168 		    of range(%d..%lu). Using %lu as default",
1169 		    bufhwm,
1170 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1171 	}
1172 
1173 	/*
1174 	 * Determine the number of hash buckets. Default is to
1175 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1176 	 * Round up number to the next power of 2.
1177 	 */
1178 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1179 	    BIO_HASHLEN);
1180 	v.v_hmask = v.v_hbuf - 1;
1181 	v.v_buf = BIO_BHDR_POOL;
1182 
1183 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1184 
1185 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1186 
1187 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1188 	bp = &bfreelist;
1189 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1190 
1191 	for (i = 0; i < v.v_hbuf; i++) {
1192 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1193 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1194 
1195 		/*
1196 		 * Initialize the delayed write buffer list.
1197 		 */
1198 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1199 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1200 	}
1201 }
1202 
1203 /*
1204  * Wait for I/O completion on the buffer; return error code.
1205  * If bp was for synchronous I/O, bp is invalid and associated
1206  * resources are freed on return.
1207  */
1208 int
1209 biowait(struct buf *bp)
1210 {
1211 	int error = 0;
1212 	struct cpu *cpup;
1213 
1214 	ASSERT(SEMA_HELD(&bp->b_sem));
1215 
1216 	cpup = CPU;
1217 	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1218 	DTRACE_IO1(wait__start, struct buf *, bp);
1219 
1220 	/*
1221 	 * In case of panic, busy wait for completion
1222 	 */
1223 	if (panicstr) {
1224 		while ((bp->b_flags & B_DONE) == 0)
1225 			drv_usecwait(10);
1226 	} else
1227 		sema_p(&bp->b_io);
1228 
1229 	DTRACE_IO1(wait__done, struct buf *, bp);
1230 	atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1231 
1232 	error = geterror(bp);
1233 	if ((bp->b_flags & B_ASYNC) == 0) {
1234 		if (bp->b_flags & B_REMAPPED)
1235 			bp_mapout(bp);
1236 	}
1237 	return (error);
1238 }
1239 
1240 /*
1241  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1242  * and wake up anyone waiting for it.
1243  */
1244 void
1245 biodone(struct buf *bp)
1246 {
1247 	if (bp->b_flags & B_STARTED) {
1248 		DTRACE_IO1(done, struct buf *, bp);
1249 		bp->b_flags &= ~B_STARTED;
1250 	}
1251 
1252 	if (bp->b_iodone != NULL) {
1253 		(*(bp->b_iodone))(bp);
1254 		return;
1255 	}
1256 	ASSERT((bp->b_flags & B_DONE) == 0);
1257 	ASSERT(SEMA_HELD(&bp->b_sem));
1258 	bp->b_flags |= B_DONE;
1259 	if (bp->b_flags & B_ASYNC) {
1260 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1261 			bio_pageio_done(bp);
1262 		else
1263 			brelse(bp);	/* release bp to freelist */
1264 	} else {
1265 		sema_v(&bp->b_io);
1266 	}
1267 }
1268 
1269 /*
1270  * Pick up the device's error number and pass it to the user;
1271  * if there is an error but the number is 0 set a generalized code.
1272  */
1273 int
1274 geterror(struct buf *bp)
1275 {
1276 	int error = 0;
1277 
1278 	ASSERT(SEMA_HELD(&bp->b_sem));
1279 	if (bp->b_flags & B_ERROR) {
1280 		error = bp->b_error;
1281 		if (!error)
1282 			error = EIO;
1283 	}
1284 	return (error);
1285 }
1286 
1287 /*
1288  * Support for pageio buffers.
1289  *
1290  * This stuff should be generalized to provide a generalized bp
1291  * header facility that can be used for things other than pageio.
1292  */
1293 
1294 /*
1295  * Allocate and initialize a buf struct for use with pageio.
1296  */
1297 struct buf *
1298 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1299 {
1300 	struct buf *bp;
1301 	struct cpu *cpup;
1302 
1303 	if (flags & B_READ) {
1304 		CPU_STATS_ENTER_K();
1305 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
1306 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1307 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1308 
1309 		atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1310 
1311 		if ((flags & B_ASYNC) == 0) {
1312 			klwp_t *lwp = ttolwp(curthread);
1313 			if (lwp != NULL)
1314 				lwp->lwp_ru.majflt++;
1315 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1316 		}
1317 		/*
1318 		 * Update statistics for pages being paged in
1319 		 */
1320 		if (pp != NULL && pp->p_vnode != NULL) {
1321 			if (IS_SWAPFSVP(pp->p_vnode)) {
1322 				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1323 				atomic_add_64(&curzone->zone_anonpgin,
1324 				    btopr(len));
1325 			} else {
1326 				if (pp->p_vnode->v_flag & VVMEXEC) {
1327 					CPU_STATS_ADDQ(cpup, vm, execpgin,
1328 					    btopr(len));
1329 					atomic_add_64(&curzone->zone_execpgin,
1330 					    btopr(len));
1331 				} else {
1332 					CPU_STATS_ADDQ(cpup, vm, fspgin,
1333 					    btopr(len));
1334 					atomic_add_64(&curzone->zone_fspgin,
1335 					    btopr(len));
1336 				}
1337 			}
1338 		}
1339 		CPU_STATS_EXIT_K();
1340 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1341 		    "page_ws_in:pp %p", pp);
1342 	}
1343 
1344 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1345 	bp->b_bcount = len;
1346 	bp->b_bufsize = len;
1347 	bp->b_pages = pp;
1348 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1349 	bp->b_offset = -1;
1350 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1351 
1352 	/* Initialize bp->b_sem in "locked" state */
1353 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1354 
1355 	VN_HOLD(vp);
1356 	bp->b_vp = vp;
1357 
1358 	/*
1359 	 * Caller sets dev & blkno and can adjust
1360 	 * b_addr for page offset and can use bp_mapin
1361 	 * to make pages kernel addressable.
1362 	 */
1363 	return (bp);
1364 }
1365 
1366 void
1367 pageio_done(struct buf *bp)
1368 {
1369 	ASSERT(SEMA_HELD(&bp->b_sem));
1370 	if (bp->b_flags & B_REMAPPED)
1371 		bp_mapout(bp);
1372 	VN_RELE(bp->b_vp);
1373 	bp->b_vp = NULL;
1374 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1375 
1376 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1377 	sema_destroy(&bp->b_sem);
1378 	sema_destroy(&bp->b_io);
1379 	kmem_free(bp, sizeof (struct buf));
1380 }
1381 
1382 /*
1383  * Check to see whether the buffers, except the one pointed by sbp,
1384  * associated with the device are busy.
1385  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1386  */
1387 int
1388 bcheck(dev_t dev, struct buf *sbp)
1389 {
1390 	struct buf	*bp;
1391 	struct buf	*dp;
1392 	int i;
1393 	kmutex_t *hmp;
1394 
1395 	/*
1396 	 * check for busy bufs for this filesystem
1397 	 */
1398 	for (i = 0; i < v.v_hbuf; i++) {
1399 		dp = (struct buf *)&hbuf[i];
1400 		hmp = &hbuf[i].b_lock;
1401 
1402 		mutex_enter(hmp);
1403 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1404 			/*
1405 			 * if buf is busy or dirty, then filesystem is busy
1406 			 */
1407 			if ((bp->b_edev == dev) &&
1408 			    ((bp->b_flags & B_STALE) == 0) &&
1409 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1410 			    (bp != sbp)) {
1411 				mutex_exit(hmp);
1412 				return (1);
1413 			}
1414 		}
1415 		mutex_exit(hmp);
1416 	}
1417 	return (0);
1418 }
1419 
/*
 * Hash two 32 bit entities.
 *
 * Each input contributes itself and its value shifted right by 8, 16
 * and 24 bits; every step multiplies the running hash by 7, adds the
 * next term and subtracts 1.
 *
 * The mixing is carried out in unsigned arithmetic, where wrap-around
 * is well defined; doing it in plain int would overflow (undefined
 * behaviour) for large inputs.  The result is converted back to int at
 * the end, giving the same bit pattern as wrapping two's-complement
 * arithmetic.  The right shifts are still applied to the signed
 * inputs so that negative values shift exactly as they did before.
 */
int
hash2ints(int x, int y)
{
	unsigned int hash;

	hash = (unsigned int)x - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 8)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 16)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 24)) - 1U;
	hash = ((hash * 7U) + (unsigned int)y) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 8)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 16)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 24)) - 1U;

	return ((int)hash);
}
1439 
1440 
1441 /*
1442  * Return a new buffer struct.
1443  *	Create a new buffer if we haven't gone over our high water
1444  *	mark for memory, otherwise try to get one off the freelist.
1445  *
1446  * Returns a locked buf that has no id and is not on any hash or free
1447  * list.
1448  */
1449 static struct buf *
1450 bio_getfreeblk(long bsize)
1451 {
1452 	struct buf *bp, *dp;
1453 	struct hbuf *hp;
1454 	kmutex_t	*hmp;
1455 	uint_t		start, end;
1456 
1457 	/*
1458 	 * mutex_enter(&bfree_lock);
1459 	 * bfreelist.b_bufsize represents the amount of memory
1460 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
1461 	 * we are allowed to allocate in the cache before we hit our hwm.
1462 	 */
1463 	bio_mem_get(bsize);	/* Account for our memory request */
1464 
1465 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
1466 	sema_p(&bp->b_sem);	/* Should never fail */
1467 
1468 	ASSERT(bp->b_un.b_addr == NULL);
1469 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1470 	if (bp->b_un.b_addr != NULL) {
1471 		/*
1472 		 * Make the common path short
1473 		 */
1474 		bp->b_bufsize = bsize;
1475 		ASSERT(SEMA_HELD(&bp->b_sem));
1476 		return (bp);
1477 	} else {
1478 		struct buf *save;
1479 
1480 		save = bp;	/* Save bp we allocated */
1481 		start = end = lastindex;
1482 
1483 		biostats.bio_bufwant.value.ui32++;
1484 
1485 		/*
1486 		 * Memory isn't available from the system now. Scan
1487 		 * the hash buckets till enough space is found.
1488 		 */
1489 		do {
1490 			hp = &hbuf[start];
1491 			hmp = &hp->b_lock;
1492 			dp = (struct buf *)hp;
1493 
1494 			mutex_enter(hmp);
1495 			bp = dp->av_forw;
1496 
1497 			while (bp != dp) {
1498 
1499 				ASSERT(bp != NULL);
1500 
1501 				if (!sema_tryp(&bp->b_sem)) {
1502 					bp = bp->av_forw;
1503 					continue;
1504 				}
1505 
1506 				/*
1507 				 * Since we are going down the freelist
1508 				 * associated with this hash bucket the
1509 				 * B_DELWRI flag should not be set.
1510 				 */
1511 				ASSERT(!(bp->b_flags & B_DELWRI));
1512 
1513 				if (bp->b_bufsize == bsize) {
1514 					hp->b_length--;
1515 					notavail(bp);
1516 					bremhash(bp);
1517 					mutex_exit(hmp);
1518 
1519 					/*
1520 					 * Didn't kmem_alloc any more, so don't
1521 					 * count it twice.
1522 					 */
1523 					mutex_enter(&bfree_lock);
1524 					bfreelist.b_bufsize += bsize;
1525 					mutex_exit(&bfree_lock);
1526 
1527 					/*
1528 					 * Update the lastindex value.
1529 					 */
1530 					lastindex = start;
1531 
1532 					/*
1533 					 * Put our saved bp back on the list
1534 					 */
1535 					sema_v(&save->b_sem);
1536 					bio_bhdr_free(save);
1537 					ASSERT(SEMA_HELD(&bp->b_sem));
1538 					return (bp);
1539 				}
1540 				sema_v(&bp->b_sem);
1541 				bp = bp->av_forw;
1542 			}
1543 			mutex_exit(hmp);
1544 			start = ((start + 1) % v.v_hbuf);
1545 		} while (start != end);
1546 
1547 		biostats.bio_bufwait.value.ui32++;
1548 		bp = save;		/* Use original bp */
1549 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1550 	}
1551 
1552 	bp->b_bufsize = bsize;
1553 	ASSERT(SEMA_HELD(&bp->b_sem));
1554 	return (bp);
1555 }
1556 
1557 /*
1558  * Allocate a buffer header. If none currently available, allocate
1559  * a new pool.
1560  */
1561 static struct buf *
1562 bio_bhdr_alloc(void)
1563 {
1564 	struct buf *dp, *sdp;
1565 	struct buf *bp;
1566 	int i;
1567 
1568 	for (;;) {
1569 		mutex_enter(&bhdr_lock);
1570 		if (bhdrlist != NULL) {
1571 			bp = bhdrlist;
1572 			bhdrlist = bp->av_forw;
1573 			mutex_exit(&bhdr_lock);
1574 			bp->av_forw = NULL;
1575 			return (bp);
1576 		}
1577 		mutex_exit(&bhdr_lock);
1578 
1579 		/*
1580 		 * Need to allocate a new pool. If the system is currently
1581 		 * out of memory, then try freeing things on the freelist.
1582 		 */
1583 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1584 		if (dp == NULL) {
1585 			/*
1586 			 * System can't give us a pool of headers, try
1587 			 * recycling from the free lists.
1588 			 */
1589 			bio_recycle(BIO_HEADER, 0);
1590 		} else {
1591 			sdp = dp;
1592 			for (i = 0; i < v.v_buf; i++, dp++) {
1593 				/*
1594 				 * The next two lines are needed since NODEV
1595 				 * is -1 and not NULL
1596 				 */
1597 				dp->b_dev = (o_dev_t)NODEV;
1598 				dp->b_edev = NODEV;
1599 				dp->av_forw = dp + 1;
1600 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1601 				    NULL);
1602 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1603 				    NULL);
1604 				dp->b_offset = -1;
1605 			}
1606 			mutex_enter(&bhdr_lock);
1607 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
1608 			bhdrlist = sdp;
1609 			nbuf += v.v_buf;
1610 			bp = bhdrlist;
1611 			bhdrlist = bp->av_forw;
1612 			mutex_exit(&bhdr_lock);
1613 
1614 			bp->av_forw = NULL;
1615 			return (bp);
1616 		}
1617 	}
1618 }
1619 
1620 static  void
1621 bio_bhdr_free(struct buf *bp)
1622 {
1623 	ASSERT(bp->b_back == NULL);
1624 	ASSERT(bp->b_forw == NULL);
1625 	ASSERT(bp->av_back == NULL);
1626 	ASSERT(bp->av_forw == NULL);
1627 	ASSERT(bp->b_un.b_addr == NULL);
1628 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
1629 	ASSERT(bp->b_edev == NODEV);
1630 	ASSERT(bp->b_flags == 0);
1631 
1632 	mutex_enter(&bhdr_lock);
1633 	bp->av_forw = bhdrlist;
1634 	bhdrlist = bp;
1635 	mutex_exit(&bhdr_lock);
1636 }
1637 
1638 /*
1639  * If we haven't gone over the high water mark, it's o.k. to
1640  * allocate more buffer space, otherwise recycle buffers
1641  * from the freelist until enough memory is free for a bsize request.
1642  *
1643  * We account for this memory, even though
1644  * we don't allocate it here.
1645  */
1646 static void
1647 bio_mem_get(long bsize)
1648 {
1649 	mutex_enter(&bfree_lock);
1650 	if (bfreelist.b_bufsize > bsize) {
1651 		bfreelist.b_bufsize -= bsize;
1652 		mutex_exit(&bfree_lock);
1653 		return;
1654 	}
1655 	mutex_exit(&bfree_lock);
1656 	bio_recycle(BIO_MEM, bsize);
1657 }
1658 
1659 /*
1660  * flush a list of delayed write buffers.
1661  * (currently used only by bio_recycle below.)
1662  */
1663 static void
1664 bio_flushlist(struct buf *delwri_list)
1665 {
1666 	struct buf *bp;
1667 
1668 	while (delwri_list != EMPTY_LIST) {
1669 		bp = delwri_list;
1670 		bp->b_flags |= B_AGE | B_ASYNC;
1671 		if (bp->b_vp == NULL) {		/* !ufs */
1672 			BWRITE(bp);
1673 		} else {			/* ufs */
1674 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1675 		}
1676 		delwri_list = bp->b_list;
1677 		bp->b_list = NULL;
1678 	}
1679 }
1680 
1681 /*
1682  * Start recycling buffers on the freelist for one of 2 reasons:
1683  *	- we need a buffer header
1684  *	- we need to free up memory
1685  * Once started we continue to recycle buffers until the B_AGE
1686  * buffers are gone.
1687  */
1688 static void
1689 bio_recycle(int want, long bsize)
1690 {
1691 	struct buf *bp, *dp, *dwp, *nbp;
1692 	struct hbuf *hp;
1693 	int	found = 0;
1694 	kmutex_t	*hmp;
1695 	int		start, end;
1696 	struct buf *delwri_list = EMPTY_LIST;
1697 
1698 	/*
1699 	 * Recycle buffers.
1700 	 */
1701 top:
1702 	start = end = lastindex;
1703 	do {
1704 		hp = &hbuf[start];
1705 		hmp = &hp->b_lock;
1706 		dp = (struct buf *)hp;
1707 
1708 		mutex_enter(hmp);
1709 		bp = dp->av_forw;
1710 
1711 		while (bp != dp) {
1712 
1713 			ASSERT(bp != NULL);
1714 
1715 			if (!sema_tryp(&bp->b_sem)) {
1716 				bp = bp->av_forw;
1717 				continue;
1718 			}
1719 			/*
1720 			 * Do we really want to nuke all of the B_AGE stuff??
1721 			 */
1722 			if ((bp->b_flags & B_AGE) == 0 && found) {
1723 				sema_v(&bp->b_sem);
1724 				mutex_exit(hmp);
1725 				lastindex = start;
1726 				return;	/* All done */
1727 			}
1728 
1729 			ASSERT(MUTEX_HELD(&hp->b_lock));
1730 			ASSERT(!(bp->b_flags & B_DELWRI));
1731 			hp->b_length--;
1732 			notavail(bp);
1733 
1734 			/*
1735 			 * Remove bhdr from cache, free up memory,
1736 			 * and add the hdr to the freelist.
1737 			 */
1738 			bremhash(bp);
1739 			mutex_exit(hmp);
1740 
1741 			if (bp->b_bufsize) {
1742 				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1743 				bp->b_un.b_addr = NULL;
1744 				mutex_enter(&bfree_lock);
1745 				bfreelist.b_bufsize += bp->b_bufsize;
1746 				mutex_exit(&bfree_lock);
1747 			}
1748 
1749 			bp->b_dev = (o_dev_t)NODEV;
1750 			bp->b_edev = NODEV;
1751 			bp->b_flags = 0;
1752 			sema_v(&bp->b_sem);
1753 			bio_bhdr_free(bp);
1754 			if (want == BIO_HEADER) {
1755 				found = 1;
1756 			} else {
1757 				ASSERT(want == BIO_MEM);
1758 				if (!found && bfreelist.b_bufsize >= bsize) {
1759 					/* Account for the memory we want */
1760 					mutex_enter(&bfree_lock);
1761 					if (bfreelist.b_bufsize >= bsize) {
1762 						bfreelist.b_bufsize -= bsize;
1763 						found = 1;
1764 					}
1765 					mutex_exit(&bfree_lock);
1766 				}
1767 			}
1768 
1769 			/*
1770 			 * Since we dropped hmp start from the
1771 			 * begining.
1772 			 */
1773 			mutex_enter(hmp);
1774 			bp = dp->av_forw;
1775 		}
1776 		mutex_exit(hmp);
1777 
1778 		/*
1779 		 * Look at the delayed write list.
1780 		 * First gather into a private list, then write them.
1781 		 */
1782 		dwp = (struct buf *)&dwbuf[start];
1783 		mutex_enter(&blist_lock);
1784 		bio_doingflush++;
1785 		mutex_enter(hmp);
1786 		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1787 
1788 			ASSERT(bp != NULL);
1789 			nbp = bp->av_forw;
1790 
1791 			if (!sema_tryp(&bp->b_sem))
1792 				continue;
1793 			ASSERT(bp->b_flags & B_DELWRI);
1794 			/*
1795 			 * Do we really want to nuke all of the B_AGE stuff??
1796 			 */
1797 
1798 			if ((bp->b_flags & B_AGE) == 0 && found) {
1799 				sema_v(&bp->b_sem);
1800 				mutex_exit(hmp);
1801 				lastindex = start;
1802 				mutex_exit(&blist_lock);
1803 				bio_flushlist(delwri_list);
1804 				mutex_enter(&blist_lock);
1805 				bio_doingflush--;
1806 				if (bio_flinv_cv_wanted) {
1807 					bio_flinv_cv_wanted = 0;
1808 					cv_broadcast(&bio_flushinval_cv);
1809 				}
1810 				mutex_exit(&blist_lock);
1811 				return; /* All done */
1812 			}
1813 
1814 			/*
1815 			 * If the buffer is already on a flush or
1816 			 * invalidate list then just skip it.
1817 			 */
1818 			if (bp->b_list != NULL) {
1819 				sema_v(&bp->b_sem);
1820 				continue;
1821 			}
1822 			/*
1823 			 * We are still on the same bucket.
1824 			 */
1825 			hp->b_length--;
1826 			notavail(bp);
1827 			bp->b_list = delwri_list;
1828 			delwri_list = bp;
1829 		}
1830 		mutex_exit(hmp);
1831 		mutex_exit(&blist_lock);
1832 		bio_flushlist(delwri_list);
1833 		delwri_list = EMPTY_LIST;
1834 		mutex_enter(&blist_lock);
1835 		bio_doingflush--;
1836 		if (bio_flinv_cv_wanted) {
1837 			bio_flinv_cv_wanted = 0;
1838 			cv_broadcast(&bio_flushinval_cv);
1839 		}
1840 		mutex_exit(&blist_lock);
1841 		start = (start + 1) % v.v_hbuf;
1842 
1843 	} while (start != end);
1844 
1845 	if (found)
1846 		return;
1847 
1848 	/*
1849 	 * Free lists exhausted and we haven't satisfied the request.
1850 	 * Wait here for more entries to be added to freelist.
1851 	 * Because this might have just happened, make it timed.
1852 	 */
1853 	mutex_enter(&bfree_lock);
1854 	bfreelist.b_flags |= B_WANTED;
1855 	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1856 	mutex_exit(&bfree_lock);
1857 	goto top;
1858 }
1859 
1860 /*
1861  * See if the block is associated with some buffer
1862  * (mainly to avoid getting hung up on a wait in breada).
1863  */
1864 static int
1865 bio_incore(dev_t dev, daddr_t blkno)
1866 {
1867 	struct buf *bp;
1868 	struct buf *dp;
1869 	uint_t index;
1870 	kmutex_t *hmp;
1871 
1872 	index = bio_bhash(dev, blkno);
1873 	dp = (struct buf *)&hbuf[index];
1874 	hmp = &hbuf[index].b_lock;
1875 
1876 	mutex_enter(hmp);
1877 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1878 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1879 		    (bp->b_flags & B_STALE) == 0) {
1880 			mutex_exit(hmp);
1881 			return (1);
1882 		}
1883 	}
1884 	mutex_exit(hmp);
1885 	return (0);
1886 }
1887 
1888 static void
1889 bio_pageio_done(struct buf *bp)
1890 {
1891 	if (bp->b_flags & B_PAGEIO) {
1892 
1893 		if (bp->b_flags & B_REMAPPED)
1894 			bp_mapout(bp);
1895 
1896 		if (bp->b_flags & B_READ)
1897 			pvn_read_done(bp->b_pages, bp->b_flags);
1898 		else
1899 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1900 		pageio_done(bp);
1901 	} else {
1902 		ASSERT(bp->b_flags & B_REMAPPED);
1903 		bp_mapout(bp);
1904 		brelse(bp);
1905 	}
1906 }
1907 
1908 /*
1909  * bioerror(9F) - indicate error in buffer header
1910  * If 'error' is zero, remove the error indication.
1911  */
1912 void
1913 bioerror(struct buf *bp, int error)
1914 {
1915 	ASSERT(bp != NULL);
1916 	ASSERT(error >= 0);
1917 	ASSERT(SEMA_HELD(&bp->b_sem));
1918 
1919 	if (error != 0) {
1920 		bp->b_flags |= B_ERROR;
1921 	} else {
1922 		bp->b_flags &= ~B_ERROR;
1923 	}
1924 	bp->b_error = error;
1925 }
1926 
1927 /*
1928  * bioreset(9F) - reuse a private buffer header after I/O is complete
1929  */
1930 void
1931 bioreset(struct buf *bp)
1932 {
1933 	ASSERT(bp != NULL);
1934 
1935 	biofini(bp);
1936 	bioinit(bp);
1937 }
1938 
1939 /*
1940  * biosize(9F) - return size of a buffer header
1941  */
1942 size_t
1943 biosize(void)
1944 {
1945 	return (sizeof (struct buf));
1946 }
1947 
1948 /*
1949  * biomodified(9F) - check if buffer is modified
1950  */
1951 int
1952 biomodified(struct buf *bp)
1953 {
1954 	int npf;
1955 	int ppattr;
1956 	struct page *pp;
1957 
1958 	ASSERT(bp != NULL);
1959 
1960 	if ((bp->b_flags & B_PAGEIO) == 0) {
1961 		return (-1);
1962 	}
1963 	pp = bp->b_pages;
1964 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1965 
1966 	while (npf > 0) {
1967 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1968 		    HAT_SYNC_STOPON_MOD);
1969 		if (ppattr & P_MOD)
1970 			return (1);
1971 		pp = pp->p_next;
1972 		npf--;
1973 	}
1974 
1975 	return (0);
1976 }
1977 
1978 /*
1979  * bioinit(9F) - initialize a buffer structure
1980  */
1981 void
1982 bioinit(struct buf *bp)
1983 {
1984 	bzero(bp, sizeof (struct buf));
1985 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1986 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1987 	bp->b_offset = -1;
1988 }
1989 
1990 /*
1991  * biofini(9F) - uninitialize a buffer structure
1992  */
1993 void
1994 biofini(struct buf *bp)
1995 {
1996 	sema_destroy(&bp->b_io);
1997 	sema_destroy(&bp->b_sem);
1998 }
1999 
2000 /*
2001  * bioclone(9F) - clone a buffer
2002  */
2003 struct buf *
2004 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2005     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2006 {
2007 	struct buf *bufp;
2008 
2009 	ASSERT(bp);
2010 	if (bp_mem == NULL) {
2011 		bufp = kmem_alloc(sizeof (struct buf), sleep);
2012 		if (bufp == NULL) {
2013 			return (NULL);
2014 		}
2015 		bioinit(bufp);
2016 	} else {
2017 		bufp = bp_mem;
2018 		bioreset(bufp);
2019 	}
2020 
2021 #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2022 	B_ABRWRITE)
2023 
2024 	/*
2025 	 * The cloned buffer does not inherit the B_REMAPPED flag.
2026 	 */
2027 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
2028 	bufp->b_bcount = len;
2029 	bufp->b_blkno = blkno;
2030 	bufp->b_iodone = iodone;
2031 	bufp->b_proc = bp->b_proc;
2032 	bufp->b_edev = dev;
2033 	bufp->b_file = bp->b_file;
2034 	bufp->b_offset = bp->b_offset;
2035 
2036 	if (bp->b_flags & B_SHADOW) {
2037 		ASSERT(bp->b_shadow);
2038 		ASSERT(bp->b_flags & B_PHYS);
2039 
2040 		bufp->b_shadow = bp->b_shadow +
2041 		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2042 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2043 		if (bp->b_flags & B_REMAPPED)
2044 			bufp->b_proc = NULL;
2045 	} else {
2046 		if (bp->b_flags & B_PAGEIO) {
2047 			struct page *pp;
2048 			off_t o;
2049 			int i;
2050 
2051 			pp = bp->b_pages;
2052 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2053 			for (i = btop(o); i > 0; i--) {
2054 				pp = pp->p_next;
2055 			}
2056 			bufp->b_pages = pp;
2057 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2058 		} else {
2059 			bufp->b_un.b_addr =
2060 			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2061 			if (bp->b_flags & B_REMAPPED)
2062 				bufp->b_proc = NULL;
2063 		}
2064 	}
2065 	return (bufp);
2066 }
2067