xref: /titanic_52/usr/src/uts/common/os/bio.c (revision 342440ec94087b8c751c580ab9ed6c693d31d418)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
40 
41 #include <sys/types.h>
42 #include <sys/t_lock.h>
43 #include <sys/sysmacros.h>
44 #include <sys/conf.h>
45 #include <sys/cpuvar.h>
46 #include <sys/errno.h>
47 #include <sys/debug.h>
48 #include <sys/buf.h>
49 #include <sys/var.h>
50 #include <sys/vnode.h>
51 #include <sys/bitmap.h>
52 #include <sys/cmn_err.h>
53 #include <sys/kmem.h>
54 #include <sys/vmem.h>
55 #include <sys/atomic.h>
56 #include <vm/seg_kmem.h>
57 #include <vm/page.h>
58 #include <vm/pvn.h>
59 #include <sys/vtrace.h>
60 #include <sys/tnf_probe.h>
61 #include <sys/fs/ufs_inode.h>
62 #include <sys/fs/ufs_bio.h>
63 #include <sys/fs/ufs_log.h>
64 #include <sys/systm.h>
65 #include <sys/vfs.h>
66 #include <sys/sdt.h>
67 
68 /* Locks */
69 static	kmutex_t	blist_lock;	/* protects b_list */
70 static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
71 static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */
72 
73 struct hbuf	*hbuf;			/* Hash buckets */
74 struct dwbuf	*dwbuf;			/* Delayed write buckets */
75 static struct buf *bhdrlist;		/* buf header free list */
76 static int 	nbuf;			/* number of buffer headers allocated */
77 
78 static int	lastindex;		/* Reference point on where to start */
79 					/* when looking for free buffers */
80 
81 #define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
82 #define	EMPTY_LIST	((struct buf *)-1)
83 
84 static kcondvar_t	bio_mem_cv; 	/* Condition variables */
85 static kcondvar_t	bio_flushinval_cv;
86 static int	bio_doingflush;		/* flush in progress */
87 static int	bio_doinginval;		/* inval in progress */
88 static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
89 
90 /*
91  * Statistics on the buffer cache
92  */
93 struct biostats biostats = {
94 	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
95 	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
96 	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
97 	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
98 	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
99 	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
100 };
101 
102 /*
103  * kstat data
104  */
105 kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
106 uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
107 					sizeof (kstat_named_t));
108 
109 /*
110  * Statistics on ufs buffer cache
111  * Not protected by locks
112  */
113 struct ufsbiostats ub = {
114 	{ "breads",			KSTAT_DATA_UINT32 },
115 	{ "bwrites",			KSTAT_DATA_UINT32 },
116 	{ "fbiwrites",			KSTAT_DATA_UINT32 },
117 	{ "getpages",			KSTAT_DATA_UINT32 },
118 	{ "getras",			KSTAT_DATA_UINT32 },
119 	{ "putsyncs",			KSTAT_DATA_UINT32 },
120 	{ "putasyncs",			KSTAT_DATA_UINT32 },
121 	{ "putpageios",			KSTAT_DATA_UINT32 },
122 };
123 
124 /*
125  * more UFS Logging eccentricities...
126  *
127  * required since "#pragma weak ..." doesn't work in reverse order.
128  * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
129  *        to ufs routines don't get plugged into bio.c calls so
130  *        we initialize it when setting up the "lufsops" table
131  *        in "lufs.c:_init()"
132  */
133 void (*bio_lufs_strategy)(void *, buf_t *);
134 void (*bio_snapshot_strategy)(void *, buf_t *);
135 
136 
/*
 * Forward declarations of routines private to this file.
 */
static struct buf	*bio_bhdr_alloc(void);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);
145 
/*
 * Buffer cache constants.
 *
 * The two *_PERCENT values are divisors applied to physmem (see binit()),
 * not percentages themselves: 100/2 == 50 yields 2%, 100/20 == 5 yields 20%.
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* default bhdr pool size */
#define	BIO_MIN_HDR	10		/* minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* target length of hash chains */


/*
 * Flag bits accepted by bio_recycle().
 */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02
160 
/* User tunables defined elsewhere. */
extern int	bufhwm;		/* high water mark for mem */
extern int	bufhwm_pct;	/* ditto - given in % of physmem */
163 
164 /*
165  * The following routines allocate and free
166  * buffers with various side effects.  In general the
167  * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
170  * binary semaphore so that no one else can touch it. If the block was
171  * already in core, no I/O need be done; if it is
172  * already locked, the process waits until it becomes free.
173  * The following routines allocate a buffer:
174  *	getblk
175  *	bread/BREAD
176  *	breada
177  * Eventually the buffer must be released, possibly with the
178  * side effect of writing it out, by using one of
179  *	bwrite/BWRITE/brwrite
180  *	bdwrite/bdrwrite
181  *	bawrite
182  *	brelse
183  *
184  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
185  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
186  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
187  * B_DONE is still used to denote a buffer with I/O complete on it.
188  *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
192  */
193 
194 /*
195  * Read in (if necessary) the block and return a buffer pointer.
196  *
197  * This interface is provided for binary compatibility.  Using
198  * BREAD() directly avoids the extra function call overhead invoked
199  * by calling this routine.
200  */
201 struct buf *
202 bread(dev_t dev, daddr_t blkno, long bsize)
203 {
204 	return (BREAD(dev, blkno, bsize));
205 }
206 
207 /*
208  * Common code for reading a buffer with various options
209  *
210  * Read in (if necessary) the block and return a buffer pointer.
211  */
212 struct buf *
213 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
214 {
215 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
216 	struct buf *bp;
217 	klwp_t *lwp = ttolwp(curthread);
218 
219 	CPU_STATS_ADD_K(sys, lread, 1);
220 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
221 	if (bp->b_flags & B_DONE)
222 		return (bp);
223 	bp->b_flags |= B_READ;
224 	ASSERT(bp->b_bcount == bsize);
225 	if (ufsvfsp == NULL) {					/* !ufs */
226 		(void) bdev_strategy(bp);
227 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
228 							/* ufs && logging */
229 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
230 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
231 							/* ufs && snapshots */
232 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
233 	} else {
234 		ufsvfsp->vfs_iotstamp = lbolt;
235 		ub.ub_breads.value.ul++;		/* ufs && !logging */
236 		(void) bdev_strategy(bp);
237 	}
238 	if (lwp != NULL)
239 		lwp->lwp_ru.inblock++;
240 	CPU_STATS_ADD_K(sys, bread, 1);
241 	(void) biowait(bp);
242 	return (bp);
243 }
244 
245 /*
246  * Read in the block, like bread, but also start I/O on the
247  * read-ahead block (which is not allocated to the caller).
248  */
249 struct buf *
250 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
251 {
252 	struct buf *bp, *rabp;
253 	klwp_t *lwp = ttolwp(curthread);
254 
255 	bp = NULL;
256 	if (!bio_incore(dev, blkno)) {
257 		CPU_STATS_ADD_K(sys, lread, 1);
258 		bp = GETBLK(dev, blkno, bsize);
259 		if ((bp->b_flags & B_DONE) == 0) {
260 			bp->b_flags |= B_READ;
261 			bp->b_bcount = bsize;
262 			(void) bdev_strategy(bp);
263 			if (lwp != NULL)
264 				lwp->lwp_ru.inblock++;
265 			CPU_STATS_ADD_K(sys, bread, 1);
266 		}
267 	}
268 	if (rablkno && bfreelist.b_bcount > 1 &&
269 	    !bio_incore(dev, rablkno)) {
270 		rabp = GETBLK(dev, rablkno, bsize);
271 		if (rabp->b_flags & B_DONE)
272 			brelse(rabp);
273 		else {
274 			rabp->b_flags |= B_READ|B_ASYNC;
275 			rabp->b_bcount = bsize;
276 			(void) bdev_strategy(rabp);
277 			if (lwp != NULL)
278 				lwp->lwp_ru.inblock++;
279 			CPU_STATS_ADD_K(sys, bread, 1);
280 		}
281 	}
282 	if (bp == NULL)
283 		return (BREAD(dev, blkno, bsize));
284 	(void) biowait(bp);
285 	return (bp);
286 }
287 
288 /*
289  * Common code for writing a buffer with various options.
290  *
291  * force_wait  - wait for write completion regardless of B_ASYNC flag
292  * do_relse    - release the buffer when we are done
293  * clear_flags - flags to clear from the buffer
294  */
295 void
296 bwrite_common(void *arg, struct buf *bp, int force_wait,
297 				int do_relse, int clear_flags)
298 {
299 	register int do_wait;
300 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
301 	int flag;
302 	klwp_t *lwp = ttolwp(curthread);
303 	struct cpu *cpup;
304 
305 	ASSERT(SEMA_HELD(&bp->b_sem));
306 	flag = bp->b_flags;
307 	bp->b_flags &= ~clear_flags;
308 	if (lwp != NULL)
309 		lwp->lwp_ru.oublock++;
310 	CPU_STATS_ENTER_K();
311 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
312 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
313 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
314 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
315 	if (do_wait == 0)
316 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
317 	CPU_STATS_EXIT_K();
318 	if (ufsvfsp == NULL) {
319 		(void) bdev_strategy(bp);
320 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
321 							/* ufs && logging */
322 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
323 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
324 							/* ufs && snapshots */
325 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
326 	} else {
327 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
328 		(void) bdev_strategy(bp);
329 	}
330 	if (do_wait) {
331 		(void) biowait(bp);
332 		if (do_relse) {
333 			brelse(bp);
334 		}
335 	}
336 }
337 
/*
 * Binary-compatibility entry point for BWRITE(): write the buffer,
 * waiting for completion unless B_ASYNC is set, then release it.
 *
 * Code that can use BWRITE() directly avoids the extra function call
 * made by going through this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
350 
/*
 * Binary-compatibility entry point for BWRITE2(): write the buffer and
 * wait for completion, but do not release the buffer afterwards.
 *
 * Code that can use BWRITE2() directly avoids the extra function call.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
362 
363 /*
364  * Release the buffer, marking it so that if it is grabbed
365  * for another purpose it will be written out before being
366  * given up (e.g. when writing a partial block where it is
367  * assumed that another write for the same block will soon follow).
368  * Also save the time that the block is first marked as delayed
369  * so that it will be written in a reasonable time.
370  */
371 void
372 bdwrite(struct buf *bp)
373 {
374 	ASSERT(SEMA_HELD(&bp->b_sem));
375 	CPU_STATS_ADD_K(sys, lwrite, 1);
376 	if ((bp->b_flags & B_DELWRI) == 0)
377 		bp->b_start = lbolt;
378 	/*
379 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
380 	 * buffer to be written before being reused, and setting b_resid
381 	 * to zero says the buffer is complete.
382 	 */
383 	bp->b_flags |= B_DELWRI | B_DONE;
384 	bp->b_resid = 0;
385 	brelse(bp);
386 }
387 
388 /*
389  * Release the buffer, start I/O on it, but don't wait for completion.
390  */
391 void
392 bawrite(struct buf *bp)
393 {
394 	ASSERT(SEMA_HELD(&bp->b_sem));
395 
396 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
397 	if (bfreelist.b_bcount > 4)
398 		bp->b_flags |= B_ASYNC;
399 	BWRITE(bp);
400 }
401 
402 /*
403  * Release the buffer, with no I/O implied.
404  */
405 void
406 brelse(struct buf *bp)
407 {
408 	struct buf	**backp;
409 	uint_t		index;
410 	kmutex_t	*hmp;
411 	struct	buf	*dp;
412 	struct	hbuf	*hp;
413 
414 
415 	ASSERT(SEMA_HELD(&bp->b_sem));
416 
417 	/*
418 	 * Clear the retry write flag if the buffer was written without
419 	 * error.  The presence of B_DELWRI means the buffer has not yet
420 	 * been written and the presence of B_ERROR means that an error
421 	 * is still occurring.
422 	 */
423 	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
424 		bp->b_flags &= ~B_RETRYWRI;
425 	}
426 
427 	/* Check for anomalous conditions */
428 	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
429 		if (bp->b_flags & B_NOCACHE) {
430 			/* Don't add to the freelist. Destroy it now */
431 			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
432 			sema_destroy(&bp->b_sem);
433 			sema_destroy(&bp->b_io);
434 			kmem_free(bp, sizeof (struct buf));
435 			return;
436 		}
437 		/*
438 		 * If a write failed and we are supposed to retry write,
439 		 * don't toss the buffer.  Keep it around and mark it
440 		 * delayed write in the hopes that it will eventually
441 		 * get flushed (and still keep the system running.)
442 		 */
443 		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
444 			bp->b_flags |= B_DELWRI;
445 			/* keep fsflush from trying continuously to flush */
446 			bp->b_start = lbolt;
447 		} else
448 			bp->b_flags |= B_AGE|B_STALE;
449 		bp->b_flags &= ~B_ERROR;
450 		bp->b_error = 0;
451 	}
452 
453 	/*
454 	 * If delayed write is set then put in on the delayed
455 	 * write list instead of the free buffer list.
456 	 */
457 	index = bio_bhash(bp->b_edev, bp->b_blkno);
458 	hmp   = &hbuf[index].b_lock;
459 
460 	mutex_enter(hmp);
461 	hp = &hbuf[index];
462 	dp = (struct buf *)hp;
463 
464 	/*
465 	 * Make sure that the number of entries on this list are
466 	 * Zero <= count <= total # buffers
467 	 */
468 	ASSERT(hp->b_length >= 0);
469 	ASSERT(hp->b_length < nbuf);
470 
471 	hp->b_length++;		/* We are adding this buffer */
472 
473 	if (bp->b_flags & B_DELWRI) {
474 		/*
475 		 * This buffer goes on the delayed write buffer list
476 		 */
477 		dp = (struct buf *)&dwbuf[index];
478 	}
479 	ASSERT(bp->b_bufsize > 0);
480 	ASSERT(bp->b_bcount > 0);
481 	ASSERT(bp->b_un.b_addr != NULL);
482 
483 	if (bp->b_flags & B_AGE) {
484 		backp = &dp->av_forw;
485 		(*backp)->av_back = bp;
486 		bp->av_forw = *backp;
487 		*backp = bp;
488 		bp->av_back = dp;
489 	} else {
490 		backp = &dp->av_back;
491 		(*backp)->av_forw = bp;
492 		bp->av_back = *backp;
493 		*backp = bp;
494 		bp->av_forw = dp;
495 	}
496 	mutex_exit(hmp);
497 
498 	if (bfreelist.b_flags & B_WANTED) {
499 		/*
500 		 * Should come here very very rarely.
501 		 */
502 		mutex_enter(&bfree_lock);
503 		if (bfreelist.b_flags & B_WANTED) {
504 			bfreelist.b_flags &= ~B_WANTED;
505 			cv_broadcast(&bio_mem_cv);
506 		}
507 		mutex_exit(&bfree_lock);
508 	}
509 
510 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
511 	/*
512 	 * Don't let anyone get the buffer off the freelist before we
513 	 * release our hold on it.
514 	 */
515 	sema_v(&bp->b_sem);
516 }
517 
518 /*
519  * Return a count of the number of B_BUSY buffers in the system
520  * Can only be used as a good estimate.  If 'cleanit' is set,
521  * try to flush all bufs.
522  */
523 int
524 bio_busy(int cleanit)
525 {
526 	struct buf *bp, *dp;
527 	int busy = 0;
528 	int i;
529 	kmutex_t *hmp;
530 
531 	for (i = 0; i < v.v_hbuf; i++) {
532 		vfs_syncprogress();
533 		dp = (struct buf *)&hbuf[i];
534 		hmp = &hbuf[i].b_lock;
535 
536 		mutex_enter(hmp);
537 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
538 			if (bp->b_flags & B_BUSY)
539 				busy++;
540 		}
541 		mutex_exit(hmp);
542 	}
543 
544 	if (cleanit && busy != 0) {
545 		bflush(NODEV);
546 	}
547 
548 	return (busy);
549 }
550 
551 /*
552  * this interface is provided for binary compatibility.
553  *
554  * Assign a buffer for the given block.  If the appropriate
555  * block is already associated, return it; otherwise search
556  * for the oldest non-busy buffer and reassign it.
557  */
558 struct buf *
559 getblk(dev_t dev, daddr_t blkno, long bsize)
560 {
561 	return (getblk_common(/* ufsvfsp */ NULL, dev,
562 			blkno, bsize, /* errflg */ 0));
563 }
564 
565 /*
566  * Assign a buffer for the given block.  If the appropriate
567  * block is already associated, return it; otherwise search
568  * for the oldest non-busy buffer and reassign it.
569  */
570 struct buf *
571 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
572 {
573 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
574 	struct buf *bp;
575 	struct buf *dp;
576 	struct buf *nbp = NULL;
577 	struct buf *errbp;
578 	uint_t		index;
579 	kmutex_t	*hmp;
580 	struct	hbuf	*hp;
581 
582 	if (getmajor(dev) >= devcnt)
583 		cmn_err(CE_PANIC, "blkdev");
584 
585 	biostats.bio_lookup.value.ui32++;
586 
587 	index = bio_bhash(dev, blkno);
588 	hp    = &hbuf[index];
589 	dp    = (struct buf *)hp;
590 	hmp   = &hp->b_lock;
591 
592 	mutex_enter(hmp);
593 loop:
594 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
595 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
596 		    (bp->b_flags & B_STALE))
597 			continue;
598 		/*
599 		 * Avoid holding the hash lock in the event that
600 		 * the buffer is locked by someone. Since the hash chain
601 		 * may change when we drop the hash lock
602 		 * we have to start at the beginning of the chain if the
603 		 * buffer identity/contents aren't valid.
604 		 */
605 		if (!sema_tryp(&bp->b_sem)) {
606 			biostats.bio_bufbusy.value.ui32++;
607 			mutex_exit(hmp);
608 			/*
609 			 * OK, we are dealing with a busy buffer.
610 			 * In the case that we are panicking and we
611 			 * got called from bread(), we have some chance
612 			 * for error recovery. So better bail out from
613 			 * here since sema_p() won't block. If we got
614 			 * called directly from ufs routines, there is
615 			 * no way to report an error yet.
616 			 */
617 			if (panicstr && errflg)
618 				goto errout;
619 			/*
620 			 * For the following line of code to work
621 			 * correctly never kmem_free the buffer "header".
622 			 */
623 			sema_p(&bp->b_sem);
624 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
625 			    (bp->b_flags & B_STALE)) {
626 				sema_v(&bp->b_sem);
627 				mutex_enter(hmp);
628 				goto loop;	/* start over */
629 			}
630 			mutex_enter(hmp);
631 		}
632 		/* Found */
633 		biostats.bio_hit.value.ui32++;
634 		bp->b_flags &= ~B_AGE;
635 
636 		/*
637 		 * Yank it off the free/delayed write lists
638 		 */
639 		hp->b_length--;
640 		notavail(bp);
641 		mutex_exit(hmp);
642 
643 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
644 
645 		if (nbp == NULL) {
646 			/*
647 			 * Make the common path short.
648 			 */
649 			ASSERT(SEMA_HELD(&bp->b_sem));
650 			return (bp);
651 		}
652 
653 		biostats.bio_bufdup.value.ui32++;
654 
655 		/*
656 		 * The buffer must have entered during the lock upgrade
657 		 * so free the new buffer we allocated and return the
658 		 * found buffer.
659 		 */
660 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
661 		nbp->b_un.b_addr = NULL;
662 
663 		/*
664 		 * Account for the memory
665 		 */
666 		mutex_enter(&bfree_lock);
667 		bfreelist.b_bufsize += nbp->b_bufsize;
668 		mutex_exit(&bfree_lock);
669 
670 		/*
671 		 * Destroy buf identity, and place on avail list
672 		 */
673 		nbp->b_dev = (o_dev_t)NODEV;
674 		nbp->b_edev = NODEV;
675 		nbp->b_flags = 0;
676 		nbp->b_file = NULL;
677 		nbp->b_offset = -1;
678 
679 		sema_v(&nbp->b_sem);
680 		bio_bhdr_free(nbp);
681 
682 		ASSERT(SEMA_HELD(&bp->b_sem));
683 		return (bp);
684 	}
685 
686 	/*
687 	 * bio_getfreeblk may block so check the hash chain again.
688 	 */
689 	if (nbp == NULL) {
690 		mutex_exit(hmp);
691 		nbp = bio_getfreeblk(bsize);
692 		mutex_enter(hmp);
693 		goto loop;
694 	}
695 
696 	/*
697 	 * New buffer. Assign nbp and stick it on the hash.
698 	 */
699 	nbp->b_flags = B_BUSY;
700 	nbp->b_edev = dev;
701 	nbp->b_dev = (o_dev_t)cmpdev(dev);
702 	nbp->b_blkno = blkno;
703 	nbp->b_iodone = NULL;
704 	nbp->b_bcount = bsize;
705 	/*
706 	 * If we are given a ufsvfsp and the vfs_root field is NULL
707 	 * then this must be I/O for a superblock.  A superblock's
708 	 * buffer is set up in mountfs() and there is no root vnode
709 	 * at that point.
710 	 */
711 	if (ufsvfsp && ufsvfsp->vfs_root) {
712 		nbp->b_vp = ufsvfsp->vfs_root;
713 	} else {
714 		nbp->b_vp = NULL;
715 	}
716 
717 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
718 
719 	binshash(nbp, dp);
720 	mutex_exit(hmp);
721 
722 	ASSERT(SEMA_HELD(&nbp->b_sem));
723 
724 	return (nbp);
725 
726 
727 	/*
728 	 * Come here in case of an internal error. At this point we couldn't
729 	 * get a buffer, but he have to return one. Hence we allocate some
730 	 * kind of error reply buffer on the fly. This buffer is marked as
731 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
732 	 *	- B_ERROR will indicate error to the caller.
733 	 *	- B_DONE will prevent us from reading the buffer from
734 	 *	  the device.
735 	 *	- B_NOCACHE will cause that this buffer gets free'd in
736 	 *	  brelse().
737 	 */
738 
739 errout:
740 	errbp = geteblk();
741 	sema_p(&errbp->b_sem);
742 	errbp->b_flags &= ~B_BUSY;
743 	errbp->b_flags |= (B_ERROR | B_DONE);
744 	return (errbp);
745 }
746 
747 /*
748  * Get an empty block, not assigned to any particular device.
749  * Returns a locked buffer that is not on any hash or free list.
750  */
751 struct buf *
752 ngeteblk(long bsize)
753 {
754 	struct buf *bp;
755 
756 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
757 	bioinit(bp);
758 	bp->av_forw = bp->av_back = NULL;
759 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
760 	bp->b_bufsize = bsize;
761 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
762 	bp->b_dev = (o_dev_t)NODEV;
763 	bp->b_edev = NODEV;
764 	bp->b_lblkno = 0;
765 	bp->b_bcount = bsize;
766 	bp->b_iodone = NULL;
767 	return (bp);
768 }
769 
/*
 * Fixed-size (1 KB) variant of ngeteblk().  The argument-less interface
 * is kept intact to maintain driver compatibility; use ngeteblk() to
 * allocate any other block size.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk(1024L));
}
779 
780 /*
781  * Return a buffer w/o sleeping
782  */
783 struct buf *
784 trygetblk(dev_t dev, daddr_t blkno)
785 {
786 	struct buf	*bp;
787 	struct buf	*dp;
788 	struct hbuf	*hp;
789 	kmutex_t	*hmp;
790 	uint_t		index;
791 
792 	index = bio_bhash(dev, blkno);
793 	hp = &hbuf[index];
794 	hmp = &hp->b_lock;
795 
796 	if (!mutex_tryenter(hmp))
797 		return (NULL);
798 
799 	dp = (struct buf *)hp;
800 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
801 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
802 		    (bp->b_flags & B_STALE))
803 			continue;
804 		/*
805 		 * Get access to a valid buffer without sleeping
806 		 */
807 		if (sema_tryp(&bp->b_sem)) {
808 			if (bp->b_flags & B_DONE) {
809 				hp->b_length--;
810 				notavail(bp);
811 				mutex_exit(hmp);
812 				return (bp);
813 			} else {
814 				sema_v(&bp->b_sem);
815 				break;
816 			}
817 		}
818 		break;
819 	}
820 	mutex_exit(hmp);
821 	return (NULL);
822 }
823 
824 /*
825  * Wait for I/O completion on the buffer; return errors
826  * to the user.
827  */
828 int
829 iowait(struct buf *bp)
830 {
831 	ASSERT(SEMA_HELD(&bp->b_sem));
832 	return (biowait(bp));
833 }
834 
835 /*
836  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
837  * and wake up anyone waiting for it.
838  */
839 void
840 iodone(struct buf *bp)
841 {
842 	ASSERT(SEMA_HELD(&bp->b_sem));
843 	(void) biodone(bp);
844 }
845 
846 /*
847  * Zero the core associated with a buffer.
848  */
849 void
850 clrbuf(struct buf *bp)
851 {
852 	ASSERT(SEMA_HELD(&bp->b_sem));
853 	bzero(bp->b_un.b_addr, bp->b_bcount);
854 	bp->b_resid = 0;
855 }
856 
857 
858 /*
859  * Make sure all write-behind blocks on dev (or NODEV for all)
860  * are flushed out.
861  */
862 void
863 bflush(dev_t dev)
864 {
865 	struct buf *bp, *dp;
866 	struct hbuf *hp;
867 	struct buf *delwri_list = EMPTY_LIST;
868 	int i, index;
869 	kmutex_t *hmp;
870 
871 	mutex_enter(&blist_lock);
872 	/*
873 	 * Wait for any invalidates or flushes ahead of us to finish.
874 	 * We really could split blist_lock up per device for better
875 	 * parallelism here.
876 	 */
877 	while (bio_doinginval || bio_doingflush) {
878 		bio_flinv_cv_wanted = 1;
879 		cv_wait(&bio_flushinval_cv, &blist_lock);
880 	}
881 	bio_doingflush++;
882 	/*
883 	 * Gather all B_DELWRI buffer for device.
884 	 * Lock ordering is b_sem > hash lock (brelse).
885 	 * Since we are finding the buffer via the delayed write list,
886 	 * it may be busy and we would block trying to get the
887 	 * b_sem lock while holding hash lock. So transfer all the
888 	 * candidates on the delwri_list and then drop the hash locks.
889 	 */
890 	for (i = 0; i < v.v_hbuf; i++) {
891 		vfs_syncprogress();
892 		hmp = &hbuf[i].b_lock;
893 		dp = (struct buf *)&dwbuf[i];
894 		mutex_enter(hmp);
895 		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
896 			if (dev == NODEV || bp->b_edev == dev) {
897 				if (bp->b_list == NULL) {
898 					bp->b_list = delwri_list;
899 					delwri_list = bp;
900 				}
901 			}
902 		}
903 		mutex_exit(hmp);
904 	}
905 	mutex_exit(&blist_lock);
906 
907 	/*
908 	 * Now that the hash locks have been dropped grab the semaphores
909 	 * and write back all the buffers that have B_DELWRI set.
910 	 */
911 	while (delwri_list != EMPTY_LIST) {
912 		vfs_syncprogress();
913 		bp = delwri_list;
914 
915 		sema_p(&bp->b_sem);	/* may block */
916 		if ((dev != bp->b_edev && dev != NODEV) ||
917 		    (panicstr && bp->b_flags & B_BUSY)) {
918 			sema_v(&bp->b_sem);
919 			delwri_list = bp->b_list;
920 			bp->b_list = NULL;
921 			continue;	/* No longer a candidate */
922 		}
923 		if (bp->b_flags & B_DELWRI) {
924 			index = bio_bhash(bp->b_edev, bp->b_blkno);
925 			hp = &hbuf[index];
926 			hmp = &hp->b_lock;
927 			dp = (struct buf *)hp;
928 
929 			bp->b_flags |= B_ASYNC;
930 			mutex_enter(hmp);
931 			hp->b_length--;
932 			notavail(bp);
933 			mutex_exit(hmp);
934 			if (bp->b_vp == NULL) {		/* !ufs */
935 				BWRITE(bp);
936 			} else {			/* ufs */
937 				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
938 			}
939 		} else {
940 			sema_v(&bp->b_sem);
941 		}
942 		delwri_list = bp->b_list;
943 		bp->b_list = NULL;
944 	}
945 	mutex_enter(&blist_lock);
946 	bio_doingflush--;
947 	if (bio_flinv_cv_wanted) {
948 		bio_flinv_cv_wanted = 0;
949 		cv_broadcast(&bio_flushinval_cv);
950 	}
951 	mutex_exit(&blist_lock);
952 }
953 
954 /*
955  * Ensure that a specified block is up-to-date on disk.
956  */
957 void
958 blkflush(dev_t dev, daddr_t blkno)
959 {
960 	struct buf *bp, *dp;
961 	struct hbuf *hp;
962 	struct buf *sbp = NULL;
963 	uint_t index;
964 	kmutex_t *hmp;
965 
966 	index = bio_bhash(dev, blkno);
967 	hp    = &hbuf[index];
968 	dp    = (struct buf *)hp;
969 	hmp   = &hp->b_lock;
970 
971 	/*
972 	 * Identify the buffer in the cache belonging to
973 	 * this device and blkno (if any).
974 	 */
975 	mutex_enter(hmp);
976 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
977 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
978 		    (bp->b_flags & B_STALE))
979 			continue;
980 		sbp = bp;
981 		break;
982 	}
983 	mutex_exit(hmp);
984 	if (sbp == NULL)
985 		return;
986 	/*
987 	 * Now check the buffer we have identified and
988 	 * make sure it still belongs to the device and is B_DELWRI
989 	 */
990 	sema_p(&sbp->b_sem);
991 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
992 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
993 		mutex_enter(hmp);
994 		hp->b_length--;
995 		notavail(sbp);
996 		mutex_exit(hmp);
997 		/*
998 		 * XXX - There is nothing to guarantee a synchronous
999 		 * write here if the B_ASYNC flag is set.  This needs
1000 		 * some investigation.
1001 		 */
1002 		if (sbp->b_vp == NULL) {		/* !ufs */
1003 			BWRITE(sbp);	/* synchronous write */
1004 		} else {				/* ufs */
1005 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1006 		}
1007 	} else {
1008 		sema_v(&sbp->b_sem);
1009 	}
1010 }
1011 
1012 /*
1013  * Same as binval, except can force-invalidate delayed-write buffers
1014  * (which are not be already flushed because of device errors).  Also
1015  * makes sure that the retry write flag is cleared.
1016  */
1017 int
1018 bfinval(dev_t dev, int force)
1019 {
1020 	struct buf *dp;
1021 	struct buf *bp;
1022 	struct buf *binval_list = EMPTY_LIST;
1023 	int i, error = 0;
1024 	kmutex_t *hmp;
1025 	uint_t index;
1026 	struct buf **backp;
1027 
1028 	mutex_enter(&blist_lock);
1029 	/*
1030 	 * Wait for any flushes ahead of us to finish, it's ok to
1031 	 * do invalidates in parallel.
1032 	 */
1033 	while (bio_doingflush) {
1034 		bio_flinv_cv_wanted = 1;
1035 		cv_wait(&bio_flushinval_cv, &blist_lock);
1036 	}
1037 	bio_doinginval++;
1038 
1039 	/* Gather bp's */
1040 	for (i = 0; i < v.v_hbuf; i++) {
1041 		dp = (struct buf *)&hbuf[i];
1042 		hmp = &hbuf[i].b_lock;
1043 
1044 		mutex_enter(hmp);
1045 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 			if (bp->b_edev == dev) {
1047 				if (bp->b_list == NULL) {
1048 					bp->b_list = binval_list;
1049 					binval_list = bp;
1050 				}
1051 			}
1052 		}
1053 		mutex_exit(hmp);
1054 	}
1055 	mutex_exit(&blist_lock);
1056 
1057 	/* Invalidate all bp's found */
1058 	while (binval_list != EMPTY_LIST) {
1059 		bp = binval_list;
1060 
1061 		sema_p(&bp->b_sem);
1062 		if (bp->b_edev == dev) {
1063 			if (force && (bp->b_flags & B_DELWRI)) {
1064 				/* clear B_DELWRI, move to non-dw freelist */
1065 				index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 				hmp = &hbuf[index].b_lock;
1067 				dp = (struct buf *)&hbuf[index];
1068 				mutex_enter(hmp);
1069 
1070 				/* remove from delayed write freelist */
1071 				notavail(bp);
1072 
1073 				/* add to B_AGE side of non-dw freelist */
1074 				backp = &dp->av_forw;
1075 				(*backp)->av_back = bp;
1076 				bp->av_forw = *backp;
1077 				*backp = bp;
1078 				bp->av_back = dp;
1079 
1080 				/*
1081 				 * make sure write retries and busy are cleared
1082 				 */
1083 				bp->b_flags &=
1084 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 				mutex_exit(hmp);
1086 			}
1087 			if ((bp->b_flags & B_DELWRI) == 0)
1088 				bp->b_flags |= B_STALE|B_AGE;
1089 			else
1090 				error = EIO;
1091 		}
1092 		sema_v(&bp->b_sem);
1093 		binval_list = bp->b_list;
1094 		bp->b_list = NULL;
1095 	}
1096 	mutex_enter(&blist_lock);
1097 	bio_doinginval--;
1098 	if (bio_flinv_cv_wanted) {
1099 		cv_broadcast(&bio_flushinval_cv);
1100 		bio_flinv_cv_wanted = 0;
1101 	}
1102 	mutex_exit(&blist_lock);
1103 	return (error);
1104 }
1105 
1106 /*
1107  * If possible, invalidate blocks for a dev on demand
1108  */
1109 void
1110 binval(dev_t dev)
1111 {
1112 	(void) bfinval(dev, 0);
1113 }
1114 
1115 /*
1116  * Initialize the buffer I/O system by freeing
1117  * all buffers and setting all device hash buffer lists to empty.
1118  */
1119 void
1120 binit(void)
1121 {
1122 	struct buf *bp;
1123 	unsigned int i, pct;
1124 	ulong_t	bio_max_hwm, bio_default_hwm;
1125 
1126 	/*
1127 	 * Maximum/Default values for bufhwm are set to the smallest of:
1128 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 	 *	- 1/4 of kernel virtual memory
1130 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 	 * Additionally, in order to allow simple tuning by percentage of
1132 	 * physical memory, bufhwm_pct is used to calculate the default if
1133 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1134 	 *
1135 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1137 	 */
1138 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1141 
1142 	pct = BIO_BUF_PERCENT;
1143 	if (bufhwm_pct != 0 &&
1144 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 		pct = BIO_BUF_PERCENT;
1146 		/*
1147 		 * Invalid user specified value, emit a warning.
1148 		 */
1149 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 			range(1..%d). Using %d as default.",
1151 			bufhwm_pct,
1152 			100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1153 	}
1154 
1155 	bio_default_hwm = MIN(physmem / pct,
1156 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1158 
1159 	if ((v.v_bufhwm = bufhwm) == 0)
1160 		v.v_bufhwm = bio_default_hwm;
1161 
1162 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 		v.v_bufhwm = (int)bio_max_hwm;
1164 		/*
1165 		 * Invalid user specified value, emit a warning.
1166 		 */
1167 		cmn_err(CE_WARN,
1168 			"binit: bufhwm(%d) out \
1169 			of range(%d..%lu). Using %lu as default",
1170 			bufhwm,
1171 			BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1172 	}
1173 
1174 	/*
1175 	 * Determine the number of hash buckets. Default is to
1176 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 	 * Round up number to the next power of 2.
1178 	 */
1179 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 	    BIO_HASHLEN);
1181 	v.v_hmask = v.v_hbuf - 1;
1182 	v.v_buf = BIO_BHDR_POOL;
1183 
1184 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1185 
1186 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1187 
1188 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 	bp = &bfreelist;
1190 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1191 
1192 	for (i = 0; i < v.v_hbuf; i++) {
1193 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1195 
1196 		/*
1197 		 * Initialize the delayed write buffer list.
1198 		 */
1199 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1201 	}
1202 }
1203 
1204 /*
1205  * Wait for I/O completion on the buffer; return error code.
1206  * If bp was for synchronous I/O, bp is invalid and associated
1207  * resources are freed on return.
1208  */
1209 int
1210 biowait(struct buf *bp)
1211 {
1212 	int error = 0;
1213 	struct cpu *cpup;
1214 
1215 	ASSERT(SEMA_HELD(&bp->b_sem));
1216 
1217 	cpup = CPU;
1218 	atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
1219 	DTRACE_IO1(wait__start, struct buf *, bp);
1220 
1221 	/*
1222 	 * In case of panic, busy wait for completion
1223 	 */
1224 	if (panicstr) {
1225 		while ((bp->b_flags & B_DONE) == 0)
1226 			drv_usecwait(10);
1227 	} else
1228 		sema_p(&bp->b_io);
1229 
1230 	DTRACE_IO1(wait__done, struct buf *, bp);
1231 	atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);
1232 
1233 	error = geterror(bp);
1234 	if ((bp->b_flags & B_ASYNC) == 0) {
1235 		if (bp->b_flags & B_REMAPPED)
1236 			bp_mapout(bp);
1237 	}
1238 	return (error);
1239 }
1240 
1241 static void
1242 biodone_tnf_probe(struct buf *bp)
1243 {
1244 	/* Kernel probe */
1245 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 		tnf_device,	device,		bp->b_edev,
1247 		tnf_diskaddr,	block,		bp->b_lblkno,
1248 		tnf_opaque,	buf,		bp);
1249 }
1250 
1251 /*
1252  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253  * and wake up anyone waiting for it.
1254  */
1255 void
1256 biodone(struct buf *bp)
1257 {
1258 	if (bp->b_flags & B_STARTED) {
1259 		DTRACE_IO1(done, struct buf *, bp);
1260 		bp->b_flags &= ~B_STARTED;
1261 	}
1262 
1263 	/*
1264 	 * Call the TNF probe here instead of the inline code
1265 	 * to force our compiler to use the tail call optimization.
1266 	 */
1267 	biodone_tnf_probe(bp);
1268 
1269 	if (bp->b_iodone != NULL) {
1270 		(*(bp->b_iodone))(bp);
1271 		return;
1272 	}
1273 	ASSERT((bp->b_flags & B_DONE) == 0);
1274 	ASSERT(SEMA_HELD(&bp->b_sem));
1275 	bp->b_flags |= B_DONE;
1276 	if (bp->b_flags & B_ASYNC) {
1277 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 			bio_pageio_done(bp);
1279 		else
1280 			brelse(bp);	/* release bp to freelist */
1281 	} else {
1282 		sema_v(&bp->b_io);
1283 	}
1284 }
1285 
1286 /*
1287  * Pick up the device's error number and pass it to the user;
1288  * if there is an error but the number is 0 set a generalized code.
1289  */
1290 int
1291 geterror(struct buf *bp)
1292 {
1293 	int error = 0;
1294 
1295 	ASSERT(SEMA_HELD(&bp->b_sem));
1296 	if (bp->b_flags & B_ERROR) {
1297 		error = bp->b_error;
1298 		if (!error)
1299 			error = EIO;
1300 	}
1301 	return (error);
1302 }
1303 
1304 /*
1305  * Support for pageio buffers.
1306  *
1307  * This stuff should be generalized to provide a generalized bp
1308  * header facility that can be used for things other than pageio.
1309  */
1310 
1311 /*
1312  * Allocate and initialize a buf struct for use with pageio.
1313  */
1314 struct buf *
1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1316 {
1317 	struct buf *bp;
1318 	struct cpu *cpup;
1319 
1320 	if (flags & B_READ) {
1321 		CPU_STATS_ENTER_K();
1322 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
1323 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1325 		if ((flags & B_ASYNC) == 0) {
1326 			klwp_t *lwp = ttolwp(curthread);
1327 			if (lwp != NULL)
1328 				lwp->lwp_ru.majflt++;
1329 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1330 			/* Kernel probe */
1331 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1332 				tnf_opaque,	vnode,		pp->p_vnode,
1333 				tnf_offset,	offset,		pp->p_offset);
1334 		}
1335 		/*
1336 		 * Update statistics for pages being paged in
1337 		 */
1338 		if (pp != NULL && pp->p_vnode != NULL) {
1339 			if (IS_SWAPFSVP(pp->p_vnode)) {
1340 				CPU_STATS_ADDQ(cpup, vm, anonpgin,
1341 						btopr(len));
1342 			} else {
1343 				if (pp->p_vnode->v_flag & VVMEXEC) {
1344 					CPU_STATS_ADDQ(cpup, vm, execpgin,
1345 							btopr(len));
1346 				} else {
1347 					CPU_STATS_ADDQ(cpup, vm, fspgin,
1348 							btopr(len));
1349 				}
1350 			}
1351 		}
1352 		CPU_STATS_EXIT_K();
1353 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1354 		    "page_ws_in:pp %p", pp);
1355 		/* Kernel probe */
1356 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1357 			tnf_opaque,	vnode,		pp->p_vnode,
1358 			tnf_offset,	offset,		pp->p_offset,
1359 			tnf_size,	size,		len);
1360 	}
1361 
1362 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1363 	bp->b_bcount = len;
1364 	bp->b_bufsize = len;
1365 	bp->b_pages = pp;
1366 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1367 	bp->b_offset = -1;
1368 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1369 
1370 	/* Initialize bp->b_sem in "locked" state */
1371 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1372 
1373 	VN_HOLD(vp);
1374 	bp->b_vp = vp;
1375 	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1376 
1377 	/*
1378 	 * Caller sets dev & blkno and can adjust
1379 	 * b_addr for page offset and can use bp_mapin
1380 	 * to make pages kernel addressable.
1381 	 */
1382 	return (bp);
1383 }
1384 
1385 void
1386 pageio_done(struct buf *bp)
1387 {
1388 	ASSERT(SEMA_HELD(&bp->b_sem));
1389 	if (bp->b_flags & B_REMAPPED)
1390 		bp_mapout(bp);
1391 	VN_RELE(bp->b_vp);
1392 	bp->b_vp = NULL;
1393 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1394 
1395 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1396 	sema_destroy(&bp->b_sem);
1397 	sema_destroy(&bp->b_io);
1398 	kmem_free(bp, sizeof (struct buf));
1399 }
1400 
1401 /*
1402  * Check to see whether the buffers, except the one pointed by sbp,
1403  * associated with the device are busy.
1404  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1405  */
1406 int
1407 bcheck(dev_t dev, struct buf *sbp)
1408 {
1409 	struct buf	*bp;
1410 	struct buf	*dp;
1411 	int i;
1412 	kmutex_t *hmp;
1413 
1414 	/*
1415 	 * check for busy bufs for this filesystem
1416 	 */
1417 	for (i = 0; i < v.v_hbuf; i++) {
1418 		dp = (struct buf *)&hbuf[i];
1419 		hmp = &hbuf[i].b_lock;
1420 
1421 		mutex_enter(hmp);
1422 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1423 			/*
1424 			 * if buf is busy or dirty, then filesystem is busy
1425 			 */
1426 			if ((bp->b_edev == dev) &&
1427 			    ((bp->b_flags & B_STALE) == 0) &&
1428 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1429 			    (bp != sbp)) {
1430 				mutex_exit(hmp);
1431 				return (1);
1432 			}
1433 		}
1434 		mutex_exit(hmp);
1435 	}
1436 	return (0);
1437 }
1438 
/*
 * Hash two 32 bit entities.
 *
 * Starting from x - 1, the hash is updated seven times with
 * hash = hash * 7 + term - 1, where the terms are x >> 8, x >> 16,
 * x >> 24, y, y >> 8, y >> 16 and y >> 24.
 *
 * The accumulation is done in unsigned arithmetic: the repeated
 * multiplication by 7 exceeds the range of int for ordinary inputs and
 * signed overflow is undefined behavior, whereas unsigned arithmetic
 * wraps modulo 2^N.  The result is converted back to int on return, so
 * callers see the same (wrapped) values as before.
 */
int
hash2ints(int x, int y)
{
	unsigned int hash;

	hash = (unsigned int)x - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 8)) - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 16)) - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 24)) - 1;
	hash = ((hash * 7) + (unsigned int)y) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 8)) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 16)) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 24)) - 1;

	return ((int)hash);
}
1458 
1459 
1460 /*
1461  * Return a new buffer struct.
1462  *	Create a new buffer if we haven't gone over our high water
1463  *	mark for memory, otherwise try to get one off the freelist.
1464  *
1465  * Returns a locked buf that has no id and is not on any hash or free
1466  * list.
1467  */
1468 static struct buf *
1469 bio_getfreeblk(long bsize)
1470 {
1471 	struct buf *bp, *dp;
1472 	struct hbuf *hp;
1473 	kmutex_t	*hmp;
1474 	uint_t		start, end;
1475 
1476 	/*
1477 	 * mutex_enter(&bfree_lock);
1478 	 * bfreelist.b_bufsize represents the amount of memory
1479 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
1480 	 * we are allowed to allocate in the cache before we hit our hwm.
1481 	 */
1482 	bio_mem_get(bsize);	/* Account for our memory request */
1483 
1484 again:
1485 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
1486 	sema_p(&bp->b_sem);	/* Should never fail */
1487 
1488 	ASSERT(bp->b_un.b_addr == NULL);
1489 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1490 	if (bp->b_un.b_addr != NULL) {
1491 		/*
1492 		 * Make the common path short
1493 		 */
1494 		bp->b_bufsize = bsize;
1495 		ASSERT(SEMA_HELD(&bp->b_sem));
1496 		return (bp);
1497 	} else {
1498 		struct buf *save;
1499 
1500 		save = bp;	/* Save bp we allocated */
1501 		start = end = lastindex;
1502 
1503 		biostats.bio_bufwant.value.ui32++;
1504 
1505 		/*
1506 		 * Memory isn't available from the system now. Scan
1507 		 * the hash buckets till enough space is found.
1508 		 */
1509 		do {
1510 			hp = &hbuf[start];
1511 			hmp = &hp->b_lock;
1512 			dp = (struct buf *)hp;
1513 
1514 			mutex_enter(hmp);
1515 			bp = dp->av_forw;
1516 
1517 			while (bp != dp) {
1518 
1519 				ASSERT(bp != NULL);
1520 
1521 				if (!sema_tryp(&bp->b_sem)) {
1522 					bp = bp->av_forw;
1523 					continue;
1524 				}
1525 
1526 				/*
1527 				 * Since we are going down the freelist
1528 				 * associated with this hash bucket the
1529 				 * B_DELWRI flag should not be set.
1530 				 */
1531 				ASSERT(!(bp->b_flags & B_DELWRI));
1532 
1533 				if (bp->b_bufsize == bsize) {
1534 					hp->b_length--;
1535 					notavail(bp);
1536 					bremhash(bp);
1537 					mutex_exit(hmp);
1538 
1539 					/*
1540 					 * Didn't kmem_alloc any more, so don't
1541 					 * count it twice.
1542 					 */
1543 					mutex_enter(&bfree_lock);
1544 					bfreelist.b_bufsize += bsize;
1545 					mutex_exit(&bfree_lock);
1546 
1547 					/*
1548 					 * Update the lastindex value.
1549 					 */
1550 					lastindex = start;
1551 
1552 					/*
1553 					 * Put our saved bp back on the list
1554 					 */
1555 					sema_v(&save->b_sem);
1556 					bio_bhdr_free(save);
1557 					ASSERT(SEMA_HELD(&bp->b_sem));
1558 					return (bp);
1559 				}
1560 				sema_v(&bp->b_sem);
1561 				bp = bp->av_forw;
1562 			}
1563 			mutex_exit(hmp);
1564 			start = ((start + 1) % v.v_hbuf);
1565 		} while (start != end);
1566 
1567 		biostats.bio_bufwait.value.ui32++;
1568 		bp = save;		/* Use original bp */
1569 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1570 	}
1571 
1572 	bp->b_bufsize = bsize;
1573 	ASSERT(SEMA_HELD(&bp->b_sem));
1574 	return (bp);
1575 }
1576 
1577 /*
1578  * Allocate a buffer header. If none currently available, allocate
1579  * a new pool.
1580  */
1581 static struct buf *
1582 bio_bhdr_alloc(void)
1583 {
1584 	struct buf *dp, *sdp;
1585 	struct buf *bp;
1586 	int i;
1587 
1588 	for (;;) {
1589 		mutex_enter(&bhdr_lock);
1590 		if (bhdrlist != NULL) {
1591 			bp = bhdrlist;
1592 			bhdrlist = bp->av_forw;
1593 			mutex_exit(&bhdr_lock);
1594 			bp->av_forw = NULL;
1595 			return (bp);
1596 		}
1597 		mutex_exit(&bhdr_lock);
1598 
1599 		/*
1600 		 * Need to allocate a new pool. If the system is currently
1601 		 * out of memory, then try freeing things on the freelist.
1602 		 */
1603 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1604 		if (dp == NULL) {
1605 			/*
1606 			 * System can't give us a pool of headers, try
1607 			 * recycling from the free lists.
1608 			 */
1609 			bio_recycle(BIO_HEADER, 0);
1610 		} else {
1611 			sdp = dp;
1612 			for (i = 0; i < v.v_buf; i++, dp++) {
1613 				/*
1614 				 * The next two lines are needed since NODEV
1615 				 * is -1 and not NULL
1616 				 */
1617 				dp->b_dev = (o_dev_t)NODEV;
1618 				dp->b_edev = NODEV;
1619 				dp->av_forw = dp + 1;
1620 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1621 				    NULL);
1622 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1623 				    NULL);
1624 				dp->b_offset = -1;
1625 			}
1626 			mutex_enter(&bhdr_lock);
1627 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
1628 			bhdrlist = sdp;
1629 			nbuf += v.v_buf;
1630 			bp = bhdrlist;
1631 			bhdrlist = bp->av_forw;
1632 			mutex_exit(&bhdr_lock);
1633 
1634 			bp->av_forw = NULL;
1635 			return (bp);
1636 		}
1637 	}
1638 }
1639 
1640 static  void
1641 bio_bhdr_free(struct buf *bp)
1642 {
1643 	ASSERT(bp->b_back == NULL);
1644 	ASSERT(bp->b_forw == NULL);
1645 	ASSERT(bp->av_back == NULL);
1646 	ASSERT(bp->av_forw == NULL);
1647 	ASSERT(bp->b_un.b_addr == NULL);
1648 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
1649 	ASSERT(bp->b_edev == NODEV);
1650 	ASSERT(bp->b_flags == 0);
1651 
1652 	mutex_enter(&bhdr_lock);
1653 	bp->av_forw = bhdrlist;
1654 	bhdrlist = bp;
1655 	mutex_exit(&bhdr_lock);
1656 }
1657 
1658 /*
1659  * If we haven't gone over the high water mark, it's o.k. to
1660  * allocate more buffer space, otherwise recycle buffers
1661  * from the freelist until enough memory is free for a bsize request.
1662  *
1663  * We account for this memory, even though
1664  * we don't allocate it here.
1665  */
1666 static void
1667 bio_mem_get(long bsize)
1668 {
1669 	mutex_enter(&bfree_lock);
1670 	if (bfreelist.b_bufsize > bsize) {
1671 		bfreelist.b_bufsize -= bsize;
1672 		mutex_exit(&bfree_lock);
1673 		return;
1674 	}
1675 	mutex_exit(&bfree_lock);
1676 	bio_recycle(BIO_MEM, bsize);
1677 }
1678 
1679 /*
1680  * flush a list of delayed write buffers.
1681  * (currently used only by bio_recycle below.)
1682  */
1683 static void
1684 bio_flushlist(struct buf *delwri_list)
1685 {
1686 	struct buf *bp;
1687 
1688 	while (delwri_list != EMPTY_LIST) {
1689 		bp = delwri_list;
1690 		bp->b_flags |= B_AGE | B_ASYNC;
1691 		if (bp->b_vp == NULL) {		/* !ufs */
1692 			BWRITE(bp);
1693 		} else {			/* ufs */
1694 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1695 		}
1696 		delwri_list = bp->b_list;
1697 		bp->b_list = NULL;
1698 	}
1699 }
1700 
1701 /*
1702  * Start recycling buffers on the freelist for one of 2 reasons:
1703  *	- we need a buffer header
1704  *	- we need to free up memory
1705  * Once started we continue to recycle buffers until the B_AGE
1706  * buffers are gone.
1707  */
1708 static void
1709 bio_recycle(int want, long bsize)
1710 {
1711 	struct buf *bp, *dp, *dwp, *nbp;
1712 	struct hbuf *hp;
1713 	int	found = 0;
1714 	kmutex_t	*hmp;
1715 	int		start, end;
1716 	struct buf *delwri_list = EMPTY_LIST;
1717 
1718 	/*
1719 	 * Recycle buffers.
1720 	 */
1721 top:
1722 	start = end = lastindex;
1723 	do {
1724 		hp = &hbuf[start];
1725 		hmp = &hp->b_lock;
1726 		dp = (struct buf *)hp;
1727 
1728 		mutex_enter(hmp);
1729 		bp = dp->av_forw;
1730 
1731 		while (bp != dp) {
1732 
1733 			ASSERT(bp != NULL);
1734 
1735 			if (!sema_tryp(&bp->b_sem)) {
1736 				bp = bp->av_forw;
1737 				continue;
1738 			}
1739 			/*
1740 			 * Do we really want to nuke all of the B_AGE stuff??
1741 			 */
1742 			if ((bp->b_flags & B_AGE) == 0 && found) {
1743 				sema_v(&bp->b_sem);
1744 				mutex_exit(hmp);
1745 				lastindex = start;
1746 				return;	/* All done */
1747 			}
1748 
1749 			ASSERT(MUTEX_HELD(&hp->b_lock));
1750 			ASSERT(!(bp->b_flags & B_DELWRI));
1751 			hp->b_length--;
1752 			notavail(bp);
1753 
1754 			/*
1755 			 * Remove bhdr from cache, free up memory,
1756 			 * and add the hdr to the freelist.
1757 			 */
1758 			bremhash(bp);
1759 			mutex_exit(hmp);
1760 
1761 			if (bp->b_bufsize) {
1762 				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1763 				bp->b_un.b_addr = NULL;
1764 				mutex_enter(&bfree_lock);
1765 				bfreelist.b_bufsize += bp->b_bufsize;
1766 				mutex_exit(&bfree_lock);
1767 			}
1768 
1769 			bp->b_dev = (o_dev_t)NODEV;
1770 			bp->b_edev = NODEV;
1771 			bp->b_flags = 0;
1772 			sema_v(&bp->b_sem);
1773 			bio_bhdr_free(bp);
1774 			if (want == BIO_HEADER) {
1775 				found = 1;
1776 			} else {
1777 				ASSERT(want == BIO_MEM);
1778 				if (!found && bfreelist.b_bufsize >= bsize) {
1779 					/* Account for the memory we want */
1780 					mutex_enter(&bfree_lock);
1781 					if (bfreelist.b_bufsize >= bsize) {
1782 						bfreelist.b_bufsize -= bsize;
1783 						found = 1;
1784 					}
1785 					mutex_exit(&bfree_lock);
1786 				}
1787 			}
1788 
1789 			/*
1790 			 * Since we dropped hmp start from the
1791 			 * begining.
1792 			 */
1793 			mutex_enter(hmp);
1794 			bp = dp->av_forw;
1795 		}
1796 		mutex_exit(hmp);
1797 
1798 		/*
1799 		 * Look at the delayed write list.
1800 		 * First gather into a private list, then write them.
1801 		 */
1802 		dwp = (struct buf *)&dwbuf[start];
1803 		mutex_enter(&blist_lock);
1804 		bio_doingflush++;
1805 		mutex_enter(hmp);
1806 		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1807 
1808 			ASSERT(bp != NULL);
1809 			nbp = bp->av_forw;
1810 
1811 			if (!sema_tryp(&bp->b_sem))
1812 				continue;
1813 			ASSERT(bp->b_flags & B_DELWRI);
1814 			/*
1815 			 * Do we really want to nuke all of the B_AGE stuff??
1816 			 */
1817 
1818 			if ((bp->b_flags & B_AGE) == 0 && found) {
1819 				sema_v(&bp->b_sem);
1820 				mutex_exit(hmp);
1821 				lastindex = start;
1822 				mutex_exit(&blist_lock);
1823 				bio_flushlist(delwri_list);
1824 				mutex_enter(&blist_lock);
1825 				bio_doingflush--;
1826 				if (bio_flinv_cv_wanted) {
1827 					bio_flinv_cv_wanted = 0;
1828 					cv_broadcast(&bio_flushinval_cv);
1829 				}
1830 				mutex_exit(&blist_lock);
1831 				return; /* All done */
1832 			}
1833 
1834 			/*
1835 			 * If the buffer is already on a flush or
1836 			 * invalidate list then just skip it.
1837 			 */
1838 			if (bp->b_list != NULL) {
1839 				sema_v(&bp->b_sem);
1840 				continue;
1841 			}
1842 			/*
1843 			 * We are still on the same bucket.
1844 			 */
1845 			hp->b_length--;
1846 			notavail(bp);
1847 			bp->b_list = delwri_list;
1848 			delwri_list = bp;
1849 		}
1850 		mutex_exit(hmp);
1851 		mutex_exit(&blist_lock);
1852 		bio_flushlist(delwri_list);
1853 		delwri_list = EMPTY_LIST;
1854 		mutex_enter(&blist_lock);
1855 		bio_doingflush--;
1856 		if (bio_flinv_cv_wanted) {
1857 			bio_flinv_cv_wanted = 0;
1858 			cv_broadcast(&bio_flushinval_cv);
1859 		}
1860 		mutex_exit(&blist_lock);
1861 		start = (start + 1) % v.v_hbuf;
1862 
1863 	} while (start != end);
1864 
1865 	if (found)
1866 		return;
1867 
1868 	/*
1869 	 * Free lists exhausted and we haven't satisfied the request.
1870 	 * Wait here for more entries to be added to freelist.
1871 	 * Because this might have just happened, make it timed.
1872 	 */
1873 	mutex_enter(&bfree_lock);
1874 	bfreelist.b_flags |= B_WANTED;
1875 	(void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt+hz);
1876 	mutex_exit(&bfree_lock);
1877 	goto top;
1878 }
1879 
1880 /*
1881  * See if the block is associated with some buffer
1882  * (mainly to avoid getting hung up on a wait in breada).
1883  */
1884 static int
1885 bio_incore(dev_t dev, daddr_t blkno)
1886 {
1887 	struct buf *bp;
1888 	struct buf *dp;
1889 	uint_t index;
1890 	kmutex_t *hmp;
1891 
1892 	index = bio_bhash(dev, blkno);
1893 	dp = (struct buf *)&hbuf[index];
1894 	hmp = &hbuf[index].b_lock;
1895 
1896 	mutex_enter(hmp);
1897 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1898 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1899 		    (bp->b_flags & B_STALE) == 0) {
1900 			mutex_exit(hmp);
1901 			return (1);
1902 		}
1903 	}
1904 	mutex_exit(hmp);
1905 	return (0);
1906 }
1907 
1908 static void
1909 bio_pageio_done(struct buf *bp)
1910 {
1911 	if (bp->b_flags & B_PAGEIO) {
1912 
1913 		if (bp->b_flags & B_REMAPPED)
1914 			bp_mapout(bp);
1915 
1916 		if (bp->b_flags & B_READ)
1917 			pvn_read_done(bp->b_pages, bp->b_flags);
1918 		else
1919 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1920 		pageio_done(bp);
1921 	} else {
1922 		ASSERT(bp->b_flags & B_REMAPPED);
1923 		bp_mapout(bp);
1924 		brelse(bp);
1925 	}
1926 }
1927 
1928 /*
1929  * bioerror(9F) - indicate error in buffer header
1930  * If 'error' is zero, remove the error indication.
1931  */
1932 void
1933 bioerror(struct buf *bp, int error)
1934 {
1935 	ASSERT(bp != NULL);
1936 	ASSERT(error >= 0);
1937 	ASSERT(SEMA_HELD(&bp->b_sem));
1938 
1939 	if (error != 0) {
1940 		bp->b_flags |= B_ERROR;
1941 	} else {
1942 		bp->b_flags &= ~B_ERROR;
1943 	}
1944 	bp->b_error = error;
1945 }
1946 
1947 /*
1948  * bioreset(9F) - reuse a private buffer header after I/O is complete
1949  */
1950 void
1951 bioreset(struct buf *bp)
1952 {
1953 	ASSERT(bp != NULL);
1954 
1955 	biofini(bp);
1956 	bioinit(bp);
1957 }
1958 
1959 /*
1960  * biosize(9F) - return size of a buffer header
1961  */
1962 size_t
1963 biosize(void)
1964 {
1965 	return (sizeof (struct buf));
1966 }
1967 
1968 /*
1969  * biomodified(9F) - check if buffer is modified
1970  */
1971 int
1972 biomodified(struct buf *bp)
1973 {
1974 	int npf;
1975 	int ppattr;
1976 	struct page *pp;
1977 
1978 	ASSERT(bp != NULL);
1979 
1980 	if ((bp->b_flags & B_PAGEIO) == 0) {
1981 		return (-1);
1982 	}
1983 	pp = bp->b_pages;
1984 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1985 
1986 	while (npf > 0) {
1987 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1988 				HAT_SYNC_STOPON_MOD);
1989 		if (ppattr & P_MOD)
1990 			return (1);
1991 		pp = pp->p_next;
1992 		npf--;
1993 	}
1994 
1995 	return (0);
1996 }
1997 
1998 /*
1999  * bioinit(9F) - initialize a buffer structure
2000  */
2001 void
2002 bioinit(struct buf *bp)
2003 {
2004 	bzero(bp, sizeof (struct buf));
2005 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2006 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2007 	bp->b_offset = -1;
2008 }
2009 
2010 /*
2011  * biofini(9F) - uninitialize a buffer structure
2012  */
2013 void
2014 biofini(struct buf *bp)
2015 {
2016 	sema_destroy(&bp->b_io);
2017 	sema_destroy(&bp->b_sem);
2018 }
2019 
2020 /*
2021  * bioclone(9F) - clone a buffer
2022  */
2023 struct buf *
2024 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2025     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2026 {
2027 	struct buf *bufp;
2028 
2029 	ASSERT(bp);
2030 	if (bp_mem == NULL) {
2031 		bufp = kmem_alloc(sizeof (struct buf), sleep);
2032 		if (bufp == NULL) {
2033 			return (NULL);
2034 		}
2035 		bioinit(bufp);
2036 	} else {
2037 		bufp = bp_mem;
2038 		bioreset(bufp);
2039 	}
2040 
2041 #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2042 	B_ABRWRITE)
2043 
2044 	/*
2045 	 * The cloned buffer does not inherit the B_REMAPPED flag.
2046 	 */
2047 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
2048 	bufp->b_bcount = len;
2049 	bufp->b_blkno = blkno;
2050 	bufp->b_iodone = iodone;
2051 	bufp->b_proc = bp->b_proc;
2052 	bufp->b_edev = dev;
2053 	bufp->b_file = bp->b_file;
2054 	bufp->b_offset = bp->b_offset;
2055 
2056 	if (bp->b_flags & B_SHADOW) {
2057 		ASSERT(bp->b_shadow);
2058 		ASSERT(bp->b_flags & B_PHYS);
2059 
2060 		bufp->b_shadow = bp->b_shadow +
2061 			btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2062 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2063 		if (bp->b_flags & B_REMAPPED)
2064 			bufp->b_proc = NULL;
2065 	} else {
2066 		if (bp->b_flags & B_PAGEIO) {
2067 			struct page *pp;
2068 			off_t o;
2069 			int i;
2070 
2071 			pp = bp->b_pages;
2072 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2073 			for (i = btop(o); i > 0; i--) {
2074 				pp = pp->p_next;
2075 			}
2076 			bufp->b_pages = pp;
2077 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2078 		} else {
2079 			bufp->b_un.b_addr =
2080 				(caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2081 			if (bp->b_flags & B_REMAPPED)
2082 				bufp->b_proc = NULL;
2083 		}
2084 	}
2085 	return (bufp);
2086 }
2087