xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision 2f8bbd9dee64b0f32e2f0e385b450b0d7dca7e32)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2011 Joyent, Inc.  All rights reserved.
25  */
26 
27 /*
28  * Copyright (c) 2016 by Delphix. All rights reserved.
29  */
30 
31 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
32 /*	  All Rights Reserved  	*/
33 
34 /*
35  * University Copyright- Copyright (c) 1982, 1986, 1988
36  * The Regents of the University of California
37  * All Rights Reserved
38  *
39  * University Acknowledgment- Portions of this document are derived from
40  * software developed by the University of California, Berkeley, and its
41  * contributors.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
47 #include <sys/conf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/buf.h>
52 #include <sys/var.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kmem.h>
57 #include <sys/vmem.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <sys/vtrace.h>
63 #include <sys/tnf_probe.h>
64 #include <sys/fs/ufs_inode.h>
65 #include <sys/fs/ufs_bio.h>
66 #include <sys/fs/ufs_log.h>
67 #include <sys/systm.h>
68 #include <sys/vfs.h>
69 #include <sys/sdt.h>
70 
/*
 * Locks.  Each trailing comment names the structure the lock guards.
 * blist_lock is also the mutex paired with bio_flushinval_cv in bflush()
 * and bfinval(); bfree_lock is held around the B_WANTED handshake on
 * bfreelist in brelse().
 */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

/*
 * hbuf[] and dwbuf[] are indexed by the same bio_bhash() value: brelse()
 * queues a released buffer on dwbuf[index] when B_DELWRI is set and on
 * hbuf[index] otherwise, in both cases under hbuf[index].b_lock.
 */
struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int 	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Map a (device, block number) pair to an index into hbuf[] / dwbuf[]. */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/*
 * End-of-chain marker for the b_list chains built by bflush() and
 * bfinval().  It is deliberately not NULL: those routines treat
 * b_list == NULL as "buffer is not currently on any such chain".
 */
#define	EMPTY_LIST	((struct buf *)-1)

/* bio_mem_cv is broadcast by brelse() when bfreelist has B_WANTED set. */
static kcondvar_t	bio_mem_cv; 	/* Condition variables */
/* Waited on (with blist_lock) by bflush() and bfinval(). */
static kcondvar_t	bio_flushinval_cv;
/* The three fields below are read and written with blist_lock held. */
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
92 
/*
 * Statistics on the buffer cache
 *
 * Every entry is a 32-bit named kstat.  getblk_common() increments the
 * bio_lookup, bio_hit, bio_bufbusy and bio_bufdup members without taking
 * a lock; presumably those members line up positionally with the names
 * below (the struct biostats declaration is not in this file -- confirm
 * against its header).
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 *
 * biostats is exported as a flat array of kstat_named_t: biostats_ptr is
 * its base address and biostats_ndata the number of entries, computed
 * from the structure size.
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));
111 
/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 *
 * In this file only ub_breads (bread_common()) and ub_bwrites
 * (bwrite_common()) are updated, and only on the ufs path that goes
 * straight to bdev_strategy() (no logging, no snapshot strategy).
 * The remaining counters are presumably maintained by ufs itself.
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
126 
/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 *
 * Both hooks start out NULL.  bread_common() and bwrite_common() test
 * them for NULL before every call and otherwise fall back to
 * bdev_strategy().
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/*
 * Private routines
 *
 * Forward declarations for helpers that are file-local to bio.c.
 */
static struct buf	*bio_getfreeblk(long);
static void 		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void 		bio_pageio_done(struct buf *);
static int 		bio_incore(dev_t, daddr_t);
148 
/*
 * Buffer cache constants
 *
 * Note that BIO_BUF_PERCENT and BIO_MAX_PERCENT are divisors, not
 * percentages: binit() computes physmem / BIO_MAX_PERCENT and
 * physmem / pct, so 100/2 == 50 yields 2% and 100/20 == 5 yields 20%.
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
/* Room for BIO_MIN_HDR buffers of MAXBSIZE bytes, expressed in KB. */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

/* Tunables defined outside this file. */
extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
166 
167 /*
168  * The following routines allocate and free
169  * buffers with various side effects.  In general the
170  * arguments to an allocate routine are a device and
171  * a block number, and the value is a pointer to
172  * the buffer header; the buffer returned is locked with a
173  * binary semaphore so that no one else can touch it. If the block was
174  * already in core, no I/O need be done; if it is
175  * already locked, the process waits until it becomes free.
176  * The following routines allocate a buffer:
177  *	getblk
178  *	bread/BREAD
179  *	breada
180  * Eventually the buffer must be released, possibly with the
181  * side effect of writing it out, by using one of
182  *	bwrite/BWRITE/brwrite
183  *	bdwrite/bdrwrite
184  *	bawrite
185  *	brelse
186  *
187  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
188  * Instead, a binary semaphore, b_sem is used to gain exclusive access to
189  * a buffer and a binary semaphore, b_io is used for I/O synchronization.
190  * B_DONE is still used to denote a buffer with I/O complete on it.
191  *
192  * The bfreelist.b_bcount field is computed every time fsflush runs. It
193  * should not be used where a very accurate count of the free buffers is
194  * needed.
195  */
196 
197 /*
198  * Read in (if necessary) the block and return a buffer pointer.
199  *
200  * This interface is provided for binary compatibility.  Using
201  * BREAD() directly avoids the extra function call overhead invoked
202  * by calling this routine.
203  */
204 struct buf *
205 bread(dev_t dev, daddr_t blkno, long bsize)
206 {
207 	return (BREAD(dev, blkno, bsize));
208 }
209 
210 /*
211  * Common code for reading a buffer with various options
212  *
213  * Read in (if necessary) the block and return a buffer pointer.
214  */
215 struct buf *
216 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
217 {
218 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
219 	struct buf *bp;
220 	klwp_t *lwp = ttolwp(curthread);
221 
222 	CPU_STATS_ADD_K(sys, lread, 1);
223 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
224 	if (bp->b_flags & B_DONE)
225 		return (bp);
226 	bp->b_flags |= B_READ;
227 	ASSERT(bp->b_bcount == bsize);
228 	if (ufsvfsp == NULL) {					/* !ufs */
229 		(void) bdev_strategy(bp);
230 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
231 							/* ufs && logging */
232 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
233 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
234 							/* ufs && snapshots */
235 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
236 	} else {
237 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
238 		ub.ub_breads.value.ul++;		/* ufs && !logging */
239 		(void) bdev_strategy(bp);
240 	}
241 	if (lwp != NULL)
242 		lwp->lwp_ru.inblock++;
243 	CPU_STATS_ADD_K(sys, bread, 1);
244 	(void) biowait(bp);
245 	return (bp);
246 }
247 
248 /*
249  * Read in the block, like bread, but also start I/O on the
250  * read-ahead block (which is not allocated to the caller).
251  */
252 struct buf *
253 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
254 {
255 	struct buf *bp, *rabp;
256 	klwp_t *lwp = ttolwp(curthread);
257 
258 	bp = NULL;
259 	if (!bio_incore(dev, blkno)) {
260 		CPU_STATS_ADD_K(sys, lread, 1);
261 		bp = GETBLK(dev, blkno, bsize);
262 		if ((bp->b_flags & B_DONE) == 0) {
263 			bp->b_flags |= B_READ;
264 			bp->b_bcount = bsize;
265 			(void) bdev_strategy(bp);
266 			if (lwp != NULL)
267 				lwp->lwp_ru.inblock++;
268 			CPU_STATS_ADD_K(sys, bread, 1);
269 		}
270 	}
271 	if (rablkno && bfreelist.b_bcount > 1 &&
272 	    !bio_incore(dev, rablkno)) {
273 		rabp = GETBLK(dev, rablkno, bsize);
274 		if (rabp->b_flags & B_DONE)
275 			brelse(rabp);
276 		else {
277 			rabp->b_flags |= B_READ|B_ASYNC;
278 			rabp->b_bcount = bsize;
279 			(void) bdev_strategy(rabp);
280 			if (lwp != NULL)
281 				lwp->lwp_ru.inblock++;
282 			CPU_STATS_ADD_K(sys, bread, 1);
283 		}
284 	}
285 	if (bp == NULL)
286 		return (BREAD(dev, blkno, bsize));
287 	(void) biowait(bp);
288 	return (bp);
289 }
290 
291 /*
292  * Common code for writing a buffer with various options.
293  *
294  * force_wait  - wait for write completion regardless of B_ASYNC flag
295  * do_relse    - release the buffer when we are done
296  * clear_flags - flags to clear from the buffer
297  */
298 void
299 bwrite_common(void *arg, struct buf *bp, int force_wait,
300     int do_relse, int clear_flags)
301 {
302 	register int do_wait;
303 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
304 	int flag;
305 	klwp_t *lwp = ttolwp(curthread);
306 	struct cpu *cpup;
307 
308 	ASSERT(SEMA_HELD(&bp->b_sem));
309 	flag = bp->b_flags;
310 	bp->b_flags &= ~clear_flags;
311 	if (lwp != NULL)
312 		lwp->lwp_ru.oublock++;
313 	CPU_STATS_ENTER_K();
314 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
315 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
316 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
317 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
318 	if (do_wait == 0)
319 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
320 	CPU_STATS_EXIT_K();
321 	if (ufsvfsp == NULL) {
322 		(void) bdev_strategy(bp);
323 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
324 							/* ufs && logging */
325 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
326 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
327 							/* ufs && snapshots */
328 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
329 	} else {
330 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
331 		(void) bdev_strategy(bp);
332 	}
333 	if (do_wait) {
334 		(void) biowait(bp);
335 		if (do_relse) {
336 			brelse(bp);
337 		}
338 	}
339 }
340 
/*
 * Write out the buffer and then release it.  The write is waited for
 * unless B_ASYNC is set on the buffer.
 *
 * Provided for binary compatibility only; callers that are able to use
 * the BWRITE() macro avoid the cost of this extra function call.
 */
void
bwrite(struct buf *bp)
{
	/* All of the work is done by the macro. */
	BWRITE(bp);
}
353 
/*
 * Write out the buffer and wait for completion, but leave the buffer
 * held by the caller afterwards (no release).
 *
 * Provided for binary compatibility only; BWRITE2() can be used directly
 * to avoid the extra function call.
 */
void
bwrite2(struct buf *bp)
{
	/* All of the work is done by the macro. */
	BWRITE2(bp);
}
365 
366 /*
367  * Release the buffer, marking it so that if it is grabbed
368  * for another purpose it will be written out before being
369  * given up (e.g. when writing a partial block where it is
370  * assumed that another write for the same block will soon follow).
371  * Also save the time that the block is first marked as delayed
372  * so that it will be written in a reasonable time.
373  */
374 void
375 bdwrite(struct buf *bp)
376 {
377 	ASSERT(SEMA_HELD(&bp->b_sem));
378 	CPU_STATS_ADD_K(sys, lwrite, 1);
379 	if ((bp->b_flags & B_DELWRI) == 0)
380 		bp->b_start = ddi_get_lbolt();
381 	/*
382 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
383 	 * buffer to be written before being reused, and setting b_resid
384 	 * to zero says the buffer is complete.
385 	 */
386 	bp->b_flags |= B_DELWRI | B_DONE;
387 	bp->b_resid = 0;
388 	brelse(bp);
389 }
390 
391 /*
392  * Release the buffer, start I/O on it, but don't wait for completion.
393  */
394 void
395 bawrite(struct buf *bp)
396 {
397 	ASSERT(SEMA_HELD(&bp->b_sem));
398 
399 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
400 	if (bfreelist.b_bcount > 4)
401 		bp->b_flags |= B_ASYNC;
402 	BWRITE(bp);
403 }
404 
405 /*
406  * Release the buffer, with no I/O implied.
407  */
408 void
409 brelse(struct buf *bp)
410 {
411 	struct buf	**backp;
412 	uint_t		index;
413 	kmutex_t	*hmp;
414 	struct	buf	*dp;
415 	struct	hbuf	*hp;
416 
417 
418 	ASSERT(SEMA_HELD(&bp->b_sem));
419 
420 	/*
421 	 * Clear the retry write flag if the buffer was written without
422 	 * error.  The presence of B_DELWRI means the buffer has not yet
423 	 * been written and the presence of B_ERROR means that an error
424 	 * is still occurring.
425 	 */
426 	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
427 		bp->b_flags &= ~B_RETRYWRI;
428 	}
429 
430 	/* Check for anomalous conditions */
431 	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
432 		if (bp->b_flags & B_NOCACHE) {
433 			/* Don't add to the freelist. Destroy it now */
434 			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
435 			sema_destroy(&bp->b_sem);
436 			sema_destroy(&bp->b_io);
437 			kmem_free(bp, sizeof (struct buf));
438 			return;
439 		}
440 		/*
441 		 * If a write failed and we are supposed to retry write,
442 		 * don't toss the buffer.  Keep it around and mark it
443 		 * delayed write in the hopes that it will eventually
444 		 * get flushed (and still keep the system running.)
445 		 */
446 		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
447 			bp->b_flags |= B_DELWRI;
448 			/* keep fsflush from trying continuously to flush */
449 			bp->b_start = ddi_get_lbolt();
450 		} else
451 			bp->b_flags |= B_AGE|B_STALE;
452 		bp->b_flags &= ~B_ERROR;
453 		bp->b_error = 0;
454 	}
455 
456 	/*
457 	 * If delayed write is set then put in on the delayed
458 	 * write list instead of the free buffer list.
459 	 */
460 	index = bio_bhash(bp->b_edev, bp->b_blkno);
461 	hmp   = &hbuf[index].b_lock;
462 
463 	mutex_enter(hmp);
464 	hp = &hbuf[index];
465 	dp = (struct buf *)hp;
466 
467 	/*
468 	 * Make sure that the number of entries on this list are
469 	 * Zero <= count <= total # buffers
470 	 */
471 	ASSERT(hp->b_length >= 0);
472 	ASSERT(hp->b_length < nbuf);
473 
474 	hp->b_length++;		/* We are adding this buffer */
475 
476 	if (bp->b_flags & B_DELWRI) {
477 		/*
478 		 * This buffer goes on the delayed write buffer list
479 		 */
480 		dp = (struct buf *)&dwbuf[index];
481 	}
482 	ASSERT(bp->b_bufsize > 0);
483 	ASSERT(bp->b_bcount > 0);
484 	ASSERT(bp->b_un.b_addr != NULL);
485 
486 	if (bp->b_flags & B_AGE) {
487 		backp = &dp->av_forw;
488 		(*backp)->av_back = bp;
489 		bp->av_forw = *backp;
490 		*backp = bp;
491 		bp->av_back = dp;
492 	} else {
493 		backp = &dp->av_back;
494 		(*backp)->av_forw = bp;
495 		bp->av_back = *backp;
496 		*backp = bp;
497 		bp->av_forw = dp;
498 	}
499 	mutex_exit(hmp);
500 
501 	if (bfreelist.b_flags & B_WANTED) {
502 		/*
503 		 * Should come here very very rarely.
504 		 */
505 		mutex_enter(&bfree_lock);
506 		if (bfreelist.b_flags & B_WANTED) {
507 			bfreelist.b_flags &= ~B_WANTED;
508 			cv_broadcast(&bio_mem_cv);
509 		}
510 		mutex_exit(&bfree_lock);
511 	}
512 
513 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
514 	/*
515 	 * Don't let anyone get the buffer off the freelist before we
516 	 * release our hold on it.
517 	 */
518 	sema_v(&bp->b_sem);
519 }
520 
521 /*
522  * Return a count of the number of B_BUSY buffers in the system
523  * Can only be used as a good estimate.  If 'cleanit' is set,
524  * try to flush all bufs.
525  */
526 int
527 bio_busy(int cleanit)
528 {
529 	struct buf *bp, *dp;
530 	int busy = 0;
531 	int i;
532 	kmutex_t *hmp;
533 
534 	for (i = 0; i < v.v_hbuf; i++) {
535 		dp = (struct buf *)&hbuf[i];
536 		hmp = &hbuf[i].b_lock;
537 
538 		mutex_enter(hmp);
539 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
540 			if (bp->b_flags & B_BUSY)
541 				busy++;
542 		}
543 		mutex_exit(hmp);
544 	}
545 
546 	if (cleanit && busy != 0) {
547 		bflush(NODEV);
548 	}
549 
550 	return (busy);
551 }
552 
553 /*
554  * this interface is provided for binary compatibility.
555  *
556  * Assign a buffer for the given block.  If the appropriate
557  * block is already associated, return it; otherwise search
558  * for the oldest non-busy buffer and reassign it.
559  */
560 struct buf *
561 getblk(dev_t dev, daddr_t blkno, long bsize)
562 {
563 	return (getblk_common(/* ufsvfsp */ NULL, dev,
564 	    blkno, bsize, /* errflg */ 0));
565 }
566 
567 /*
568  * Assign a buffer for the given block.  If the appropriate
569  * block is already associated, return it; otherwise search
570  * for the oldest non-busy buffer and reassign it.
571  */
572 struct buf *
573 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
574 {
575 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
576 	struct buf *bp;
577 	struct buf *dp;
578 	struct buf *nbp = NULL;
579 	struct buf *errbp;
580 	uint_t		index;
581 	kmutex_t	*hmp;
582 	struct	hbuf	*hp;
583 
584 	if (getmajor(dev) >= devcnt)
585 		cmn_err(CE_PANIC, "blkdev");
586 
587 	biostats.bio_lookup.value.ui32++;
588 
589 	index = bio_bhash(dev, blkno);
590 	hp    = &hbuf[index];
591 	dp    = (struct buf *)hp;
592 	hmp   = &hp->b_lock;
593 
594 	mutex_enter(hmp);
595 loop:
596 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
597 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
598 		    (bp->b_flags & B_STALE))
599 			continue;
600 		/*
601 		 * Avoid holding the hash lock in the event that
602 		 * the buffer is locked by someone. Since the hash chain
603 		 * may change when we drop the hash lock
604 		 * we have to start at the beginning of the chain if the
605 		 * buffer identity/contents aren't valid.
606 		 */
607 		if (!sema_tryp(&bp->b_sem)) {
608 			biostats.bio_bufbusy.value.ui32++;
609 			mutex_exit(hmp);
610 			/*
611 			 * OK, we are dealing with a busy buffer.
612 			 * In the case that we are panicking and we
613 			 * got called from bread(), we have some chance
614 			 * for error recovery. So better bail out from
615 			 * here since sema_p() won't block. If we got
616 			 * called directly from ufs routines, there is
617 			 * no way to report an error yet.
618 			 */
619 			if (panicstr && errflg)
620 				goto errout;
621 			/*
622 			 * For the following line of code to work
623 			 * correctly never kmem_free the buffer "header".
624 			 */
625 			sema_p(&bp->b_sem);
626 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
627 			    (bp->b_flags & B_STALE)) {
628 				sema_v(&bp->b_sem);
629 				mutex_enter(hmp);
630 				goto loop;	/* start over */
631 			}
632 			mutex_enter(hmp);
633 		}
634 		/* Found */
635 		biostats.bio_hit.value.ui32++;
636 		bp->b_flags &= ~B_AGE;
637 
638 		/*
639 		 * Yank it off the free/delayed write lists
640 		 */
641 		hp->b_length--;
642 		notavail(bp);
643 		mutex_exit(hmp);
644 
645 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
646 
647 		if (nbp == NULL) {
648 			/*
649 			 * Make the common path short.
650 			 */
651 			ASSERT(SEMA_HELD(&bp->b_sem));
652 			return (bp);
653 		}
654 
655 		biostats.bio_bufdup.value.ui32++;
656 
657 		/*
658 		 * The buffer must have entered during the lock upgrade
659 		 * so free the new buffer we allocated and return the
660 		 * found buffer.
661 		 */
662 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
663 		nbp->b_un.b_addr = NULL;
664 
665 		/*
666 		 * Account for the memory
667 		 */
668 		mutex_enter(&bfree_lock);
669 		bfreelist.b_bufsize += nbp->b_bufsize;
670 		mutex_exit(&bfree_lock);
671 
672 		/*
673 		 * Destroy buf identity, and place on avail list
674 		 */
675 		nbp->b_dev = (o_dev_t)NODEV;
676 		nbp->b_edev = NODEV;
677 		nbp->b_flags = 0;
678 		nbp->b_file = NULL;
679 		nbp->b_offset = -1;
680 
681 		sema_v(&nbp->b_sem);
682 		bio_bhdr_free(nbp);
683 
684 		ASSERT(SEMA_HELD(&bp->b_sem));
685 		return (bp);
686 	}
687 
688 	/*
689 	 * bio_getfreeblk may block so check the hash chain again.
690 	 */
691 	if (nbp == NULL) {
692 		mutex_exit(hmp);
693 		nbp = bio_getfreeblk(bsize);
694 		mutex_enter(hmp);
695 		goto loop;
696 	}
697 
698 	/*
699 	 * New buffer. Assign nbp and stick it on the hash.
700 	 */
701 	nbp->b_flags = B_BUSY;
702 	nbp->b_edev = dev;
703 	nbp->b_dev = (o_dev_t)cmpdev(dev);
704 	nbp->b_blkno = blkno;
705 	nbp->b_iodone = NULL;
706 	nbp->b_bcount = bsize;
707 	/*
708 	 * If we are given a ufsvfsp and the vfs_root field is NULL
709 	 * then this must be I/O for a superblock.  A superblock's
710 	 * buffer is set up in mountfs() and there is no root vnode
711 	 * at that point.
712 	 */
713 	if (ufsvfsp && ufsvfsp->vfs_root) {
714 		nbp->b_vp = ufsvfsp->vfs_root;
715 	} else {
716 		nbp->b_vp = NULL;
717 	}
718 
719 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
720 
721 	binshash(nbp, dp);
722 	mutex_exit(hmp);
723 
724 	ASSERT(SEMA_HELD(&nbp->b_sem));
725 
726 	return (nbp);
727 
728 
729 	/*
730 	 * Come here in case of an internal error. At this point we couldn't
731 	 * get a buffer, but we have to return one. Hence we allocate some
732 	 * kind of error reply buffer on the fly. This buffer is marked as
733 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
734 	 *	- B_ERROR will indicate error to the caller.
735 	 *	- B_DONE will prevent us from reading the buffer from
736 	 *	  the device.
737 	 *	- B_NOCACHE will cause that this buffer gets free'd in
738 	 *	  brelse().
739 	 */
740 
741 errout:
742 	errbp = geteblk();
743 	sema_p(&errbp->b_sem);
744 	errbp->b_flags &= ~B_BUSY;
745 	errbp->b_flags |= (B_ERROR | B_DONE);
746 	return (errbp);
747 }
748 
749 /*
750  * Get an empty block, not assigned to any particular device.
751  * Returns a locked buffer that is not on any hash or free list.
752  */
753 struct buf *
754 ngeteblk(long bsize)
755 {
756 	struct buf *bp;
757 
758 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
759 	bioinit(bp);
760 	bp->av_forw = bp->av_back = NULL;
761 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
762 	bp->b_bufsize = bsize;
763 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
764 	bp->b_dev = (o_dev_t)NODEV;
765 	bp->b_edev = NODEV;
766 	bp->b_lblkno = 0;
767 	bp->b_bcount = bsize;
768 	bp->b_iodone = NULL;
769 	return (bp);
770 }
771 
/*
 * Allocate an empty 1 KB block.
 *
 * The interface of geteblk() is kept intact to maintain driver
 * compatibility.  Use ngeteblk() for any block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk(1024L));
}
781 
782 /*
783  * Return a buffer w/o sleeping
784  */
785 struct buf *
786 trygetblk(dev_t dev, daddr_t blkno)
787 {
788 	struct buf	*bp;
789 	struct buf	*dp;
790 	struct hbuf	*hp;
791 	kmutex_t	*hmp;
792 	uint_t		index;
793 
794 	index = bio_bhash(dev, blkno);
795 	hp = &hbuf[index];
796 	hmp = &hp->b_lock;
797 
798 	if (!mutex_tryenter(hmp))
799 		return (NULL);
800 
801 	dp = (struct buf *)hp;
802 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
803 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
804 		    (bp->b_flags & B_STALE))
805 			continue;
806 		/*
807 		 * Get access to a valid buffer without sleeping
808 		 */
809 		if (sema_tryp(&bp->b_sem)) {
810 			if (bp->b_flags & B_DONE) {
811 				hp->b_length--;
812 				notavail(bp);
813 				mutex_exit(hmp);
814 				return (bp);
815 			} else {
816 				sema_v(&bp->b_sem);
817 				break;
818 			}
819 		}
820 		break;
821 	}
822 	mutex_exit(hmp);
823 	return (NULL);
824 }
825 
826 /*
827  * Wait for I/O completion on the buffer; return errors
828  * to the user.
829  */
830 int
831 iowait(struct buf *bp)
832 {
833 	ASSERT(SEMA_HELD(&bp->b_sem));
834 	return (biowait(bp));
835 }
836 
837 /*
838  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
839  * and wake up anyone waiting for it.
840  */
841 void
842 iodone(struct buf *bp)
843 {
844 	ASSERT(SEMA_HELD(&bp->b_sem));
845 	(void) biodone(bp);
846 }
847 
848 /*
849  * Zero the core associated with a buffer.
850  */
851 void
852 clrbuf(struct buf *bp)
853 {
854 	ASSERT(SEMA_HELD(&bp->b_sem));
855 	bzero(bp->b_un.b_addr, bp->b_bcount);
856 	bp->b_resid = 0;
857 }
858 
859 
860 /*
861  * Make sure all write-behind blocks on dev (or NODEV for all)
862  * are flushed out.
863  */
864 void
865 bflush(dev_t dev)
866 {
867 	struct buf *bp, *dp;
868 	struct hbuf *hp;
869 	struct buf *delwri_list = EMPTY_LIST;
870 	int i, index;
871 	kmutex_t *hmp;
872 
873 	mutex_enter(&blist_lock);
874 	/*
875 	 * Wait for any invalidates or flushes ahead of us to finish.
876 	 * We really could split blist_lock up per device for better
877 	 * parallelism here.
878 	 */
879 	while (bio_doinginval || bio_doingflush) {
880 		bio_flinv_cv_wanted = 1;
881 		cv_wait(&bio_flushinval_cv, &blist_lock);
882 	}
883 	bio_doingflush++;
884 	/*
885 	 * Gather all B_DELWRI buffer for device.
886 	 * Lock ordering is b_sem > hash lock (brelse).
887 	 * Since we are finding the buffer via the delayed write list,
888 	 * it may be busy and we would block trying to get the
889 	 * b_sem lock while holding hash lock. So transfer all the
890 	 * candidates on the delwri_list and then drop the hash locks.
891 	 */
892 	for (i = 0; i < v.v_hbuf; i++) {
893 		hmp = &hbuf[i].b_lock;
894 		dp = (struct buf *)&dwbuf[i];
895 		mutex_enter(hmp);
896 		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897 			if (dev == NODEV || bp->b_edev == dev) {
898 				if (bp->b_list == NULL) {
899 					bp->b_list = delwri_list;
900 					delwri_list = bp;
901 				}
902 			}
903 		}
904 		mutex_exit(hmp);
905 	}
906 	mutex_exit(&blist_lock);
907 
908 	/*
909 	 * Now that the hash locks have been dropped grab the semaphores
910 	 * and write back all the buffers that have B_DELWRI set.
911 	 */
912 	while (delwri_list != EMPTY_LIST) {
913 		bp = delwri_list;
914 
915 		sema_p(&bp->b_sem);	/* may block */
916 		if ((dev != bp->b_edev && dev != NODEV) ||
917 		    (panicstr && bp->b_flags & B_BUSY)) {
918 			sema_v(&bp->b_sem);
919 			delwri_list = bp->b_list;
920 			bp->b_list = NULL;
921 			continue;	/* No longer a candidate */
922 		}
923 		if (bp->b_flags & B_DELWRI) {
924 			index = bio_bhash(bp->b_edev, bp->b_blkno);
925 			hp = &hbuf[index];
926 			hmp = &hp->b_lock;
927 			dp = (struct buf *)hp;
928 
929 			bp->b_flags |= B_ASYNC;
930 			mutex_enter(hmp);
931 			hp->b_length--;
932 			notavail(bp);
933 			mutex_exit(hmp);
934 			if (bp->b_vp == NULL) {		/* !ufs */
935 				BWRITE(bp);
936 			} else {			/* ufs */
937 				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
938 			}
939 		} else {
940 			sema_v(&bp->b_sem);
941 		}
942 		delwri_list = bp->b_list;
943 		bp->b_list = NULL;
944 	}
945 	mutex_enter(&blist_lock);
946 	bio_doingflush--;
947 	if (bio_flinv_cv_wanted) {
948 		bio_flinv_cv_wanted = 0;
949 		cv_broadcast(&bio_flushinval_cv);
950 	}
951 	mutex_exit(&blist_lock);
952 }
953 
954 /*
955  * Ensure that a specified block is up-to-date on disk.
956  */
957 void
958 blkflush(dev_t dev, daddr_t blkno)
959 {
960 	struct buf *bp, *dp;
961 	struct hbuf *hp;
962 	struct buf *sbp = NULL;
963 	uint_t index;
964 	kmutex_t *hmp;
965 
966 	index = bio_bhash(dev, blkno);
967 	hp    = &hbuf[index];
968 	dp    = (struct buf *)hp;
969 	hmp   = &hp->b_lock;
970 
971 	/*
972 	 * Identify the buffer in the cache belonging to
973 	 * this device and blkno (if any).
974 	 */
975 	mutex_enter(hmp);
976 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
977 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
978 		    (bp->b_flags & B_STALE))
979 			continue;
980 		sbp = bp;
981 		break;
982 	}
983 	mutex_exit(hmp);
984 	if (sbp == NULL)
985 		return;
986 	/*
987 	 * Now check the buffer we have identified and
988 	 * make sure it still belongs to the device and is B_DELWRI
989 	 */
990 	sema_p(&sbp->b_sem);
991 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
992 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
993 		mutex_enter(hmp);
994 		hp->b_length--;
995 		notavail(sbp);
996 		mutex_exit(hmp);
997 		/*
998 		 * XXX - There is nothing to guarantee a synchronous
999 		 * write here if the B_ASYNC flag is set.  This needs
1000 		 * some investigation.
1001 		 */
1002 		if (sbp->b_vp == NULL) {		/* !ufs */
1003 			BWRITE(sbp);	/* synchronous write */
1004 		} else {				/* ufs */
1005 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1006 		}
1007 	} else {
1008 		sema_v(&sbp->b_sem);
1009 	}
1010 }
1011 
1012 /*
1013  * Same as binval, except can force-invalidate delayed-write buffers
1014  * (which are not be already flushed because of device errors).  Also
1015  * makes sure that the retry write flag is cleared.
1016  */
1017 int
1018 bfinval(dev_t dev, int force)
1019 {
1020 	struct buf *dp;
1021 	struct buf *bp;
1022 	struct buf *binval_list = EMPTY_LIST;
1023 	int i, error = 0;
1024 	kmutex_t *hmp;
1025 	uint_t index;
1026 	struct buf **backp;
1027 
1028 	mutex_enter(&blist_lock);
1029 	/*
1030 	 * Wait for any flushes ahead of us to finish, it's ok to
1031 	 * do invalidates in parallel.
1032 	 */
1033 	while (bio_doingflush) {
1034 		bio_flinv_cv_wanted = 1;
1035 		cv_wait(&bio_flushinval_cv, &blist_lock);
1036 	}
1037 	bio_doinginval++;
1038 
1039 	/* Gather bp's */
1040 	for (i = 0; i < v.v_hbuf; i++) {
1041 		dp = (struct buf *)&hbuf[i];
1042 		hmp = &hbuf[i].b_lock;
1043 
1044 		mutex_enter(hmp);
1045 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 			if (bp->b_edev == dev) {
1047 				if (bp->b_list == NULL) {
1048 					bp->b_list = binval_list;
1049 					binval_list = bp;
1050 				}
1051 			}
1052 		}
1053 		mutex_exit(hmp);
1054 	}
1055 	mutex_exit(&blist_lock);
1056 
1057 	/* Invalidate all bp's found */
1058 	while (binval_list != EMPTY_LIST) {
1059 		bp = binval_list;
1060 
1061 		sema_p(&bp->b_sem);
1062 		if (bp->b_edev == dev) {
1063 			if (force && (bp->b_flags & B_DELWRI)) {
1064 				/* clear B_DELWRI, move to non-dw freelist */
1065 				index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 				hmp = &hbuf[index].b_lock;
1067 				dp = (struct buf *)&hbuf[index];
1068 				mutex_enter(hmp);
1069 
1070 				/* remove from delayed write freelist */
1071 				notavail(bp);
1072 
1073 				/* add to B_AGE side of non-dw freelist */
1074 				backp = &dp->av_forw;
1075 				(*backp)->av_back = bp;
1076 				bp->av_forw = *backp;
1077 				*backp = bp;
1078 				bp->av_back = dp;
1079 
1080 				/*
1081 				 * make sure write retries and busy are cleared
1082 				 */
1083 				bp->b_flags &=
1084 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 				mutex_exit(hmp);
1086 			}
1087 			if ((bp->b_flags & B_DELWRI) == 0)
1088 				bp->b_flags |= B_STALE|B_AGE;
1089 			else
1090 				error = EIO;
1091 		}
1092 		sema_v(&bp->b_sem);
1093 		binval_list = bp->b_list;
1094 		bp->b_list = NULL;
1095 	}
1096 	mutex_enter(&blist_lock);
1097 	bio_doinginval--;
1098 	if (bio_flinv_cv_wanted) {
1099 		cv_broadcast(&bio_flushinval_cv);
1100 		bio_flinv_cv_wanted = 0;
1101 	}
1102 	mutex_exit(&blist_lock);
1103 	return (error);
1104 }
1105 
1106 /*
1107  * If possible, invalidate blocks for a dev on demand
1108  */
1109 void
1110 binval(dev_t dev)
1111 {
1112 	(void) bfinval(dev, 0);
1113 }
1114 
1115 /*
1116  * Initialize the buffer I/O system by freeing
1117  * all buffers and setting all device hash buffer lists to empty.
1118  */
1119 void
1120 binit(void)
1121 {
1122 	struct buf *bp;
1123 	unsigned int i, pct;
1124 	ulong_t	bio_max_hwm, bio_default_hwm;
1125 
1126 	/*
1127 	 * Maximum/Default values for bufhwm are set to the smallest of:
1128 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 	 *	- 1/4 of kernel virtual memory
1130 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 	 * Additionally, in order to allow simple tuning by percentage of
1132 	 * physical memory, bufhwm_pct is used to calculate the default if
1133 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1134 	 *
1135 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1137 	 */
1138 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1141 
1142 	pct = BIO_BUF_PERCENT;
1143 	if (bufhwm_pct != 0 &&
1144 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 		pct = BIO_BUF_PERCENT;
1146 		/*
1147 		 * Invalid user specified value, emit a warning.
1148 		 */
1149 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 		    range(1..%d). Using %d as default.",
1151 		    bufhwm_pct,
1152 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1153 	}
1154 
1155 	bio_default_hwm = MIN(physmem / pct,
1156 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1158 
1159 	if ((v.v_bufhwm = bufhwm) == 0)
1160 		v.v_bufhwm = bio_default_hwm;
1161 
1162 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 		v.v_bufhwm = (int)bio_max_hwm;
1164 		/*
1165 		 * Invalid user specified value, emit a warning.
1166 		 */
1167 		cmn_err(CE_WARN,
1168 		    "binit: bufhwm(%d) out \
1169 		    of range(%d..%lu). Using %lu as default",
1170 		    bufhwm,
1171 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1172 	}
1173 
1174 	/*
1175 	 * Determine the number of hash buckets. Default is to
1176 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 	 * Round up number to the next power of 2.
1178 	 */
1179 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 	    BIO_HASHLEN);
1181 	v.v_hmask = v.v_hbuf - 1;
1182 	v.v_buf = BIO_BHDR_POOL;
1183 
1184 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1185 
1186 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1187 
1188 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 	bp = &bfreelist;
1190 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1191 
1192 	for (i = 0; i < v.v_hbuf; i++) {
1193 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1195 
1196 		/*
1197 		 * Initialize the delayed write buffer list.
1198 		 */
1199 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1201 	}
1202 }
1203 
1204 /*
1205  * Wait for I/O completion on the buffer; return error code.
1206  * If bp was for synchronous I/O, bp is invalid and associated
1207  * resources are freed on return.
1208  */
1209 int
1210 biowait(struct buf *bp)
1211 {
1212 	int error = 0;
1213 	struct cpu *cpup;
1214 
1215 	ASSERT(SEMA_HELD(&bp->b_sem));
1216 
1217 	cpup = CPU;
1218 	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219 	DTRACE_IO1(wait__start, struct buf *, bp);
1220 
1221 	/*
1222 	 * In case of panic, busy wait for completion
1223 	 */
1224 	if (panicstr) {
1225 		while ((bp->b_flags & B_DONE) == 0)
1226 			drv_usecwait(10);
1227 	} else
1228 		sema_p(&bp->b_io);
1229 
1230 	DTRACE_IO1(wait__done, struct buf *, bp);
1231 	atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1232 
1233 	error = geterror(bp);
1234 	if ((bp->b_flags & B_ASYNC) == 0) {
1235 		if (bp->b_flags & B_REMAPPED)
1236 			bp_mapout(bp);
1237 	}
1238 	return (error);
1239 }
1240 
1241 static void
1242 biodone_tnf_probe(struct buf *bp)
1243 {
1244 	/* Kernel probe */
1245 	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 	    tnf_device,		device,		bp->b_edev,
1247 	    tnf_diskaddr,	block,		bp->b_lblkno,
1248 	    tnf_opaque,		buf,		bp);
1249 }
1250 
1251 /*
1252  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253  * and wake up anyone waiting for it.
1254  */
1255 void
1256 biodone(struct buf *bp)
1257 {
1258 	if (bp->b_flags & B_STARTED) {
1259 		DTRACE_IO1(done, struct buf *, bp);
1260 		bp->b_flags &= ~B_STARTED;
1261 	}
1262 
1263 	/*
1264 	 * Call the TNF probe here instead of the inline code
1265 	 * to force our compiler to use the tail call optimization.
1266 	 */
1267 	biodone_tnf_probe(bp);
1268 
1269 	if (bp->b_iodone != NULL) {
1270 		(*(bp->b_iodone))(bp);
1271 		return;
1272 	}
1273 	ASSERT((bp->b_flags & B_DONE) == 0);
1274 	ASSERT(SEMA_HELD(&bp->b_sem));
1275 	bp->b_flags |= B_DONE;
1276 	if (bp->b_flags & B_ASYNC) {
1277 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 			bio_pageio_done(bp);
1279 		else
1280 			brelse(bp);	/* release bp to freelist */
1281 	} else {
1282 		sema_v(&bp->b_io);
1283 	}
1284 }
1285 
1286 /*
1287  * Pick up the device's error number and pass it to the user;
1288  * if there is an error but the number is 0 set a generalized code.
1289  */
1290 int
1291 geterror(struct buf *bp)
1292 {
1293 	int error = 0;
1294 
1295 	ASSERT(SEMA_HELD(&bp->b_sem));
1296 	if (bp->b_flags & B_ERROR) {
1297 		error = bp->b_error;
1298 		if (!error)
1299 			error = EIO;
1300 	}
1301 	return (error);
1302 }
1303 
1304 /*
1305  * Support for pageio buffers.
1306  *
1307  * This stuff should be generalized to provide a generalized bp
1308  * header facility that can be used for things other than pageio.
1309  */
1310 
1311 /*
1312  * Allocate and initialize a buf struct for use with pageio.
1313  */
1314 struct buf *
1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1316 {
1317 	struct buf *bp;
1318 	struct cpu *cpup;
1319 
1320 	if (flags & B_READ) {
1321 		CPU_STATS_ENTER_K();
1322 		cpup = CPU;	/* get pointer AFTER preemption is disabled */
1323 		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1325 
1326 		atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1327 
1328 		if ((flags & B_ASYNC) == 0) {
1329 			klwp_t *lwp = ttolwp(curthread);
1330 			if (lwp != NULL)
1331 				lwp->lwp_ru.majflt++;
1332 			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333 			/* Kernel probe */
1334 			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335 			    tnf_opaque,		vnode,		pp->p_vnode,
1336 			    tnf_offset,		offset,		pp->p_offset);
1337 		}
1338 		/*
1339 		 * Update statistics for pages being paged in
1340 		 */
1341 		if (pp != NULL && pp->p_vnode != NULL) {
1342 			if (IS_SWAPFSVP(pp->p_vnode)) {
1343 				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344 				atomic_add_64(&curzone->zone_anonpgin,
1345 				    btopr(len));
1346 			} else {
1347 				if (pp->p_vnode->v_flag & VVMEXEC) {
1348 					CPU_STATS_ADDQ(cpup, vm, execpgin,
1349 					    btopr(len));
1350 					atomic_add_64(&curzone->zone_execpgin,
1351 					    btopr(len));
1352 				} else {
1353 					CPU_STATS_ADDQ(cpup, vm, fspgin,
1354 					    btopr(len));
1355 					atomic_add_64(&curzone->zone_fspgin,
1356 					    btopr(len));
1357 				}
1358 			}
1359 		}
1360 		CPU_STATS_EXIT_K();
1361 		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362 		    "page_ws_in:pp %p", pp);
1363 		/* Kernel probe */
1364 		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365 		    tnf_opaque,	vnode,	pp->p_vnode,
1366 		    tnf_offset,	offset,	pp->p_offset,
1367 		    tnf_size,	size,	len);
1368 	}
1369 
1370 	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371 	bp->b_bcount = len;
1372 	bp->b_bufsize = len;
1373 	bp->b_pages = pp;
1374 	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375 	bp->b_offset = -1;
1376 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1377 
1378 	/* Initialize bp->b_sem in "locked" state */
1379 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1380 
1381 	VN_HOLD(vp);
1382 	bp->b_vp = vp;
1383 	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1384 
1385 	/*
1386 	 * Caller sets dev & blkno and can adjust
1387 	 * b_addr for page offset and can use bp_mapin
1388 	 * to make pages kernel addressable.
1389 	 */
1390 	return (bp);
1391 }
1392 
1393 void
1394 pageio_done(struct buf *bp)
1395 {
1396 	ASSERT(SEMA_HELD(&bp->b_sem));
1397 	if (bp->b_flags & B_REMAPPED)
1398 		bp_mapout(bp);
1399 	VN_RELE(bp->b_vp);
1400 	bp->b_vp = NULL;
1401 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1402 
1403 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1404 	sema_destroy(&bp->b_sem);
1405 	sema_destroy(&bp->b_io);
1406 	kmem_free(bp, sizeof (struct buf));
1407 }
1408 
1409 /*
1410  * Check to see whether the buffers, except the one pointed by sbp,
1411  * associated with the device are busy.
1412  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1413  */
1414 int
1415 bcheck(dev_t dev, struct buf *sbp)
1416 {
1417 	struct buf	*bp;
1418 	struct buf	*dp;
1419 	int i;
1420 	kmutex_t *hmp;
1421 
1422 	/*
1423 	 * check for busy bufs for this filesystem
1424 	 */
1425 	for (i = 0; i < v.v_hbuf; i++) {
1426 		dp = (struct buf *)&hbuf[i];
1427 		hmp = &hbuf[i].b_lock;
1428 
1429 		mutex_enter(hmp);
1430 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1431 			/*
1432 			 * if buf is busy or dirty, then filesystem is busy
1433 			 */
1434 			if ((bp->b_edev == dev) &&
1435 			    ((bp->b_flags & B_STALE) == 0) &&
1436 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1437 			    (bp != sbp)) {
1438 				mutex_exit(hmp);
1439 				return (1);
1440 			}
1441 		}
1442 		mutex_exit(hmp);
1443 	}
1444 	return (0);
1445 }
1446 
/*
 * Hash two 32 bit entities.
 *
 * Starting from x, each of x >> 8, x >> 16, x >> 24, y, y >> 8,
 * y >> 16 and y >> 24 is folded in as hash = hash * 7 + term - 1.
 *
 * The accumulation is carried out in unsigned arithmetic: the
 * intermediate products routinely exceed the range of int, and signed
 * overflow is undefined behaviour whereas unsigned arithmetic wraps.
 * The shifts are still applied to the signed arguments so that the
 * terms are the same as before.
 */
int
hash2ints(int x, int y)
{
	unsigned int hash;

	hash = (unsigned int)x - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 8)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 16)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(x >> 24)) - 1U;
	hash = ((hash * 7U) + (unsigned int)y) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 8)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 16)) - 1U;
	hash = ((hash * 7U) + (unsigned int)(y >> 24)) - 1U;

	return ((int)hash);
}
1466 
1467 
1468 /*
1469  * Return a new buffer struct.
1470  *	Create a new buffer if we haven't gone over our high water
1471  *	mark for memory, otherwise try to get one off the freelist.
1472  *
1473  * Returns a locked buf that has no id and is not on any hash or free
1474  * list.
1475  */
1476 static struct buf *
1477 bio_getfreeblk(long bsize)
1478 {
1479 	struct buf *bp, *dp;
1480 	struct hbuf *hp;
1481 	kmutex_t	*hmp;
1482 	uint_t		start, end;
1483 
1484 	/*
1485 	 * mutex_enter(&bfree_lock);
1486 	 * bfreelist.b_bufsize represents the amount of memory
1487 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
1488 	 * we are allowed to allocate in the cache before we hit our hwm.
1489 	 */
1490 	bio_mem_get(bsize);	/* Account for our memory request */
1491 
1492 again:
1493 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
1494 	sema_p(&bp->b_sem);	/* Should never fail */
1495 
1496 	ASSERT(bp->b_un.b_addr == NULL);
1497 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1498 	if (bp->b_un.b_addr != NULL) {
1499 		/*
1500 		 * Make the common path short
1501 		 */
1502 		bp->b_bufsize = bsize;
1503 		ASSERT(SEMA_HELD(&bp->b_sem));
1504 		return (bp);
1505 	} else {
1506 		struct buf *save;
1507 
1508 		save = bp;	/* Save bp we allocated */
1509 		start = end = lastindex;
1510 
1511 		biostats.bio_bufwant.value.ui32++;
1512 
1513 		/*
1514 		 * Memory isn't available from the system now. Scan
1515 		 * the hash buckets till enough space is found.
1516 		 */
1517 		do {
1518 			hp = &hbuf[start];
1519 			hmp = &hp->b_lock;
1520 			dp = (struct buf *)hp;
1521 
1522 			mutex_enter(hmp);
1523 			bp = dp->av_forw;
1524 
1525 			while (bp != dp) {
1526 
1527 				ASSERT(bp != NULL);
1528 
1529 				if (!sema_tryp(&bp->b_sem)) {
1530 					bp = bp->av_forw;
1531 					continue;
1532 				}
1533 
1534 				/*
1535 				 * Since we are going down the freelist
1536 				 * associated with this hash bucket the
1537 				 * B_DELWRI flag should not be set.
1538 				 */
1539 				ASSERT(!(bp->b_flags & B_DELWRI));
1540 
1541 				if (bp->b_bufsize == bsize) {
1542 					hp->b_length--;
1543 					notavail(bp);
1544 					bremhash(bp);
1545 					mutex_exit(hmp);
1546 
1547 					/*
1548 					 * Didn't kmem_alloc any more, so don't
1549 					 * count it twice.
1550 					 */
1551 					mutex_enter(&bfree_lock);
1552 					bfreelist.b_bufsize += bsize;
1553 					mutex_exit(&bfree_lock);
1554 
1555 					/*
1556 					 * Update the lastindex value.
1557 					 */
1558 					lastindex = start;
1559 
1560 					/*
1561 					 * Put our saved bp back on the list
1562 					 */
1563 					sema_v(&save->b_sem);
1564 					bio_bhdr_free(save);
1565 					ASSERT(SEMA_HELD(&bp->b_sem));
1566 					return (bp);
1567 				}
1568 				sema_v(&bp->b_sem);
1569 				bp = bp->av_forw;
1570 			}
1571 			mutex_exit(hmp);
1572 			start = ((start + 1) % v.v_hbuf);
1573 		} while (start != end);
1574 
1575 		biostats.bio_bufwait.value.ui32++;
1576 		bp = save;		/* Use original bp */
1577 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1578 	}
1579 
1580 	bp->b_bufsize = bsize;
1581 	ASSERT(SEMA_HELD(&bp->b_sem));
1582 	return (bp);
1583 }
1584 
1585 /*
1586  * Allocate a buffer header. If none currently available, allocate
1587  * a new pool.
1588  */
1589 static struct buf *
1590 bio_bhdr_alloc(void)
1591 {
1592 	struct buf *dp, *sdp;
1593 	struct buf *bp;
1594 	int i;
1595 
1596 	for (;;) {
1597 		mutex_enter(&bhdr_lock);
1598 		if (bhdrlist != NULL) {
1599 			bp = bhdrlist;
1600 			bhdrlist = bp->av_forw;
1601 			mutex_exit(&bhdr_lock);
1602 			bp->av_forw = NULL;
1603 			return (bp);
1604 		}
1605 		mutex_exit(&bhdr_lock);
1606 
1607 		/*
1608 		 * Need to allocate a new pool. If the system is currently
1609 		 * out of memory, then try freeing things on the freelist.
1610 		 */
1611 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1612 		if (dp == NULL) {
1613 			/*
1614 			 * System can't give us a pool of headers, try
1615 			 * recycling from the free lists.
1616 			 */
1617 			bio_recycle(BIO_HEADER, 0);
1618 		} else {
1619 			sdp = dp;
1620 			for (i = 0; i < v.v_buf; i++, dp++) {
1621 				/*
1622 				 * The next two lines are needed since NODEV
1623 				 * is -1 and not NULL
1624 				 */
1625 				dp->b_dev = (o_dev_t)NODEV;
1626 				dp->b_edev = NODEV;
1627 				dp->av_forw = dp + 1;
1628 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1629 				    NULL);
1630 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1631 				    NULL);
1632 				dp->b_offset = -1;
1633 			}
1634 			mutex_enter(&bhdr_lock);
1635 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
1636 			bhdrlist = sdp;
1637 			nbuf += v.v_buf;
1638 			bp = bhdrlist;
1639 			bhdrlist = bp->av_forw;
1640 			mutex_exit(&bhdr_lock);
1641 
1642 			bp->av_forw = NULL;
1643 			return (bp);
1644 		}
1645 	}
1646 }
1647 
1648 static  void
1649 bio_bhdr_free(struct buf *bp)
1650 {
1651 	ASSERT(bp->b_back == NULL);
1652 	ASSERT(bp->b_forw == NULL);
1653 	ASSERT(bp->av_back == NULL);
1654 	ASSERT(bp->av_forw == NULL);
1655 	ASSERT(bp->b_un.b_addr == NULL);
1656 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
1657 	ASSERT(bp->b_edev == NODEV);
1658 	ASSERT(bp->b_flags == 0);
1659 
1660 	mutex_enter(&bhdr_lock);
1661 	bp->av_forw = bhdrlist;
1662 	bhdrlist = bp;
1663 	mutex_exit(&bhdr_lock);
1664 }
1665 
1666 /*
1667  * If we haven't gone over the high water mark, it's o.k. to
1668  * allocate more buffer space, otherwise recycle buffers
1669  * from the freelist until enough memory is free for a bsize request.
1670  *
1671  * We account for this memory, even though
1672  * we don't allocate it here.
1673  */
1674 static void
1675 bio_mem_get(long bsize)
1676 {
1677 	mutex_enter(&bfree_lock);
1678 	if (bfreelist.b_bufsize > bsize) {
1679 		bfreelist.b_bufsize -= bsize;
1680 		mutex_exit(&bfree_lock);
1681 		return;
1682 	}
1683 	mutex_exit(&bfree_lock);
1684 	bio_recycle(BIO_MEM, bsize);
1685 }
1686 
1687 /*
1688  * flush a list of delayed write buffers.
1689  * (currently used only by bio_recycle below.)
1690  */
1691 static void
1692 bio_flushlist(struct buf *delwri_list)
1693 {
1694 	struct buf *bp;
1695 
1696 	while (delwri_list != EMPTY_LIST) {
1697 		bp = delwri_list;
1698 		bp->b_flags |= B_AGE | B_ASYNC;
1699 		if (bp->b_vp == NULL) {		/* !ufs */
1700 			BWRITE(bp);
1701 		} else {			/* ufs */
1702 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1703 		}
1704 		delwri_list = bp->b_list;
1705 		bp->b_list = NULL;
1706 	}
1707 }
1708 
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header (want == BIO_HEADER)
 *	- we need to free up memory (want == BIO_MEM, bsize bytes)
 * Once the request has been satisfied we keep recycling, and stop at
 * the first buffer we can lock that is not marked B_AGE.
 *
 * The hash buckets are visited starting at lastindex.  For each bucket
 * the free list is emptied first, then the delayed write list is
 * gathered into a private list and written out with bio_flushlist().
 * If a full pass over all buckets does not satisfy the request, we
 * wait on bio_mem_cv (for at most hz ticks) and start over.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			/* Skip buffers that are locked by somebody else. */
			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			/* Give the buffer's memory back to the cache budget. */
			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				/*
				 * Peek at the budget without the lock, then
				 * check again with bfree_lock held before
				 * claiming it.
				 */
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			/* Fetch the successor first; bp may leave this list. */
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				/* Write what was gathered before leaving. */
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		/* Wake anyone waiting for the flush to finish. */
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}
1887 
1888 /*
1889  * See if the block is associated with some buffer
1890  * (mainly to avoid getting hung up on a wait in breada).
1891  */
1892 static int
1893 bio_incore(dev_t dev, daddr_t blkno)
1894 {
1895 	struct buf *bp;
1896 	struct buf *dp;
1897 	uint_t index;
1898 	kmutex_t *hmp;
1899 
1900 	index = bio_bhash(dev, blkno);
1901 	dp = (struct buf *)&hbuf[index];
1902 	hmp = &hbuf[index].b_lock;
1903 
1904 	mutex_enter(hmp);
1905 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1906 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1907 		    (bp->b_flags & B_STALE) == 0) {
1908 			mutex_exit(hmp);
1909 			return (1);
1910 		}
1911 	}
1912 	mutex_exit(hmp);
1913 	return (0);
1914 }
1915 
1916 static void
1917 bio_pageio_done(struct buf *bp)
1918 {
1919 	if (bp->b_flags & B_PAGEIO) {
1920 
1921 		if (bp->b_flags & B_REMAPPED)
1922 			bp_mapout(bp);
1923 
1924 		if (bp->b_flags & B_READ)
1925 			pvn_read_done(bp->b_pages, bp->b_flags);
1926 		else
1927 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1928 		pageio_done(bp);
1929 	} else {
1930 		ASSERT(bp->b_flags & B_REMAPPED);
1931 		bp_mapout(bp);
1932 		brelse(bp);
1933 	}
1934 }
1935 
1936 /*
1937  * bioerror(9F) - indicate error in buffer header
1938  * If 'error' is zero, remove the error indication.
1939  */
1940 void
1941 bioerror(struct buf *bp, int error)
1942 {
1943 	ASSERT(bp != NULL);
1944 	ASSERT(error >= 0);
1945 	ASSERT(SEMA_HELD(&bp->b_sem));
1946 
1947 	if (error != 0) {
1948 		bp->b_flags |= B_ERROR;
1949 	} else {
1950 		bp->b_flags &= ~B_ERROR;
1951 	}
1952 	bp->b_error = error;
1953 }
1954 
1955 /*
1956  * bioreset(9F) - reuse a private buffer header after I/O is complete
1957  */
1958 void
1959 bioreset(struct buf *bp)
1960 {
1961 	ASSERT(bp != NULL);
1962 
1963 	biofini(bp);
1964 	bioinit(bp);
1965 }
1966 
1967 /*
1968  * biosize(9F) - return size of a buffer header
1969  */
1970 size_t
1971 biosize(void)
1972 {
1973 	return (sizeof (struct buf));
1974 }
1975 
1976 /*
1977  * biomodified(9F) - check if buffer is modified
1978  */
1979 int
1980 biomodified(struct buf *bp)
1981 {
1982 	int npf;
1983 	int ppattr;
1984 	struct page *pp;
1985 
1986 	ASSERT(bp != NULL);
1987 
1988 	if ((bp->b_flags & B_PAGEIO) == 0) {
1989 		return (-1);
1990 	}
1991 	pp = bp->b_pages;
1992 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1993 
1994 	while (npf > 0) {
1995 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1996 		    HAT_SYNC_STOPON_MOD);
1997 		if (ppattr & P_MOD)
1998 			return (1);
1999 		pp = pp->p_next;
2000 		npf--;
2001 	}
2002 
2003 	return (0);
2004 }
2005 
2006 /*
2007  * bioinit(9F) - initialize a buffer structure
2008  */
2009 void
2010 bioinit(struct buf *bp)
2011 {
2012 	bzero(bp, sizeof (struct buf));
2013 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2014 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2015 	bp->b_offset = -1;
2016 }
2017 
2018 /*
2019  * biofini(9F) - uninitialize a buffer structure
2020  */
2021 void
2022 biofini(struct buf *bp)
2023 {
2024 	sema_destroy(&bp->b_io);
2025 	sema_destroy(&bp->b_sem);
2026 }
2027 
2028 /*
2029  * bioclone(9F) - clone a buffer
2030  */
2031 struct buf *
2032 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2033     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2034 {
2035 	struct buf *bufp;
2036 
2037 	ASSERT(bp);
2038 	if (bp_mem == NULL) {
2039 		bufp = kmem_alloc(sizeof (struct buf), sleep);
2040 		if (bufp == NULL) {
2041 			return (NULL);
2042 		}
2043 		bioinit(bufp);
2044 	} else {
2045 		bufp = bp_mem;
2046 		bioreset(bufp);
2047 	}
2048 
2049 #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2050 	B_ABRWRITE)
2051 
2052 	/*
2053 	 * The cloned buffer does not inherit the B_REMAPPED flag.
2054 	 */
2055 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
2056 	bufp->b_bcount = len;
2057 	bufp->b_blkno = blkno;
2058 	bufp->b_iodone = iodone;
2059 	bufp->b_proc = bp->b_proc;
2060 	bufp->b_edev = dev;
2061 	bufp->b_file = bp->b_file;
2062 	bufp->b_offset = bp->b_offset;
2063 
2064 	if (bp->b_flags & B_SHADOW) {
2065 		ASSERT(bp->b_shadow);
2066 		ASSERT(bp->b_flags & B_PHYS);
2067 
2068 		bufp->b_shadow = bp->b_shadow +
2069 		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2070 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2071 		if (bp->b_flags & B_REMAPPED)
2072 			bufp->b_proc = NULL;
2073 	} else {
2074 		if (bp->b_flags & B_PAGEIO) {
2075 			struct page *pp;
2076 			off_t o;
2077 			int i;
2078 
2079 			pp = bp->b_pages;
2080 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2081 			for (i = btop(o); i > 0; i--) {
2082 				pp = pp->p_next;
2083 			}
2084 			bufp->b_pages = pp;
2085 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2086 		} else {
2087 			bufp->b_un.b_addr =
2088 			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2089 			if (bp->b_flags & B_REMAPPED)
2090 				bufp->b_proc = NULL;
2091 		}
2092 	}
2093 	return (bufp);
2094 }
2095