xref: /illumos-gate/usr/src/uts/common/os/bio.c (revision 3ba944265c4ae1fcf23ef758537c2e4f4feec16e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2011 Joyent, Inc.  All rights reserved.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #include <sys/types.h>
41 #include <sys/t_lock.h>
42 #include <sys/sysmacros.h>
43 #include <sys/conf.h>
44 #include <sys/cpuvar.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/buf.h>
48 #include <sys/var.h>
49 #include <sys/vnode.h>
50 #include <sys/bitmap.h>
51 #include <sys/cmn_err.h>
52 #include <sys/kmem.h>
53 #include <sys/vmem.h>
54 #include <sys/atomic.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/page.h>
57 #include <vm/pvn.h>
58 #include <sys/vtrace.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/fs/ufs_inode.h>
61 #include <sys/fs/ufs_bio.h>
62 #include <sys/fs/ufs_log.h>
63 #include <sys/systm.h>
64 #include <sys/vfs.h>
65 #include <sys/sdt.h>
66 
67 /* Locks */
68 static	kmutex_t	blist_lock;	/* protects b_list */
69 static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
70 static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */
71 
72 struct hbuf	*hbuf;			/* Hash buckets */
73 struct dwbuf	*dwbuf;			/* Delayed write buckets */
74 static struct buf *bhdrlist;		/* buf header free list */
75 static int 	nbuf;			/* number of buffer headers allocated */
76 
77 static int	lastindex;		/* Reference point on where to start */
78 					/* when looking for free buffers */
79 
80 #define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
81 #define	EMPTY_LIST	((struct buf *)-1)
82 
83 static kcondvar_t	bio_mem_cv; 	/* Condition variables */
84 static kcondvar_t	bio_flushinval_cv;
85 static int	bio_doingflush;		/* flush in progress */
86 static int	bio_doinginval;		/* inval in progress */
87 static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
88 
89 /*
90  * Statistics on the buffer cache
91  */
92 struct biostats biostats = {
93 	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
94 	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
95 	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
96 	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
97 	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
98 	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
99 };
100 
101 /*
102  * kstat data
103  */
104 kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
105 uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
106 					sizeof (kstat_named_t));
107 
108 /*
109  * Statistics on ufs buffer cache
110  * Not protected by locks
111  */
112 struct ufsbiostats ub = {
113 	{ "breads",			KSTAT_DATA_UINT32 },
114 	{ "bwrites",			KSTAT_DATA_UINT32 },
115 	{ "fbiwrites",			KSTAT_DATA_UINT32 },
116 	{ "getpages",			KSTAT_DATA_UINT32 },
117 	{ "getras",			KSTAT_DATA_UINT32 },
118 	{ "putsyncs",			KSTAT_DATA_UINT32 },
119 	{ "putasyncs",			KSTAT_DATA_UINT32 },
120 	{ "putpageios",			KSTAT_DATA_UINT32 },
121 };
122 
123 /*
124  * more UFS Logging eccentricities...
125  *
126  * required since "#pragma weak ..." doesn't work in reverse order.
127  * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
128  *        to ufs routines don't get plugged into bio.c calls so
129  *        we initialize it when setting up the "lufsops" table
130  *        in "lufs.c:_init()"
131  */
132 void (*bio_lufs_strategy)(void *, buf_t *);
133 void (*bio_snapshot_strategy)(void *, buf_t *);
134 
135 
/* Forward declarations of file-private helpers */
static struct buf	*bio_bhdr_alloc(void);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_getfreeblk(long);
static int		bio_incore(dev_t, daddr_t);
static void		bio_mem_get(long);
static void		bio_pageio_done(struct buf *);
static void		bio_recycle(int, long);
144 
/*
 * Buffer cache sizing constants.
 *
 * The two *_PERCENT values are divisors applied to physmem (see binit()),
 * not percentages: dividing by 100/2 yields 2%, dividing by 100/20
 * yields 20%.
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* ceiling: 20% of real memory */
#define	BIO_BHDR_POOL	100		/* default bhdr pool size */
#define	BIO_MIN_HDR	10		/* fewest buffer headers allowed */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* target hash chain length */

/* What bio_recycle() is asked to reclaim */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

/* User tunables for the memory high water mark */
extern	int bufhwm;		/* absolute value */
extern	int bufhwm_pct;		/* the same, as a % of physmem */
162 
/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it. If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io, is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */
192 
193 /*
194  * Read in (if necessary) the block and return a buffer pointer.
195  *
196  * This interface is provided for binary compatibility.  Using
197  * BREAD() directly avoids the extra function call overhead invoked
198  * by calling this routine.
199  */
200 struct buf *
201 bread(dev_t dev, daddr_t blkno, long bsize)
202 {
203 	return (BREAD(dev, blkno, bsize));
204 }
205 
206 /*
207  * Common code for reading a buffer with various options
208  *
209  * Read in (if necessary) the block and return a buffer pointer.
210  */
211 struct buf *
212 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
213 {
214 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
215 	struct buf *bp;
216 	klwp_t *lwp = ttolwp(curthread);
217 
218 	CPU_STATS_ADD_K(sys, lread, 1);
219 	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
220 	if (bp->b_flags & B_DONE)
221 		return (bp);
222 	bp->b_flags |= B_READ;
223 	ASSERT(bp->b_bcount == bsize);
224 	if (ufsvfsp == NULL) {					/* !ufs */
225 		(void) bdev_strategy(bp);
226 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
227 							/* ufs && logging */
228 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
229 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
230 							/* ufs && snapshots */
231 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
232 	} else {
233 		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
234 		ub.ub_breads.value.ul++;		/* ufs && !logging */
235 		(void) bdev_strategy(bp);
236 	}
237 	if (lwp != NULL)
238 		lwp->lwp_ru.inblock++;
239 	CPU_STATS_ADD_K(sys, bread, 1);
240 	(void) biowait(bp);
241 	return (bp);
242 }
243 
244 /*
245  * Read in the block, like bread, but also start I/O on the
246  * read-ahead block (which is not allocated to the caller).
247  */
248 struct buf *
249 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
250 {
251 	struct buf *bp, *rabp;
252 	klwp_t *lwp = ttolwp(curthread);
253 
254 	bp = NULL;
255 	if (!bio_incore(dev, blkno)) {
256 		CPU_STATS_ADD_K(sys, lread, 1);
257 		bp = GETBLK(dev, blkno, bsize);
258 		if ((bp->b_flags & B_DONE) == 0) {
259 			bp->b_flags |= B_READ;
260 			bp->b_bcount = bsize;
261 			(void) bdev_strategy(bp);
262 			if (lwp != NULL)
263 				lwp->lwp_ru.inblock++;
264 			CPU_STATS_ADD_K(sys, bread, 1);
265 		}
266 	}
267 	if (rablkno && bfreelist.b_bcount > 1 &&
268 	    !bio_incore(dev, rablkno)) {
269 		rabp = GETBLK(dev, rablkno, bsize);
270 		if (rabp->b_flags & B_DONE)
271 			brelse(rabp);
272 		else {
273 			rabp->b_flags |= B_READ|B_ASYNC;
274 			rabp->b_bcount = bsize;
275 			(void) bdev_strategy(rabp);
276 			if (lwp != NULL)
277 				lwp->lwp_ru.inblock++;
278 			CPU_STATS_ADD_K(sys, bread, 1);
279 		}
280 	}
281 	if (bp == NULL)
282 		return (BREAD(dev, blkno, bsize));
283 	(void) biowait(bp);
284 	return (bp);
285 }
286 
287 /*
288  * Common code for writing a buffer with various options.
289  *
290  * force_wait  - wait for write completion regardless of B_ASYNC flag
291  * do_relse    - release the buffer when we are done
292  * clear_flags - flags to clear from the buffer
293  */
294 void
295 bwrite_common(void *arg, struct buf *bp, int force_wait,
296 				int do_relse, int clear_flags)
297 {
298 	register int do_wait;
299 	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
300 	int flag;
301 	klwp_t *lwp = ttolwp(curthread);
302 	struct cpu *cpup;
303 
304 	ASSERT(SEMA_HELD(&bp->b_sem));
305 	flag = bp->b_flags;
306 	bp->b_flags &= ~clear_flags;
307 	if (lwp != NULL)
308 		lwp->lwp_ru.oublock++;
309 	CPU_STATS_ENTER_K();
310 	cpup = CPU;		/* get pointer AFTER preemption is disabled */
311 	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
312 	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
313 	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
314 	if (do_wait == 0)
315 		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
316 	CPU_STATS_EXIT_K();
317 	if (ufsvfsp == NULL) {
318 		(void) bdev_strategy(bp);
319 	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
320 							/* ufs && logging */
321 		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
322 	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
323 							/* ufs && snapshots */
324 		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
325 	} else {
326 		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
327 		(void) bdev_strategy(bp);
328 	}
329 	if (do_wait) {
330 		(void) biowait(bp);
331 		if (do_relse) {
332 			brelse(bp);
333 		}
334 	}
335 }
336 
/*
 * Binary-compatibility entry point for BWRITE(): write the buffer out,
 * waiting for the write to finish unless B_ASYNC is set, and then
 * release it.
 *
 * New code can use the BWRITE() macro directly and skip the extra
 * function call that going through this wrapper costs.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
349 
/*
 * Binary-compatibility entry point for BWRITE2(): write the buffer out
 * and wait for the write to finish, but leave the buffer held by the
 * caller afterwards.
 *
 * New code can use the BWRITE2() macro directly and skip the extra
 * function call.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
361 
362 /*
363  * Release the buffer, marking it so that if it is grabbed
364  * for another purpose it will be written out before being
365  * given up (e.g. when writing a partial block where it is
366  * assumed that another write for the same block will soon follow).
367  * Also save the time that the block is first marked as delayed
368  * so that it will be written in a reasonable time.
369  */
370 void
371 bdwrite(struct buf *bp)
372 {
373 	ASSERT(SEMA_HELD(&bp->b_sem));
374 	CPU_STATS_ADD_K(sys, lwrite, 1);
375 	if ((bp->b_flags & B_DELWRI) == 0)
376 		bp->b_start = ddi_get_lbolt();
377 	/*
378 	 * B_DONE allows others to use the buffer, B_DELWRI causes the
379 	 * buffer to be written before being reused, and setting b_resid
380 	 * to zero says the buffer is complete.
381 	 */
382 	bp->b_flags |= B_DELWRI | B_DONE;
383 	bp->b_resid = 0;
384 	brelse(bp);
385 }
386 
387 /*
388  * Release the buffer, start I/O on it, but don't wait for completion.
389  */
390 void
391 bawrite(struct buf *bp)
392 {
393 	ASSERT(SEMA_HELD(&bp->b_sem));
394 
395 	/* Use bfreelist.b_bcount as a weird-ass heuristic */
396 	if (bfreelist.b_bcount > 4)
397 		bp->b_flags |= B_ASYNC;
398 	BWRITE(bp);
399 }
400 
401 /*
402  * Release the buffer, with no I/O implied.
403  */
404 void
405 brelse(struct buf *bp)
406 {
407 	struct buf	**backp;
408 	uint_t		index;
409 	kmutex_t	*hmp;
410 	struct	buf	*dp;
411 	struct	hbuf	*hp;
412 
413 
414 	ASSERT(SEMA_HELD(&bp->b_sem));
415 
416 	/*
417 	 * Clear the retry write flag if the buffer was written without
418 	 * error.  The presence of B_DELWRI means the buffer has not yet
419 	 * been written and the presence of B_ERROR means that an error
420 	 * is still occurring.
421 	 */
422 	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
423 		bp->b_flags &= ~B_RETRYWRI;
424 	}
425 
426 	/* Check for anomalous conditions */
427 	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
428 		if (bp->b_flags & B_NOCACHE) {
429 			/* Don't add to the freelist. Destroy it now */
430 			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
431 			sema_destroy(&bp->b_sem);
432 			sema_destroy(&bp->b_io);
433 			kmem_free(bp, sizeof (struct buf));
434 			return;
435 		}
436 		/*
437 		 * If a write failed and we are supposed to retry write,
438 		 * don't toss the buffer.  Keep it around and mark it
439 		 * delayed write in the hopes that it will eventually
440 		 * get flushed (and still keep the system running.)
441 		 */
442 		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
443 			bp->b_flags |= B_DELWRI;
444 			/* keep fsflush from trying continuously to flush */
445 			bp->b_start = ddi_get_lbolt();
446 		} else
447 			bp->b_flags |= B_AGE|B_STALE;
448 		bp->b_flags &= ~B_ERROR;
449 		bp->b_error = 0;
450 	}
451 
452 	/*
453 	 * If delayed write is set then put in on the delayed
454 	 * write list instead of the free buffer list.
455 	 */
456 	index = bio_bhash(bp->b_edev, bp->b_blkno);
457 	hmp   = &hbuf[index].b_lock;
458 
459 	mutex_enter(hmp);
460 	hp = &hbuf[index];
461 	dp = (struct buf *)hp;
462 
463 	/*
464 	 * Make sure that the number of entries on this list are
465 	 * Zero <= count <= total # buffers
466 	 */
467 	ASSERT(hp->b_length >= 0);
468 	ASSERT(hp->b_length < nbuf);
469 
470 	hp->b_length++;		/* We are adding this buffer */
471 
472 	if (bp->b_flags & B_DELWRI) {
473 		/*
474 		 * This buffer goes on the delayed write buffer list
475 		 */
476 		dp = (struct buf *)&dwbuf[index];
477 	}
478 	ASSERT(bp->b_bufsize > 0);
479 	ASSERT(bp->b_bcount > 0);
480 	ASSERT(bp->b_un.b_addr != NULL);
481 
482 	if (bp->b_flags & B_AGE) {
483 		backp = &dp->av_forw;
484 		(*backp)->av_back = bp;
485 		bp->av_forw = *backp;
486 		*backp = bp;
487 		bp->av_back = dp;
488 	} else {
489 		backp = &dp->av_back;
490 		(*backp)->av_forw = bp;
491 		bp->av_back = *backp;
492 		*backp = bp;
493 		bp->av_forw = dp;
494 	}
495 	mutex_exit(hmp);
496 
497 	if (bfreelist.b_flags & B_WANTED) {
498 		/*
499 		 * Should come here very very rarely.
500 		 */
501 		mutex_enter(&bfree_lock);
502 		if (bfreelist.b_flags & B_WANTED) {
503 			bfreelist.b_flags &= ~B_WANTED;
504 			cv_broadcast(&bio_mem_cv);
505 		}
506 		mutex_exit(&bfree_lock);
507 	}
508 
509 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
510 	/*
511 	 * Don't let anyone get the buffer off the freelist before we
512 	 * release our hold on it.
513 	 */
514 	sema_v(&bp->b_sem);
515 }
516 
517 /*
518  * Return a count of the number of B_BUSY buffers in the system
519  * Can only be used as a good estimate.  If 'cleanit' is set,
520  * try to flush all bufs.
521  */
522 int
523 bio_busy(int cleanit)
524 {
525 	struct buf *bp, *dp;
526 	int busy = 0;
527 	int i;
528 	kmutex_t *hmp;
529 
530 	for (i = 0; i < v.v_hbuf; i++) {
531 		vfs_syncprogress();
532 		dp = (struct buf *)&hbuf[i];
533 		hmp = &hbuf[i].b_lock;
534 
535 		mutex_enter(hmp);
536 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
537 			if (bp->b_flags & B_BUSY)
538 				busy++;
539 		}
540 		mutex_exit(hmp);
541 	}
542 
543 	if (cleanit && busy != 0) {
544 		bflush(NODEV);
545 	}
546 
547 	return (busy);
548 }
549 
550 /*
551  * this interface is provided for binary compatibility.
552  *
553  * Assign a buffer for the given block.  If the appropriate
554  * block is already associated, return it; otherwise search
555  * for the oldest non-busy buffer and reassign it.
556  */
557 struct buf *
558 getblk(dev_t dev, daddr_t blkno, long bsize)
559 {
560 	return (getblk_common(/* ufsvfsp */ NULL, dev,
561 	    blkno, bsize, /* errflg */ 0));
562 }
563 
564 /*
565  * Assign a buffer for the given block.  If the appropriate
566  * block is already associated, return it; otherwise search
567  * for the oldest non-busy buffer and reassign it.
568  */
569 struct buf *
570 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
571 {
572 	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
573 	struct buf *bp;
574 	struct buf *dp;
575 	struct buf *nbp = NULL;
576 	struct buf *errbp;
577 	uint_t		index;
578 	kmutex_t	*hmp;
579 	struct	hbuf	*hp;
580 
581 	if (getmajor(dev) >= devcnt)
582 		cmn_err(CE_PANIC, "blkdev");
583 
584 	biostats.bio_lookup.value.ui32++;
585 
586 	index = bio_bhash(dev, blkno);
587 	hp    = &hbuf[index];
588 	dp    = (struct buf *)hp;
589 	hmp   = &hp->b_lock;
590 
591 	mutex_enter(hmp);
592 loop:
593 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
594 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
595 		    (bp->b_flags & B_STALE))
596 			continue;
597 		/*
598 		 * Avoid holding the hash lock in the event that
599 		 * the buffer is locked by someone. Since the hash chain
600 		 * may change when we drop the hash lock
601 		 * we have to start at the beginning of the chain if the
602 		 * buffer identity/contents aren't valid.
603 		 */
604 		if (!sema_tryp(&bp->b_sem)) {
605 			biostats.bio_bufbusy.value.ui32++;
606 			mutex_exit(hmp);
607 			/*
608 			 * OK, we are dealing with a busy buffer.
609 			 * In the case that we are panicking and we
610 			 * got called from bread(), we have some chance
611 			 * for error recovery. So better bail out from
612 			 * here since sema_p() won't block. If we got
613 			 * called directly from ufs routines, there is
614 			 * no way to report an error yet.
615 			 */
616 			if (panicstr && errflg)
617 				goto errout;
618 			/*
619 			 * For the following line of code to work
620 			 * correctly never kmem_free the buffer "header".
621 			 */
622 			sema_p(&bp->b_sem);
623 			if (bp->b_blkno != blkno || bp->b_edev != dev ||
624 			    (bp->b_flags & B_STALE)) {
625 				sema_v(&bp->b_sem);
626 				mutex_enter(hmp);
627 				goto loop;	/* start over */
628 			}
629 			mutex_enter(hmp);
630 		}
631 		/* Found */
632 		biostats.bio_hit.value.ui32++;
633 		bp->b_flags &= ~B_AGE;
634 
635 		/*
636 		 * Yank it off the free/delayed write lists
637 		 */
638 		hp->b_length--;
639 		notavail(bp);
640 		mutex_exit(hmp);
641 
642 		ASSERT((bp->b_flags & B_NOCACHE) == NULL);
643 
644 		if (nbp == NULL) {
645 			/*
646 			 * Make the common path short.
647 			 */
648 			ASSERT(SEMA_HELD(&bp->b_sem));
649 			return (bp);
650 		}
651 
652 		biostats.bio_bufdup.value.ui32++;
653 
654 		/*
655 		 * The buffer must have entered during the lock upgrade
656 		 * so free the new buffer we allocated and return the
657 		 * found buffer.
658 		 */
659 		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
660 		nbp->b_un.b_addr = NULL;
661 
662 		/*
663 		 * Account for the memory
664 		 */
665 		mutex_enter(&bfree_lock);
666 		bfreelist.b_bufsize += nbp->b_bufsize;
667 		mutex_exit(&bfree_lock);
668 
669 		/*
670 		 * Destroy buf identity, and place on avail list
671 		 */
672 		nbp->b_dev = (o_dev_t)NODEV;
673 		nbp->b_edev = NODEV;
674 		nbp->b_flags = 0;
675 		nbp->b_file = NULL;
676 		nbp->b_offset = -1;
677 
678 		sema_v(&nbp->b_sem);
679 		bio_bhdr_free(nbp);
680 
681 		ASSERT(SEMA_HELD(&bp->b_sem));
682 		return (bp);
683 	}
684 
685 	/*
686 	 * bio_getfreeblk may block so check the hash chain again.
687 	 */
688 	if (nbp == NULL) {
689 		mutex_exit(hmp);
690 		nbp = bio_getfreeblk(bsize);
691 		mutex_enter(hmp);
692 		goto loop;
693 	}
694 
695 	/*
696 	 * New buffer. Assign nbp and stick it on the hash.
697 	 */
698 	nbp->b_flags = B_BUSY;
699 	nbp->b_edev = dev;
700 	nbp->b_dev = (o_dev_t)cmpdev(dev);
701 	nbp->b_blkno = blkno;
702 	nbp->b_iodone = NULL;
703 	nbp->b_bcount = bsize;
704 	/*
705 	 * If we are given a ufsvfsp and the vfs_root field is NULL
706 	 * then this must be I/O for a superblock.  A superblock's
707 	 * buffer is set up in mountfs() and there is no root vnode
708 	 * at that point.
709 	 */
710 	if (ufsvfsp && ufsvfsp->vfs_root) {
711 		nbp->b_vp = ufsvfsp->vfs_root;
712 	} else {
713 		nbp->b_vp = NULL;
714 	}
715 
716 	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
717 
718 	binshash(nbp, dp);
719 	mutex_exit(hmp);
720 
721 	ASSERT(SEMA_HELD(&nbp->b_sem));
722 
723 	return (nbp);
724 
725 
726 	/*
727 	 * Come here in case of an internal error. At this point we couldn't
728 	 * get a buffer, but he have to return one. Hence we allocate some
729 	 * kind of error reply buffer on the fly. This buffer is marked as
730 	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
731 	 *	- B_ERROR will indicate error to the caller.
732 	 *	- B_DONE will prevent us from reading the buffer from
733 	 *	  the device.
734 	 *	- B_NOCACHE will cause that this buffer gets free'd in
735 	 *	  brelse().
736 	 */
737 
738 errout:
739 	errbp = geteblk();
740 	sema_p(&errbp->b_sem);
741 	errbp->b_flags &= ~B_BUSY;
742 	errbp->b_flags |= (B_ERROR | B_DONE);
743 	return (errbp);
744 }
745 
746 /*
747  * Get an empty block, not assigned to any particular device.
748  * Returns a locked buffer that is not on any hash or free list.
749  */
750 struct buf *
751 ngeteblk(long bsize)
752 {
753 	struct buf *bp;
754 
755 	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
756 	bioinit(bp);
757 	bp->av_forw = bp->av_back = NULL;
758 	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
759 	bp->b_bufsize = bsize;
760 	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
761 	bp->b_dev = (o_dev_t)NODEV;
762 	bp->b_edev = NODEV;
763 	bp->b_lblkno = 0;
764 	bp->b_bcount = bsize;
765 	bp->b_iodone = NULL;
766 	return (bp);
767 }
768 
/*
 * Get an empty 1 KB buffer.  The interface is left as it always was for
 * the sake of driver compatibility; callers that need some other block
 * size should use ngeteblk().
 */
struct buf *
geteblk(void)
{
	const long kilobyte = 1024;

	return (ngeteblk(kilobyte));
}
778 
779 /*
780  * Return a buffer w/o sleeping
781  */
782 struct buf *
783 trygetblk(dev_t dev, daddr_t blkno)
784 {
785 	struct buf	*bp;
786 	struct buf	*dp;
787 	struct hbuf	*hp;
788 	kmutex_t	*hmp;
789 	uint_t		index;
790 
791 	index = bio_bhash(dev, blkno);
792 	hp = &hbuf[index];
793 	hmp = &hp->b_lock;
794 
795 	if (!mutex_tryenter(hmp))
796 		return (NULL);
797 
798 	dp = (struct buf *)hp;
799 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
800 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
801 		    (bp->b_flags & B_STALE))
802 			continue;
803 		/*
804 		 * Get access to a valid buffer without sleeping
805 		 */
806 		if (sema_tryp(&bp->b_sem)) {
807 			if (bp->b_flags & B_DONE) {
808 				hp->b_length--;
809 				notavail(bp);
810 				mutex_exit(hmp);
811 				return (bp);
812 			} else {
813 				sema_v(&bp->b_sem);
814 				break;
815 			}
816 		}
817 		break;
818 	}
819 	mutex_exit(hmp);
820 	return (NULL);
821 }
822 
823 /*
824  * Wait for I/O completion on the buffer; return errors
825  * to the user.
826  */
827 int
828 iowait(struct buf *bp)
829 {
830 	ASSERT(SEMA_HELD(&bp->b_sem));
831 	return (biowait(bp));
832 }
833 
834 /*
835  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
836  * and wake up anyone waiting for it.
837  */
838 void
839 iodone(struct buf *bp)
840 {
841 	ASSERT(SEMA_HELD(&bp->b_sem));
842 	(void) biodone(bp);
843 }
844 
845 /*
846  * Zero the core associated with a buffer.
847  */
848 void
849 clrbuf(struct buf *bp)
850 {
851 	ASSERT(SEMA_HELD(&bp->b_sem));
852 	bzero(bp->b_un.b_addr, bp->b_bcount);
853 	bp->b_resid = 0;
854 }
855 
856 
857 /*
858  * Make sure all write-behind blocks on dev (or NODEV for all)
859  * are flushed out.
860  */
861 void
862 bflush(dev_t dev)
863 {
864 	struct buf *bp, *dp;
865 	struct hbuf *hp;
866 	struct buf *delwri_list = EMPTY_LIST;
867 	int i, index;
868 	kmutex_t *hmp;
869 
870 	mutex_enter(&blist_lock);
871 	/*
872 	 * Wait for any invalidates or flushes ahead of us to finish.
873 	 * We really could split blist_lock up per device for better
874 	 * parallelism here.
875 	 */
876 	while (bio_doinginval || bio_doingflush) {
877 		bio_flinv_cv_wanted = 1;
878 		cv_wait(&bio_flushinval_cv, &blist_lock);
879 	}
880 	bio_doingflush++;
881 	/*
882 	 * Gather all B_DELWRI buffer for device.
883 	 * Lock ordering is b_sem > hash lock (brelse).
884 	 * Since we are finding the buffer via the delayed write list,
885 	 * it may be busy and we would block trying to get the
886 	 * b_sem lock while holding hash lock. So transfer all the
887 	 * candidates on the delwri_list and then drop the hash locks.
888 	 */
889 	for (i = 0; i < v.v_hbuf; i++) {
890 		vfs_syncprogress();
891 		hmp = &hbuf[i].b_lock;
892 		dp = (struct buf *)&dwbuf[i];
893 		mutex_enter(hmp);
894 		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
895 			if (dev == NODEV || bp->b_edev == dev) {
896 				if (bp->b_list == NULL) {
897 					bp->b_list = delwri_list;
898 					delwri_list = bp;
899 				}
900 			}
901 		}
902 		mutex_exit(hmp);
903 	}
904 	mutex_exit(&blist_lock);
905 
906 	/*
907 	 * Now that the hash locks have been dropped grab the semaphores
908 	 * and write back all the buffers that have B_DELWRI set.
909 	 */
910 	while (delwri_list != EMPTY_LIST) {
911 		vfs_syncprogress();
912 		bp = delwri_list;
913 
914 		sema_p(&bp->b_sem);	/* may block */
915 		if ((dev != bp->b_edev && dev != NODEV) ||
916 		    (panicstr && bp->b_flags & B_BUSY)) {
917 			sema_v(&bp->b_sem);
918 			delwri_list = bp->b_list;
919 			bp->b_list = NULL;
920 			continue;	/* No longer a candidate */
921 		}
922 		if (bp->b_flags & B_DELWRI) {
923 			index = bio_bhash(bp->b_edev, bp->b_blkno);
924 			hp = &hbuf[index];
925 			hmp = &hp->b_lock;
926 			dp = (struct buf *)hp;
927 
928 			bp->b_flags |= B_ASYNC;
929 			mutex_enter(hmp);
930 			hp->b_length--;
931 			notavail(bp);
932 			mutex_exit(hmp);
933 			if (bp->b_vp == NULL) {		/* !ufs */
934 				BWRITE(bp);
935 			} else {			/* ufs */
936 				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
937 			}
938 		} else {
939 			sema_v(&bp->b_sem);
940 		}
941 		delwri_list = bp->b_list;
942 		bp->b_list = NULL;
943 	}
944 	mutex_enter(&blist_lock);
945 	bio_doingflush--;
946 	if (bio_flinv_cv_wanted) {
947 		bio_flinv_cv_wanted = 0;
948 		cv_broadcast(&bio_flushinval_cv);
949 	}
950 	mutex_exit(&blist_lock);
951 }
952 
953 /*
954  * Ensure that a specified block is up-to-date on disk.
955  */
956 void
957 blkflush(dev_t dev, daddr_t blkno)
958 {
959 	struct buf *bp, *dp;
960 	struct hbuf *hp;
961 	struct buf *sbp = NULL;
962 	uint_t index;
963 	kmutex_t *hmp;
964 
965 	index = bio_bhash(dev, blkno);
966 	hp    = &hbuf[index];
967 	dp    = (struct buf *)hp;
968 	hmp   = &hp->b_lock;
969 
970 	/*
971 	 * Identify the buffer in the cache belonging to
972 	 * this device and blkno (if any).
973 	 */
974 	mutex_enter(hmp);
975 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
976 		if (bp->b_blkno != blkno || bp->b_edev != dev ||
977 		    (bp->b_flags & B_STALE))
978 			continue;
979 		sbp = bp;
980 		break;
981 	}
982 	mutex_exit(hmp);
983 	if (sbp == NULL)
984 		return;
985 	/*
986 	 * Now check the buffer we have identified and
987 	 * make sure it still belongs to the device and is B_DELWRI
988 	 */
989 	sema_p(&sbp->b_sem);
990 	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
991 	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
992 		mutex_enter(hmp);
993 		hp->b_length--;
994 		notavail(sbp);
995 		mutex_exit(hmp);
996 		/*
997 		 * XXX - There is nothing to guarantee a synchronous
998 		 * write here if the B_ASYNC flag is set.  This needs
999 		 * some investigation.
1000 		 */
1001 		if (sbp->b_vp == NULL) {		/* !ufs */
1002 			BWRITE(sbp);	/* synchronous write */
1003 		} else {				/* ufs */
1004 			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1005 		}
1006 	} else {
1007 		sema_v(&sbp->b_sem);
1008 	}
1009 }
1010 
1011 /*
1012  * Same as binval, except can force-invalidate delayed-write buffers
1013  * (which are not be already flushed because of device errors).  Also
1014  * makes sure that the retry write flag is cleared.
1015  */
1016 int
1017 bfinval(dev_t dev, int force)
1018 {
1019 	struct buf *dp;
1020 	struct buf *bp;
1021 	struct buf *binval_list = EMPTY_LIST;
1022 	int i, error = 0;
1023 	kmutex_t *hmp;
1024 	uint_t index;
1025 	struct buf **backp;
1026 
1027 	mutex_enter(&blist_lock);
1028 	/*
1029 	 * Wait for any flushes ahead of us to finish, it's ok to
1030 	 * do invalidates in parallel.
1031 	 */
1032 	while (bio_doingflush) {
1033 		bio_flinv_cv_wanted = 1;
1034 		cv_wait(&bio_flushinval_cv, &blist_lock);
1035 	}
1036 	bio_doinginval++;
1037 
1038 	/* Gather bp's */
1039 	for (i = 0; i < v.v_hbuf; i++) {
1040 		dp = (struct buf *)&hbuf[i];
1041 		hmp = &hbuf[i].b_lock;
1042 
1043 		mutex_enter(hmp);
1044 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1045 			if (bp->b_edev == dev) {
1046 				if (bp->b_list == NULL) {
1047 					bp->b_list = binval_list;
1048 					binval_list = bp;
1049 				}
1050 			}
1051 		}
1052 		mutex_exit(hmp);
1053 	}
1054 	mutex_exit(&blist_lock);
1055 
1056 	/* Invalidate all bp's found */
1057 	while (binval_list != EMPTY_LIST) {
1058 		bp = binval_list;
1059 
1060 		sema_p(&bp->b_sem);
1061 		if (bp->b_edev == dev) {
1062 			if (force && (bp->b_flags & B_DELWRI)) {
1063 				/* clear B_DELWRI, move to non-dw freelist */
1064 				index = bio_bhash(bp->b_edev, bp->b_blkno);
1065 				hmp = &hbuf[index].b_lock;
1066 				dp = (struct buf *)&hbuf[index];
1067 				mutex_enter(hmp);
1068 
1069 				/* remove from delayed write freelist */
1070 				notavail(bp);
1071 
1072 				/* add to B_AGE side of non-dw freelist */
1073 				backp = &dp->av_forw;
1074 				(*backp)->av_back = bp;
1075 				bp->av_forw = *backp;
1076 				*backp = bp;
1077 				bp->av_back = dp;
1078 
1079 				/*
1080 				 * make sure write retries and busy are cleared
1081 				 */
1082 				bp->b_flags &=
1083 				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1084 				mutex_exit(hmp);
1085 			}
1086 			if ((bp->b_flags & B_DELWRI) == 0)
1087 				bp->b_flags |= B_STALE|B_AGE;
1088 			else
1089 				error = EIO;
1090 		}
1091 		sema_v(&bp->b_sem);
1092 		binval_list = bp->b_list;
1093 		bp->b_list = NULL;
1094 	}
1095 	mutex_enter(&blist_lock);
1096 	bio_doinginval--;
1097 	if (bio_flinv_cv_wanted) {
1098 		cv_broadcast(&bio_flushinval_cv);
1099 		bio_flinv_cv_wanted = 0;
1100 	}
1101 	mutex_exit(&blist_lock);
1102 	return (error);
1103 }
1104 
1105 /*
1106  * If possible, invalidate blocks for a dev on demand
1107  */
1108 void
1109 binval(dev_t dev)
1110 {
1111 	(void) bfinval(dev, 0);
1112 }
1113 
1114 /*
1115  * Initialize the buffer I/O system by freeing
1116  * all buffers and setting all device hash buffer lists to empty.
1117  */
1118 void
1119 binit(void)
1120 {
1121 	struct buf *bp;
1122 	unsigned int i, pct;
1123 	ulong_t	bio_max_hwm, bio_default_hwm;
1124 
1125 	/*
1126 	 * Maximum/Default values for bufhwm are set to the smallest of:
1127 	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1128 	 *	- 1/4 of kernel virtual memory
1129 	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1130 	 * Additionally, in order to allow simple tuning by percentage of
1131 	 * physical memory, bufhwm_pct is used to calculate the default if
1132 	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1133 	 *
1134 	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1135 	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1136 	 */
1137 	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1138 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1139 	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1140 
1141 	pct = BIO_BUF_PERCENT;
1142 	if (bufhwm_pct != 0 &&
1143 	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1144 		pct = BIO_BUF_PERCENT;
1145 		/*
1146 		 * Invalid user specified value, emit a warning.
1147 		 */
1148 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1149 		    range(1..%d). Using %d as default.",
1150 		    bufhwm_pct,
1151 		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1152 	}
1153 
1154 	bio_default_hwm = MIN(physmem / pct,
1155 	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1156 	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1157 
1158 	if ((v.v_bufhwm = bufhwm) == 0)
1159 		v.v_bufhwm = bio_default_hwm;
1160 
1161 	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1162 		v.v_bufhwm = (int)bio_max_hwm;
1163 		/*
1164 		 * Invalid user specified value, emit a warning.
1165 		 */
1166 		cmn_err(CE_WARN,
1167 		    "binit: bufhwm(%d) out \
1168 		    of range(%d..%lu). Using %lu as default",
1169 		    bufhwm,
1170 		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1171 	}
1172 
1173 	/*
1174 	 * Determine the number of hash buckets. Default is to
1175 	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1176 	 * Round up number to the next power of 2.
1177 	 */
1178 	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1179 	    BIO_HASHLEN);
1180 	v.v_hmask = v.v_hbuf - 1;
1181 	v.v_buf = BIO_BHDR_POOL;
1182 
1183 	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1184 
1185 	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1186 
1187 	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1188 	bp = &bfreelist;
1189 	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1190 
1191 	for (i = 0; i < v.v_hbuf; i++) {
1192 		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1193 		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1194 
1195 		/*
1196 		 * Initialize the delayed write buffer list.
1197 		 */
1198 		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1199 		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1200 	}
1201 }
1202 
1203 /*
1204  * Wait for I/O completion on the buffer; return error code.
1205  * If bp was for synchronous I/O, bp is invalid and associated
1206  * resources are freed on return.
1207  */
1208 int
1209 biowait(struct buf *bp)
1210 {
1211 	int error = 0;
1212 	struct cpu *cpup;
1213 
1214 	ASSERT(SEMA_HELD(&bp->b_sem));
1215 
1216 	cpup = CPU;
1217 	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1218 	DTRACE_IO1(wait__start, struct buf *, bp);
1219 
1220 	/*
1221 	 * In case of panic, busy wait for completion
1222 	 */
1223 	if (panicstr) {
1224 		while ((bp->b_flags & B_DONE) == 0)
1225 			drv_usecwait(10);
1226 	} else
1227 		sema_p(&bp->b_io);
1228 
1229 	DTRACE_IO1(wait__done, struct buf *, bp);
1230 	atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1231 
1232 	error = geterror(bp);
1233 	if ((bp->b_flags & B_ASYNC) == 0) {
1234 		if (bp->b_flags & B_REMAPPED)
1235 			bp_mapout(bp);
1236 	}
1237 	return (error);
1238 }
1239 
/*
 * Fire the "biodone" TNF kernel probe for bp, passing the buffer's
 * device (b_edev), logical block number (b_lblkno) and the buf pointer
 * itself.  This lives in its own function so that biodone() can call it
 * instead of carrying the probe code inline.
 */
static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device,		device,		bp->b_edev,
	    tnf_diskaddr,	block,		bp->b_lblkno,
	    tnf_opaque,		buf,		bp);
}
1249 
1250 /*
1251  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1252  * and wake up anyone waiting for it.
1253  */
1254 void
1255 biodone(struct buf *bp)
1256 {
1257 	if (bp->b_flags & B_STARTED) {
1258 		DTRACE_IO1(done, struct buf *, bp);
1259 		bp->b_flags &= ~B_STARTED;
1260 	}
1261 
1262 	/*
1263 	 * Call the TNF probe here instead of the inline code
1264 	 * to force our compiler to use the tail call optimization.
1265 	 */
1266 	biodone_tnf_probe(bp);
1267 
1268 	if (bp->b_iodone != NULL) {
1269 		(*(bp->b_iodone))(bp);
1270 		return;
1271 	}
1272 	ASSERT((bp->b_flags & B_DONE) == 0);
1273 	ASSERT(SEMA_HELD(&bp->b_sem));
1274 	bp->b_flags |= B_DONE;
1275 	if (bp->b_flags & B_ASYNC) {
1276 		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1277 			bio_pageio_done(bp);
1278 		else
1279 			brelse(bp);	/* release bp to freelist */
1280 	} else {
1281 		sema_v(&bp->b_io);
1282 	}
1283 }
1284 
1285 /*
1286  * Pick up the device's error number and pass it to the user;
1287  * if there is an error but the number is 0 set a generalized code.
1288  */
1289 int
1290 geterror(struct buf *bp)
1291 {
1292 	int error = 0;
1293 
1294 	ASSERT(SEMA_HELD(&bp->b_sem));
1295 	if (bp->b_flags & B_ERROR) {
1296 		error = bp->b_error;
1297 		if (!error)
1298 			error = EIO;
1299 	}
1300 	return (error);
1301 }
1302 
1303 /*
1304  * Support for pageio buffers.
1305  *
1306  * This stuff should be generalized to provide a generalized bp
1307  * header facility that can be used for things other than pageio.
1308  */
1309 
/*
 * Allocate and initialize a buf struct for use with pageio.
 *
 *	pp	page list for the transfer; stored in b_pages
 *	len	transfer length in bytes; stored in b_bcount and b_bufsize
 *	vp	vnode the I/O is for; a hold is taken and it is stored in b_vp
 *	flags	OR-ed into b_flags together with B_PAGEIO|B_NOCACHE|B_BUSY
 *
 * When B_READ is set in flags, the page-in statistics (per-CPU and
 * per-zone) are updated first.  The buf is obtained with KM_SLEEP and
 * is returned with both b_io and b_sem initialized to a count of 0.
 *
 * NOTE(review): pp is dereferenced in the TNF probe arguments below
 * while only the statistics block checks pp != NULL -- confirm whether
 * callers may pass a NULL pp with B_READ.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));

		atomic_add_64(&curzone->zone_pgpgin, btopr(len));

		/* A read that is not B_ASYNC is counted as a major fault. */
		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque,		vnode,		pp->p_vnode,
			    tnf_offset,		offset,		pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 * (classified as anonymous, executable or filesystem pages).
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
				atomic_add_64(&curzone->zone_anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque,	vnode,	pp->p_vnode,
		    tnf_offset,	offset,	pp->p_offset,
		    tnf_size,	size,	len);
	}

	/* Zero-filled header; only the fields set below are non-zero. */
	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	/* The hold taken here is dropped by pageio_done(). */
	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
1391 
1392 void
1393 pageio_done(struct buf *bp)
1394 {
1395 	ASSERT(SEMA_HELD(&bp->b_sem));
1396 	if (bp->b_flags & B_REMAPPED)
1397 		bp_mapout(bp);
1398 	VN_RELE(bp->b_vp);
1399 	bp->b_vp = NULL;
1400 	ASSERT((bp->b_flags & B_NOCACHE) != 0);
1401 
1402 	/* A sema_v(bp->b_sem) is implied if we are destroying it */
1403 	sema_destroy(&bp->b_sem);
1404 	sema_destroy(&bp->b_io);
1405 	kmem_free(bp, sizeof (struct buf));
1406 }
1407 
1408 /*
1409  * Check to see whether the buffers, except the one pointed by sbp,
1410  * associated with the device are busy.
1411  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1412  */
1413 int
1414 bcheck(dev_t dev, struct buf *sbp)
1415 {
1416 	struct buf	*bp;
1417 	struct buf	*dp;
1418 	int i;
1419 	kmutex_t *hmp;
1420 
1421 	/*
1422 	 * check for busy bufs for this filesystem
1423 	 */
1424 	for (i = 0; i < v.v_hbuf; i++) {
1425 		dp = (struct buf *)&hbuf[i];
1426 		hmp = &hbuf[i].b_lock;
1427 
1428 		mutex_enter(hmp);
1429 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1430 			/*
1431 			 * if buf is busy or dirty, then filesystem is busy
1432 			 */
1433 			if ((bp->b_edev == dev) &&
1434 			    ((bp->b_flags & B_STALE) == 0) &&
1435 			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1436 			    (bp != sbp)) {
1437 				mutex_exit(hmp);
1438 				return (1);
1439 			}
1440 		}
1441 		mutex_exit(hmp);
1442 	}
1443 	return (0);
1444 }
1445 
/*
 * Hash two 32 bit entities.
 *
 * The hash folds x, x >> 8, x >> 16, x >> 24 and then the same four
 * values of y into an accumulator using "hash = hash * 7 + value - 1".
 *
 * The accumulator is kept in an unsigned int so that the multiply/add
 * chain wraps modulo 2^32 instead of relying on signed overflow, which
 * is undefined behavior in C.  The shifts are still performed on the
 * signed arguments, and the final value is converted back to int, so
 * the results match what the signed version produced on two's
 * complement machines.
 */
int
hash2ints(int x, int y)
{
	unsigned int hash;

	hash = (unsigned int)x - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 8)) - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 16)) - 1;
	hash = ((hash * 7) + (unsigned int)(x >> 24)) - 1;
	hash = ((hash * 7) + (unsigned int)y) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 8)) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 16)) - 1;
	hash = ((hash * 7) + (unsigned int)(y >> 24)) - 1;

	return ((int)hash);
}
1465 
1466 
1467 /*
1468  * Return a new buffer struct.
1469  *	Create a new buffer if we haven't gone over our high water
1470  *	mark for memory, otherwise try to get one off the freelist.
1471  *
1472  * Returns a locked buf that has no id and is not on any hash or free
1473  * list.
1474  */
1475 static struct buf *
1476 bio_getfreeblk(long bsize)
1477 {
1478 	struct buf *bp, *dp;
1479 	struct hbuf *hp;
1480 	kmutex_t	*hmp;
1481 	uint_t		start, end;
1482 
1483 	/*
1484 	 * mutex_enter(&bfree_lock);
1485 	 * bfreelist.b_bufsize represents the amount of memory
1486 	 * mutex_exit(&bfree_lock); protect ref to bfreelist
1487 	 * we are allowed to allocate in the cache before we hit our hwm.
1488 	 */
1489 	bio_mem_get(bsize);	/* Account for our memory request */
1490 
1491 again:
1492 	bp = bio_bhdr_alloc();	/* Get a buf hdr */
1493 	sema_p(&bp->b_sem);	/* Should never fail */
1494 
1495 	ASSERT(bp->b_un.b_addr == NULL);
1496 	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1497 	if (bp->b_un.b_addr != NULL) {
1498 		/*
1499 		 * Make the common path short
1500 		 */
1501 		bp->b_bufsize = bsize;
1502 		ASSERT(SEMA_HELD(&bp->b_sem));
1503 		return (bp);
1504 	} else {
1505 		struct buf *save;
1506 
1507 		save = bp;	/* Save bp we allocated */
1508 		start = end = lastindex;
1509 
1510 		biostats.bio_bufwant.value.ui32++;
1511 
1512 		/*
1513 		 * Memory isn't available from the system now. Scan
1514 		 * the hash buckets till enough space is found.
1515 		 */
1516 		do {
1517 			hp = &hbuf[start];
1518 			hmp = &hp->b_lock;
1519 			dp = (struct buf *)hp;
1520 
1521 			mutex_enter(hmp);
1522 			bp = dp->av_forw;
1523 
1524 			while (bp != dp) {
1525 
1526 				ASSERT(bp != NULL);
1527 
1528 				if (!sema_tryp(&bp->b_sem)) {
1529 					bp = bp->av_forw;
1530 					continue;
1531 				}
1532 
1533 				/*
1534 				 * Since we are going down the freelist
1535 				 * associated with this hash bucket the
1536 				 * B_DELWRI flag should not be set.
1537 				 */
1538 				ASSERT(!(bp->b_flags & B_DELWRI));
1539 
1540 				if (bp->b_bufsize == bsize) {
1541 					hp->b_length--;
1542 					notavail(bp);
1543 					bremhash(bp);
1544 					mutex_exit(hmp);
1545 
1546 					/*
1547 					 * Didn't kmem_alloc any more, so don't
1548 					 * count it twice.
1549 					 */
1550 					mutex_enter(&bfree_lock);
1551 					bfreelist.b_bufsize += bsize;
1552 					mutex_exit(&bfree_lock);
1553 
1554 					/*
1555 					 * Update the lastindex value.
1556 					 */
1557 					lastindex = start;
1558 
1559 					/*
1560 					 * Put our saved bp back on the list
1561 					 */
1562 					sema_v(&save->b_sem);
1563 					bio_bhdr_free(save);
1564 					ASSERT(SEMA_HELD(&bp->b_sem));
1565 					return (bp);
1566 				}
1567 				sema_v(&bp->b_sem);
1568 				bp = bp->av_forw;
1569 			}
1570 			mutex_exit(hmp);
1571 			start = ((start + 1) % v.v_hbuf);
1572 		} while (start != end);
1573 
1574 		biostats.bio_bufwait.value.ui32++;
1575 		bp = save;		/* Use original bp */
1576 		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1577 	}
1578 
1579 	bp->b_bufsize = bsize;
1580 	ASSERT(SEMA_HELD(&bp->b_sem));
1581 	return (bp);
1582 }
1583 
1584 /*
1585  * Allocate a buffer header. If none currently available, allocate
1586  * a new pool.
1587  */
1588 static struct buf *
1589 bio_bhdr_alloc(void)
1590 {
1591 	struct buf *dp, *sdp;
1592 	struct buf *bp;
1593 	int i;
1594 
1595 	for (;;) {
1596 		mutex_enter(&bhdr_lock);
1597 		if (bhdrlist != NULL) {
1598 			bp = bhdrlist;
1599 			bhdrlist = bp->av_forw;
1600 			mutex_exit(&bhdr_lock);
1601 			bp->av_forw = NULL;
1602 			return (bp);
1603 		}
1604 		mutex_exit(&bhdr_lock);
1605 
1606 		/*
1607 		 * Need to allocate a new pool. If the system is currently
1608 		 * out of memory, then try freeing things on the freelist.
1609 		 */
1610 		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1611 		if (dp == NULL) {
1612 			/*
1613 			 * System can't give us a pool of headers, try
1614 			 * recycling from the free lists.
1615 			 */
1616 			bio_recycle(BIO_HEADER, 0);
1617 		} else {
1618 			sdp = dp;
1619 			for (i = 0; i < v.v_buf; i++, dp++) {
1620 				/*
1621 				 * The next two lines are needed since NODEV
1622 				 * is -1 and not NULL
1623 				 */
1624 				dp->b_dev = (o_dev_t)NODEV;
1625 				dp->b_edev = NODEV;
1626 				dp->av_forw = dp + 1;
1627 				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1628 				    NULL);
1629 				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1630 				    NULL);
1631 				dp->b_offset = -1;
1632 			}
1633 			mutex_enter(&bhdr_lock);
1634 			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
1635 			bhdrlist = sdp;
1636 			nbuf += v.v_buf;
1637 			bp = bhdrlist;
1638 			bhdrlist = bp->av_forw;
1639 			mutex_exit(&bhdr_lock);
1640 
1641 			bp->av_forw = NULL;
1642 			return (bp);
1643 		}
1644 	}
1645 }
1646 
1647 static  void
1648 bio_bhdr_free(struct buf *bp)
1649 {
1650 	ASSERT(bp->b_back == NULL);
1651 	ASSERT(bp->b_forw == NULL);
1652 	ASSERT(bp->av_back == NULL);
1653 	ASSERT(bp->av_forw == NULL);
1654 	ASSERT(bp->b_un.b_addr == NULL);
1655 	ASSERT(bp->b_dev == (o_dev_t)NODEV);
1656 	ASSERT(bp->b_edev == NODEV);
1657 	ASSERT(bp->b_flags == 0);
1658 
1659 	mutex_enter(&bhdr_lock);
1660 	bp->av_forw = bhdrlist;
1661 	bhdrlist = bp;
1662 	mutex_exit(&bhdr_lock);
1663 }
1664 
1665 /*
1666  * If we haven't gone over the high water mark, it's o.k. to
1667  * allocate more buffer space, otherwise recycle buffers
1668  * from the freelist until enough memory is free for a bsize request.
1669  *
1670  * We account for this memory, even though
1671  * we don't allocate it here.
1672  */
1673 static void
1674 bio_mem_get(long bsize)
1675 {
1676 	mutex_enter(&bfree_lock);
1677 	if (bfreelist.b_bufsize > bsize) {
1678 		bfreelist.b_bufsize -= bsize;
1679 		mutex_exit(&bfree_lock);
1680 		return;
1681 	}
1682 	mutex_exit(&bfree_lock);
1683 	bio_recycle(BIO_MEM, bsize);
1684 }
1685 
1686 /*
1687  * flush a list of delayed write buffers.
1688  * (currently used only by bio_recycle below.)
1689  */
1690 static void
1691 bio_flushlist(struct buf *delwri_list)
1692 {
1693 	struct buf *bp;
1694 
1695 	while (delwri_list != EMPTY_LIST) {
1696 		bp = delwri_list;
1697 		bp->b_flags |= B_AGE | B_ASYNC;
1698 		if (bp->b_vp == NULL) {		/* !ufs */
1699 			BWRITE(bp);
1700 		} else {			/* ufs */
1701 			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1702 		}
1703 		delwri_list = bp->b_list;
1704 		bp->b_list = NULL;
1705 	}
1706 }
1707 
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header		(want == BIO_HEADER)
 *	- we need to free up memory		(want == BIO_MEM, bsize bytes)
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 *
 * The hash buckets are scanned starting at lastindex.  For each bucket
 * the free list is recycled first (data freed, header returned to the
 * header list), then the bucket's delayed write list is gathered and
 * flushed with bio_flushlist().  If a full pass does not satisfy the
 * request, the function waits on bio_mem_cv (timed) and starts over.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			/* Skip buffers whose semaphore is held elsewhere. */
			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			/* Return the data area and credit the memory budget. */
			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			/* Reset to the idle state bio_bhdr_free() asserts. */
			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				/*
				 * Unlocked peek first; the budget is
				 * re-checked under bfree_lock before it is
				 * actually debited.
				 */
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 * bio_doingflush is raised under blist_lock for the
		 * duration of the gather and flush.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			/* Fetch the successor before bp may be unlinked. */
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				/*
				 * Request satisfied: flush what has been
				 * gathered so far, drop the flush count and
				 * wake any waiters before returning.
				 */
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			/* Push onto the private list; b_sem stays held. */
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}
1886 
1887 /*
1888  * See if the block is associated with some buffer
1889  * (mainly to avoid getting hung up on a wait in breada).
1890  */
1891 static int
1892 bio_incore(dev_t dev, daddr_t blkno)
1893 {
1894 	struct buf *bp;
1895 	struct buf *dp;
1896 	uint_t index;
1897 	kmutex_t *hmp;
1898 
1899 	index = bio_bhash(dev, blkno);
1900 	dp = (struct buf *)&hbuf[index];
1901 	hmp = &hbuf[index].b_lock;
1902 
1903 	mutex_enter(hmp);
1904 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1905 		if (bp->b_blkno == blkno && bp->b_edev == dev &&
1906 		    (bp->b_flags & B_STALE) == 0) {
1907 			mutex_exit(hmp);
1908 			return (1);
1909 		}
1910 	}
1911 	mutex_exit(hmp);
1912 	return (0);
1913 }
1914 
1915 static void
1916 bio_pageio_done(struct buf *bp)
1917 {
1918 	if (bp->b_flags & B_PAGEIO) {
1919 
1920 		if (bp->b_flags & B_REMAPPED)
1921 			bp_mapout(bp);
1922 
1923 		if (bp->b_flags & B_READ)
1924 			pvn_read_done(bp->b_pages, bp->b_flags);
1925 		else
1926 			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1927 		pageio_done(bp);
1928 	} else {
1929 		ASSERT(bp->b_flags & B_REMAPPED);
1930 		bp_mapout(bp);
1931 		brelse(bp);
1932 	}
1933 }
1934 
1935 /*
1936  * bioerror(9F) - indicate error in buffer header
1937  * If 'error' is zero, remove the error indication.
1938  */
1939 void
1940 bioerror(struct buf *bp, int error)
1941 {
1942 	ASSERT(bp != NULL);
1943 	ASSERT(error >= 0);
1944 	ASSERT(SEMA_HELD(&bp->b_sem));
1945 
1946 	if (error != 0) {
1947 		bp->b_flags |= B_ERROR;
1948 	} else {
1949 		bp->b_flags &= ~B_ERROR;
1950 	}
1951 	bp->b_error = error;
1952 }
1953 
1954 /*
1955  * bioreset(9F) - reuse a private buffer header after I/O is complete
1956  */
1957 void
1958 bioreset(struct buf *bp)
1959 {
1960 	ASSERT(bp != NULL);
1961 
1962 	biofini(bp);
1963 	bioinit(bp);
1964 }
1965 
1966 /*
1967  * biosize(9F) - return size of a buffer header
1968  */
1969 size_t
1970 biosize(void)
1971 {
1972 	return (sizeof (struct buf));
1973 }
1974 
1975 /*
1976  * biomodified(9F) - check if buffer is modified
1977  */
1978 int
1979 biomodified(struct buf *bp)
1980 {
1981 	int npf;
1982 	int ppattr;
1983 	struct page *pp;
1984 
1985 	ASSERT(bp != NULL);
1986 
1987 	if ((bp->b_flags & B_PAGEIO) == 0) {
1988 		return (-1);
1989 	}
1990 	pp = bp->b_pages;
1991 	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1992 
1993 	while (npf > 0) {
1994 		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1995 		    HAT_SYNC_STOPON_MOD);
1996 		if (ppattr & P_MOD)
1997 			return (1);
1998 		pp = pp->p_next;
1999 		npf--;
2000 	}
2001 
2002 	return (0);
2003 }
2004 
2005 /*
2006  * bioinit(9F) - initialize a buffer structure
2007  */
2008 void
2009 bioinit(struct buf *bp)
2010 {
2011 	bzero(bp, sizeof (struct buf));
2012 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2013 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2014 	bp->b_offset = -1;
2015 }
2016 
2017 /*
2018  * biofini(9F) - uninitialize a buffer structure
2019  */
2020 void
2021 biofini(struct buf *bp)
2022 {
2023 	sema_destroy(&bp->b_io);
2024 	sema_destroy(&bp->b_sem);
2025 }
2026 
2027 /*
2028  * bioclone(9F) - clone a buffer
2029  */
2030 struct buf *
2031 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2032     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2033 {
2034 	struct buf *bufp;
2035 
2036 	ASSERT(bp);
2037 	if (bp_mem == NULL) {
2038 		bufp = kmem_alloc(sizeof (struct buf), sleep);
2039 		if (bufp == NULL) {
2040 			return (NULL);
2041 		}
2042 		bioinit(bufp);
2043 	} else {
2044 		bufp = bp_mem;
2045 		bioreset(bufp);
2046 	}
2047 
2048 #define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2049 	B_ABRWRITE)
2050 
2051 	/*
2052 	 * The cloned buffer does not inherit the B_REMAPPED flag.
2053 	 */
2054 	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS)  | B_BUSY;
2055 	bufp->b_bcount = len;
2056 	bufp->b_blkno = blkno;
2057 	bufp->b_iodone = iodone;
2058 	bufp->b_proc = bp->b_proc;
2059 	bufp->b_edev = dev;
2060 	bufp->b_file = bp->b_file;
2061 	bufp->b_offset = bp->b_offset;
2062 
2063 	if (bp->b_flags & B_SHADOW) {
2064 		ASSERT(bp->b_shadow);
2065 		ASSERT(bp->b_flags & B_PHYS);
2066 
2067 		bufp->b_shadow = bp->b_shadow +
2068 		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2069 		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2070 		if (bp->b_flags & B_REMAPPED)
2071 			bufp->b_proc = NULL;
2072 	} else {
2073 		if (bp->b_flags & B_PAGEIO) {
2074 			struct page *pp;
2075 			off_t o;
2076 			int i;
2077 
2078 			pp = bp->b_pages;
2079 			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2080 			for (i = btop(o); i > 0; i--) {
2081 				pp = pp->p_next;
2082 			}
2083 			bufp->b_pages = pp;
2084 			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2085 		} else {
2086 			bufp->b_un.b_addr =
2087 			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2088 			if (bp->b_flags & B_REMAPPED)
2089 				bufp->b_proc = NULL;
2090 		}
2091 	}
2092 	return (bufp);
2093 }
2094