1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40 #include <sys/types.h>
41 #include <sys/t_lock.h>
42 #include <sys/sysmacros.h>
43 #include <sys/conf.h>
44 #include <sys/cpuvar.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/buf.h>
48 #include <sys/var.h>
49 #include <sys/vnode.h>
50 #include <sys/bitmap.h>
51 #include <sys/cmn_err.h>
52 #include <sys/kmem.h>
53 #include <sys/vmem.h>
54 #include <sys/atomic.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/page.h>
57 #include <vm/pvn.h>
58 #include <sys/vtrace.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/fs/ufs_inode.h>
61 #include <sys/fs/ufs_bio.h>
62 #include <sys/fs/ufs_log.h>
63 #include <sys/systm.h>
64 #include <sys/vfs.h>
65 #include <sys/sdt.h>
66
/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct	hbuf	*hbuf;			/* Hash buckets */
struct	dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Map (dev, block number) to a hash-bucket index in hbuf[]/dwbuf[]. */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/*
 * Sentinel terminating the b_list chains built by bflush()/bfinval();
 * a NULL b_list means "not currently on any such list".
 */
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
 *       to ufs routines don't get plugged into bio.c calls so
 *       we initialize it when setting up the "lufsops" table
 *       in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct	buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct	buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 *
 * The *_PERCENT values are stored as divisors: e.g. physmem / (100/2)
 * yields 2% of physical memory (see their use in binit()).
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem  */
extern	int bufhwm_pct;		/* ditto - given in % of physmem  */
162
163 /*
164 * The following routines allocate and free
165 * buffers with various side effects. In general the
166 * arguments to an allocate routine are a device and
167 * a block number, and the value is a pointer to
168 * to the buffer header; the buffer returned is locked with a
169 * binary semaphore so that no one else can touch it. If the block was
170 * already in core, no I/O need be done; if it is
171 * already locked, the process waits until it becomes free.
172 * The following routines allocate a buffer:
173 * getblk
174 * bread/BREAD
175 * breada
176 * Eventually the buffer must be released, possibly with the
177 * side effect of writing it out, by using one of
178 * bwrite/BWRITE/brwrite
179 * bdwrite/bdrwrite
180 * bawrite
181 * brelse
182 *
183 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
184 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
185 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
186 * B_DONE is still used to denote a buffer with I/O complete on it.
187 *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
191 */
192
193 /*
194 * Read in (if necessary) the block and return a buffer pointer.
195 *
196 * This interface is provided for binary compatibility. Using
197 * BREAD() directly avoids the extra function call overhead invoked
198 * by calling this routine.
199 */
200 struct buf *
bread(dev_t dev,daddr_t blkno,long bsize)201 bread(dev_t dev, daddr_t blkno, long bsize)
202 {
203 return (BREAD(dev, blkno, bsize));
204 }
205
206 /*
207 * Common code for reading a buffer with various options
208 *
209 * Read in (if necessary) the block and return a buffer pointer.
210 */
211 struct buf *
bread_common(void * arg,dev_t dev,daddr_t blkno,long bsize)212 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
213 {
214 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
215 struct buf *bp;
216 klwp_t *lwp = ttolwp(curthread);
217
218 CPU_STATS_ADD_K(sys, lread, 1);
219 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
220 if (bp->b_flags & B_DONE)
221 return (bp);
222 bp->b_flags |= B_READ;
223 ASSERT(bp->b_bcount == bsize);
224 if (ufsvfsp == NULL) { /* !ufs */
225 (void) bdev_strategy(bp);
226 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
227 /* ufs && logging */
228 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
229 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
230 /* ufs && snapshots */
231 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
232 } else {
233 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
234 ub.ub_breads.value.ul++; /* ufs && !logging */
235 (void) bdev_strategy(bp);
236 }
237 if (lwp != NULL)
238 lwp->lwp_ru.inblock++;
239 CPU_STATS_ADD_K(sys, bread, 1);
240 (void) biowait(bp);
241 return (bp);
242 }
243
244 /*
245 * Read in the block, like bread, but also start I/O on the
246 * read-ahead block (which is not allocated to the caller).
247 */
248 struct buf *
breada(dev_t dev,daddr_t blkno,daddr_t rablkno,long bsize)249 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
250 {
251 struct buf *bp, *rabp;
252 klwp_t *lwp = ttolwp(curthread);
253
254 bp = NULL;
255 if (!bio_incore(dev, blkno)) {
256 CPU_STATS_ADD_K(sys, lread, 1);
257 bp = GETBLK(dev, blkno, bsize);
258 if ((bp->b_flags & B_DONE) == 0) {
259 bp->b_flags |= B_READ;
260 bp->b_bcount = bsize;
261 (void) bdev_strategy(bp);
262 if (lwp != NULL)
263 lwp->lwp_ru.inblock++;
264 CPU_STATS_ADD_K(sys, bread, 1);
265 }
266 }
267 if (rablkno && bfreelist.b_bcount > 1 &&
268 !bio_incore(dev, rablkno)) {
269 rabp = GETBLK(dev, rablkno, bsize);
270 if (rabp->b_flags & B_DONE)
271 brelse(rabp);
272 else {
273 rabp->b_flags |= B_READ|B_ASYNC;
274 rabp->b_bcount = bsize;
275 (void) bdev_strategy(rabp);
276 if (lwp != NULL)
277 lwp->lwp_ru.inblock++;
278 CPU_STATS_ADD_K(sys, bread, 1);
279 }
280 }
281 if (bp == NULL)
282 return (BREAD(dev, blkno, bsize));
283 (void) biowait(bp);
284 return (bp);
285 }
286
287 /*
288 * Common code for writing a buffer with various options.
289 *
290 * force_wait - wait for write completion regardless of B_ASYNC flag
291 * do_relse - release the buffer when we are done
292 * clear_flags - flags to clear from the buffer
293 */
294 void
bwrite_common(void * arg,struct buf * bp,int force_wait,int do_relse,int clear_flags)295 bwrite_common(void *arg, struct buf *bp, int force_wait,
296 int do_relse, int clear_flags)
297 {
298 register int do_wait;
299 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
300 int flag;
301 klwp_t *lwp = ttolwp(curthread);
302 struct cpu *cpup;
303
304 ASSERT(SEMA_HELD(&bp->b_sem));
305 flag = bp->b_flags;
306 bp->b_flags &= ~clear_flags;
307 if (lwp != NULL)
308 lwp->lwp_ru.oublock++;
309 CPU_STATS_ENTER_K();
310 cpup = CPU; /* get pointer AFTER preemption is disabled */
311 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
312 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
313 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
314 if (do_wait == 0)
315 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
316 CPU_STATS_EXIT_K();
317 if (ufsvfsp == NULL) {
318 (void) bdev_strategy(bp);
319 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
320 /* ufs && logging */
321 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
322 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
323 /* ufs && snapshots */
324 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
325 } else {
326 ub.ub_bwrites.value.ul++; /* ufs && !logging */
327 (void) bdev_strategy(bp);
328 }
329 if (do_wait) {
330 (void) biowait(bp);
331 if (do_relse) {
332 brelse(bp);
333 }
334 }
335 }
336
337 /*
338 * Write the buffer, waiting for completion (unless B_ASYNC is set).
339 * Then release the buffer.
340 * This interface is provided for binary compatibility. Using
341 * BWRITE() directly avoids the extra function call overhead invoked
342 * by calling this routine.
343 */
344 void
bwrite(struct buf * bp)345 bwrite(struct buf *bp)
346 {
347 BWRITE(bp);
348 }
349
350 /*
351 * Write the buffer, waiting for completion.
352 * But don't release the buffer afterwards.
353 * This interface is provided for binary compatibility. Using
354 * BWRITE2() directly avoids the extra function call overhead.
355 */
356 void
bwrite2(struct buf * bp)357 bwrite2(struct buf *bp)
358 {
359 BWRITE2(bp);
360 }
361
362 /*
363 * Release the buffer, marking it so that if it is grabbed
364 * for another purpose it will be written out before being
365 * given up (e.g. when writing a partial block where it is
366 * assumed that another write for the same block will soon follow).
367 * Also save the time that the block is first marked as delayed
368 * so that it will be written in a reasonable time.
369 */
370 void
bdwrite(struct buf * bp)371 bdwrite(struct buf *bp)
372 {
373 ASSERT(SEMA_HELD(&bp->b_sem));
374 CPU_STATS_ADD_K(sys, lwrite, 1);
375 if ((bp->b_flags & B_DELWRI) == 0)
376 bp->b_start = ddi_get_lbolt();
377 /*
378 * B_DONE allows others to use the buffer, B_DELWRI causes the
379 * buffer to be written before being reused, and setting b_resid
380 * to zero says the buffer is complete.
381 */
382 bp->b_flags |= B_DELWRI | B_DONE;
383 bp->b_resid = 0;
384 brelse(bp);
385 }
386
387 /*
388 * Release the buffer, start I/O on it, but don't wait for completion.
389 */
390 void
bawrite(struct buf * bp)391 bawrite(struct buf *bp)
392 {
393 ASSERT(SEMA_HELD(&bp->b_sem));
394
395 /* Use bfreelist.b_bcount as a weird-ass heuristic */
396 if (bfreelist.b_bcount > 4)
397 bp->b_flags |= B_ASYNC;
398 BWRITE(bp);
399 }
400
401 /*
402 * Release the buffer, with no I/O implied.
403 */
404 void
brelse(struct buf * bp)405 brelse(struct buf *bp)
406 {
407 struct buf **backp;
408 uint_t index;
409 kmutex_t *hmp;
410 struct buf *dp;
411 struct hbuf *hp;
412
413
414 ASSERT(SEMA_HELD(&bp->b_sem));
415
416 /*
417 * Clear the retry write flag if the buffer was written without
418 * error. The presence of B_DELWRI means the buffer has not yet
419 * been written and the presence of B_ERROR means that an error
420 * is still occurring.
421 */
422 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
423 bp->b_flags &= ~B_RETRYWRI;
424 }
425
426 /* Check for anomalous conditions */
427 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
428 if (bp->b_flags & B_NOCACHE) {
429 /* Don't add to the freelist. Destroy it now */
430 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
431 sema_destroy(&bp->b_sem);
432 sema_destroy(&bp->b_io);
433 kmem_free(bp, sizeof (struct buf));
434 return;
435 }
436 /*
437 * If a write failed and we are supposed to retry write,
438 * don't toss the buffer. Keep it around and mark it
439 * delayed write in the hopes that it will eventually
440 * get flushed (and still keep the system running.)
441 */
442 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
443 bp->b_flags |= B_DELWRI;
444 /* keep fsflush from trying continuously to flush */
445 bp->b_start = ddi_get_lbolt();
446 } else
447 bp->b_flags |= B_AGE|B_STALE;
448 bp->b_flags &= ~B_ERROR;
449 bp->b_error = 0;
450 }
451
452 /*
453 * If delayed write is set then put in on the delayed
454 * write list instead of the free buffer list.
455 */
456 index = bio_bhash(bp->b_edev, bp->b_blkno);
457 hmp = &hbuf[index].b_lock;
458
459 mutex_enter(hmp);
460 hp = &hbuf[index];
461 dp = (struct buf *)hp;
462
463 /*
464 * Make sure that the number of entries on this list are
465 * Zero <= count <= total # buffers
466 */
467 ASSERT(hp->b_length >= 0);
468 ASSERT(hp->b_length < nbuf);
469
470 hp->b_length++; /* We are adding this buffer */
471
472 if (bp->b_flags & B_DELWRI) {
473 /*
474 * This buffer goes on the delayed write buffer list
475 */
476 dp = (struct buf *)&dwbuf[index];
477 }
478 ASSERT(bp->b_bufsize > 0);
479 ASSERT(bp->b_bcount > 0);
480 ASSERT(bp->b_un.b_addr != NULL);
481
482 if (bp->b_flags & B_AGE) {
483 backp = &dp->av_forw;
484 (*backp)->av_back = bp;
485 bp->av_forw = *backp;
486 *backp = bp;
487 bp->av_back = dp;
488 } else {
489 backp = &dp->av_back;
490 (*backp)->av_forw = bp;
491 bp->av_back = *backp;
492 *backp = bp;
493 bp->av_forw = dp;
494 }
495 mutex_exit(hmp);
496
497 if (bfreelist.b_flags & B_WANTED) {
498 /*
499 * Should come here very very rarely.
500 */
501 mutex_enter(&bfree_lock);
502 if (bfreelist.b_flags & B_WANTED) {
503 bfreelist.b_flags &= ~B_WANTED;
504 cv_broadcast(&bio_mem_cv);
505 }
506 mutex_exit(&bfree_lock);
507 }
508
509 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
510 /*
511 * Don't let anyone get the buffer off the freelist before we
512 * release our hold on it.
513 */
514 sema_v(&bp->b_sem);
515 }
516
517 /*
518 * Return a count of the number of B_BUSY buffers in the system
519 * Can only be used as a good estimate. If 'cleanit' is set,
520 * try to flush all bufs.
521 */
522 int
bio_busy(int cleanit)523 bio_busy(int cleanit)
524 {
525 struct buf *bp, *dp;
526 int busy = 0;
527 int i;
528 kmutex_t *hmp;
529
530 for (i = 0; i < v.v_hbuf; i++) {
531 vfs_syncprogress();
532 dp = (struct buf *)&hbuf[i];
533 hmp = &hbuf[i].b_lock;
534
535 mutex_enter(hmp);
536 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
537 if (bp->b_flags & B_BUSY)
538 busy++;
539 }
540 mutex_exit(hmp);
541 }
542
543 if (cleanit && busy != 0) {
544 bflush(NODEV);
545 }
546
547 return (busy);
548 }
549
550 /*
551 * this interface is provided for binary compatibility.
552 *
553 * Assign a buffer for the given block. If the appropriate
554 * block is already associated, return it; otherwise search
555 * for the oldest non-busy buffer and reassign it.
556 */
557 struct buf *
getblk(dev_t dev,daddr_t blkno,long bsize)558 getblk(dev_t dev, daddr_t blkno, long bsize)
559 {
560 return (getblk_common(/* ufsvfsp */ NULL, dev,
561 blkno, bsize, /* errflg */ 0));
562 }
563
564 /*
565 * Assign a buffer for the given block. If the appropriate
566 * block is already associated, return it; otherwise search
567 * for the oldest non-busy buffer and reassign it.
568 */
569 struct buf *
getblk_common(void * arg,dev_t dev,daddr_t blkno,long bsize,int errflg)570 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
571 {
572 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
573 struct buf *bp;
574 struct buf *dp;
575 struct buf *nbp = NULL;
576 struct buf *errbp;
577 uint_t index;
578 kmutex_t *hmp;
579 struct hbuf *hp;
580
581 if (getmajor(dev) >= devcnt)
582 cmn_err(CE_PANIC, "blkdev");
583
584 biostats.bio_lookup.value.ui32++;
585
586 index = bio_bhash(dev, blkno);
587 hp = &hbuf[index];
588 dp = (struct buf *)hp;
589 hmp = &hp->b_lock;
590
591 mutex_enter(hmp);
592 loop:
593 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
594 if (bp->b_blkno != blkno || bp->b_edev != dev ||
595 (bp->b_flags & B_STALE))
596 continue;
597 /*
598 * Avoid holding the hash lock in the event that
599 * the buffer is locked by someone. Since the hash chain
600 * may change when we drop the hash lock
601 * we have to start at the beginning of the chain if the
602 * buffer identity/contents aren't valid.
603 */
604 if (!sema_tryp(&bp->b_sem)) {
605 biostats.bio_bufbusy.value.ui32++;
606 mutex_exit(hmp);
607 /*
608 * OK, we are dealing with a busy buffer.
609 * In the case that we are panicking and we
610 * got called from bread(), we have some chance
611 * for error recovery. So better bail out from
612 * here since sema_p() won't block. If we got
613 * called directly from ufs routines, there is
614 * no way to report an error yet.
615 */
616 if (panicstr && errflg)
617 goto errout;
618 /*
619 * For the following line of code to work
620 * correctly never kmem_free the buffer "header".
621 */
622 sema_p(&bp->b_sem);
623 if (bp->b_blkno != blkno || bp->b_edev != dev ||
624 (bp->b_flags & B_STALE)) {
625 sema_v(&bp->b_sem);
626 mutex_enter(hmp);
627 goto loop; /* start over */
628 }
629 mutex_enter(hmp);
630 }
631 /* Found */
632 biostats.bio_hit.value.ui32++;
633 bp->b_flags &= ~B_AGE;
634
635 /*
636 * Yank it off the free/delayed write lists
637 */
638 hp->b_length--;
639 notavail(bp);
640 mutex_exit(hmp);
641
642 ASSERT((bp->b_flags & B_NOCACHE) == NULL);
643
644 if (nbp == NULL) {
645 /*
646 * Make the common path short.
647 */
648 ASSERT(SEMA_HELD(&bp->b_sem));
649 return (bp);
650 }
651
652 biostats.bio_bufdup.value.ui32++;
653
654 /*
655 * The buffer must have entered during the lock upgrade
656 * so free the new buffer we allocated and return the
657 * found buffer.
658 */
659 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
660 nbp->b_un.b_addr = NULL;
661
662 /*
663 * Account for the memory
664 */
665 mutex_enter(&bfree_lock);
666 bfreelist.b_bufsize += nbp->b_bufsize;
667 mutex_exit(&bfree_lock);
668
669 /*
670 * Destroy buf identity, and place on avail list
671 */
672 nbp->b_dev = (o_dev_t)NODEV;
673 nbp->b_edev = NODEV;
674 nbp->b_flags = 0;
675 nbp->b_file = NULL;
676 nbp->b_offset = -1;
677
678 sema_v(&nbp->b_sem);
679 bio_bhdr_free(nbp);
680
681 ASSERT(SEMA_HELD(&bp->b_sem));
682 return (bp);
683 }
684
685 /*
686 * bio_getfreeblk may block so check the hash chain again.
687 */
688 if (nbp == NULL) {
689 mutex_exit(hmp);
690 nbp = bio_getfreeblk(bsize);
691 mutex_enter(hmp);
692 goto loop;
693 }
694
695 /*
696 * New buffer. Assign nbp and stick it on the hash.
697 */
698 nbp->b_flags = B_BUSY;
699 nbp->b_edev = dev;
700 nbp->b_dev = (o_dev_t)cmpdev(dev);
701 nbp->b_blkno = blkno;
702 nbp->b_iodone = NULL;
703 nbp->b_bcount = bsize;
704 /*
705 * If we are given a ufsvfsp and the vfs_root field is NULL
706 * then this must be I/O for a superblock. A superblock's
707 * buffer is set up in mountfs() and there is no root vnode
708 * at that point.
709 */
710 if (ufsvfsp && ufsvfsp->vfs_root) {
711 nbp->b_vp = ufsvfsp->vfs_root;
712 } else {
713 nbp->b_vp = NULL;
714 }
715
716 ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
717
718 binshash(nbp, dp);
719 mutex_exit(hmp);
720
721 ASSERT(SEMA_HELD(&nbp->b_sem));
722
723 return (nbp);
724
725
726 /*
727 * Come here in case of an internal error. At this point we couldn't
728 * get a buffer, but he have to return one. Hence we allocate some
729 * kind of error reply buffer on the fly. This buffer is marked as
730 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
731 * - B_ERROR will indicate error to the caller.
732 * - B_DONE will prevent us from reading the buffer from
733 * the device.
734 * - B_NOCACHE will cause that this buffer gets free'd in
735 * brelse().
736 */
737
738 errout:
739 errbp = geteblk();
740 sema_p(&errbp->b_sem);
741 errbp->b_flags &= ~B_BUSY;
742 errbp->b_flags |= (B_ERROR | B_DONE);
743 return (errbp);
744 }
745
746 /*
747 * Get an empty block, not assigned to any particular device.
748 * Returns a locked buffer that is not on any hash or free list.
749 */
750 struct buf *
ngeteblk(long bsize)751 ngeteblk(long bsize)
752 {
753 struct buf *bp;
754
755 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
756 bioinit(bp);
757 bp->av_forw = bp->av_back = NULL;
758 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
759 bp->b_bufsize = bsize;
760 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
761 bp->b_dev = (o_dev_t)NODEV;
762 bp->b_edev = NODEV;
763 bp->b_lblkno = 0;
764 bp->b_bcount = bsize;
765 bp->b_iodone = NULL;
766 return (bp);
767 }
768
769 /*
770 * Interface of geteblk() is kept intact to maintain driver compatibility.
771 * Use ngeteblk() to allocate block size other than 1 KB.
772 */
773 struct buf *
geteblk(void)774 geteblk(void)
775 {
776 return (ngeteblk((long)1024));
777 }
778
779 /*
780 * Return a buffer w/o sleeping
781 */
782 struct buf *
trygetblk(dev_t dev,daddr_t blkno)783 trygetblk(dev_t dev, daddr_t blkno)
784 {
785 struct buf *bp;
786 struct buf *dp;
787 struct hbuf *hp;
788 kmutex_t *hmp;
789 uint_t index;
790
791 index = bio_bhash(dev, blkno);
792 hp = &hbuf[index];
793 hmp = &hp->b_lock;
794
795 if (!mutex_tryenter(hmp))
796 return (NULL);
797
798 dp = (struct buf *)hp;
799 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
800 if (bp->b_blkno != blkno || bp->b_edev != dev ||
801 (bp->b_flags & B_STALE))
802 continue;
803 /*
804 * Get access to a valid buffer without sleeping
805 */
806 if (sema_tryp(&bp->b_sem)) {
807 if (bp->b_flags & B_DONE) {
808 hp->b_length--;
809 notavail(bp);
810 mutex_exit(hmp);
811 return (bp);
812 } else {
813 sema_v(&bp->b_sem);
814 break;
815 }
816 }
817 break;
818 }
819 mutex_exit(hmp);
820 return (NULL);
821 }
822
823 /*
824 * Wait for I/O completion on the buffer; return errors
825 * to the user.
826 */
827 int
iowait(struct buf * bp)828 iowait(struct buf *bp)
829 {
830 ASSERT(SEMA_HELD(&bp->b_sem));
831 return (biowait(bp));
832 }
833
834 /*
835 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
836 * and wake up anyone waiting for it.
837 */
838 void
iodone(struct buf * bp)839 iodone(struct buf *bp)
840 {
841 ASSERT(SEMA_HELD(&bp->b_sem));
842 (void) biodone(bp);
843 }
844
845 /*
846 * Zero the core associated with a buffer.
847 */
848 void
clrbuf(struct buf * bp)849 clrbuf(struct buf *bp)
850 {
851 ASSERT(SEMA_HELD(&bp->b_sem));
852 bzero(bp->b_un.b_addr, bp->b_bcount);
853 bp->b_resid = 0;
854 }
855
856
857 /*
858 * Make sure all write-behind blocks on dev (or NODEV for all)
859 * are flushed out.
860 */
861 void
bflush(dev_t dev)862 bflush(dev_t dev)
863 {
864 struct buf *bp, *dp;
865 struct hbuf *hp;
866 struct buf *delwri_list = EMPTY_LIST;
867 int i, index;
868 kmutex_t *hmp;
869
870 mutex_enter(&blist_lock);
871 /*
872 * Wait for any invalidates or flushes ahead of us to finish.
873 * We really could split blist_lock up per device for better
874 * parallelism here.
875 */
876 while (bio_doinginval || bio_doingflush) {
877 bio_flinv_cv_wanted = 1;
878 cv_wait(&bio_flushinval_cv, &blist_lock);
879 }
880 bio_doingflush++;
881 /*
882 * Gather all B_DELWRI buffer for device.
883 * Lock ordering is b_sem > hash lock (brelse).
884 * Since we are finding the buffer via the delayed write list,
885 * it may be busy and we would block trying to get the
886 * b_sem lock while holding hash lock. So transfer all the
887 * candidates on the delwri_list and then drop the hash locks.
888 */
889 for (i = 0; i < v.v_hbuf; i++) {
890 vfs_syncprogress();
891 hmp = &hbuf[i].b_lock;
892 dp = (struct buf *)&dwbuf[i];
893 mutex_enter(hmp);
894 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
895 if (dev == NODEV || bp->b_edev == dev) {
896 if (bp->b_list == NULL) {
897 bp->b_list = delwri_list;
898 delwri_list = bp;
899 }
900 }
901 }
902 mutex_exit(hmp);
903 }
904 mutex_exit(&blist_lock);
905
906 /*
907 * Now that the hash locks have been dropped grab the semaphores
908 * and write back all the buffers that have B_DELWRI set.
909 */
910 while (delwri_list != EMPTY_LIST) {
911 vfs_syncprogress();
912 bp = delwri_list;
913
914 sema_p(&bp->b_sem); /* may block */
915 if ((dev != bp->b_edev && dev != NODEV) ||
916 (panicstr && bp->b_flags & B_BUSY)) {
917 sema_v(&bp->b_sem);
918 delwri_list = bp->b_list;
919 bp->b_list = NULL;
920 continue; /* No longer a candidate */
921 }
922 if (bp->b_flags & B_DELWRI) {
923 index = bio_bhash(bp->b_edev, bp->b_blkno);
924 hp = &hbuf[index];
925 hmp = &hp->b_lock;
926 dp = (struct buf *)hp;
927
928 bp->b_flags |= B_ASYNC;
929 mutex_enter(hmp);
930 hp->b_length--;
931 notavail(bp);
932 mutex_exit(hmp);
933 if (bp->b_vp == NULL) { /* !ufs */
934 BWRITE(bp);
935 } else { /* ufs */
936 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
937 }
938 } else {
939 sema_v(&bp->b_sem);
940 }
941 delwri_list = bp->b_list;
942 bp->b_list = NULL;
943 }
944 mutex_enter(&blist_lock);
945 bio_doingflush--;
946 if (bio_flinv_cv_wanted) {
947 bio_flinv_cv_wanted = 0;
948 cv_broadcast(&bio_flushinval_cv);
949 }
950 mutex_exit(&blist_lock);
951 }
952
953 /*
954 * Ensure that a specified block is up-to-date on disk.
955 */
956 void
blkflush(dev_t dev,daddr_t blkno)957 blkflush(dev_t dev, daddr_t blkno)
958 {
959 struct buf *bp, *dp;
960 struct hbuf *hp;
961 struct buf *sbp = NULL;
962 uint_t index;
963 kmutex_t *hmp;
964
965 index = bio_bhash(dev, blkno);
966 hp = &hbuf[index];
967 dp = (struct buf *)hp;
968 hmp = &hp->b_lock;
969
970 /*
971 * Identify the buffer in the cache belonging to
972 * this device and blkno (if any).
973 */
974 mutex_enter(hmp);
975 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
976 if (bp->b_blkno != blkno || bp->b_edev != dev ||
977 (bp->b_flags & B_STALE))
978 continue;
979 sbp = bp;
980 break;
981 }
982 mutex_exit(hmp);
983 if (sbp == NULL)
984 return;
985 /*
986 * Now check the buffer we have identified and
987 * make sure it still belongs to the device and is B_DELWRI
988 */
989 sema_p(&sbp->b_sem);
990 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
991 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
992 mutex_enter(hmp);
993 hp->b_length--;
994 notavail(sbp);
995 mutex_exit(hmp);
996 /*
997 * XXX - There is nothing to guarantee a synchronous
998 * write here if the B_ASYNC flag is set. This needs
999 * some investigation.
1000 */
1001 if (sbp->b_vp == NULL) { /* !ufs */
1002 BWRITE(sbp); /* synchronous write */
1003 } else { /* ufs */
1004 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1005 }
1006 } else {
1007 sema_v(&sbp->b_sem);
1008 }
1009 }
1010
1011 /*
1012 * Same as binval, except can force-invalidate delayed-write buffers
1013 * (which are not be already flushed because of device errors). Also
1014 * makes sure that the retry write flag is cleared.
1015 */
1016 int
bfinval(dev_t dev,int force)1017 bfinval(dev_t dev, int force)
1018 {
1019 struct buf *dp;
1020 struct buf *bp;
1021 struct buf *binval_list = EMPTY_LIST;
1022 int i, error = 0;
1023 kmutex_t *hmp;
1024 uint_t index;
1025 struct buf **backp;
1026
1027 mutex_enter(&blist_lock);
1028 /*
1029 * Wait for any flushes ahead of us to finish, it's ok to
1030 * do invalidates in parallel.
1031 */
1032 while (bio_doingflush) {
1033 bio_flinv_cv_wanted = 1;
1034 cv_wait(&bio_flushinval_cv, &blist_lock);
1035 }
1036 bio_doinginval++;
1037
1038 /* Gather bp's */
1039 for (i = 0; i < v.v_hbuf; i++) {
1040 dp = (struct buf *)&hbuf[i];
1041 hmp = &hbuf[i].b_lock;
1042
1043 mutex_enter(hmp);
1044 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1045 if (bp->b_edev == dev) {
1046 if (bp->b_list == NULL) {
1047 bp->b_list = binval_list;
1048 binval_list = bp;
1049 }
1050 }
1051 }
1052 mutex_exit(hmp);
1053 }
1054 mutex_exit(&blist_lock);
1055
1056 /* Invalidate all bp's found */
1057 while (binval_list != EMPTY_LIST) {
1058 bp = binval_list;
1059
1060 sema_p(&bp->b_sem);
1061 if (bp->b_edev == dev) {
1062 if (force && (bp->b_flags & B_DELWRI)) {
1063 /* clear B_DELWRI, move to non-dw freelist */
1064 index = bio_bhash(bp->b_edev, bp->b_blkno);
1065 hmp = &hbuf[index].b_lock;
1066 dp = (struct buf *)&hbuf[index];
1067 mutex_enter(hmp);
1068
1069 /* remove from delayed write freelist */
1070 notavail(bp);
1071
1072 /* add to B_AGE side of non-dw freelist */
1073 backp = &dp->av_forw;
1074 (*backp)->av_back = bp;
1075 bp->av_forw = *backp;
1076 *backp = bp;
1077 bp->av_back = dp;
1078
1079 /*
1080 * make sure write retries and busy are cleared
1081 */
1082 bp->b_flags &=
1083 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1084 mutex_exit(hmp);
1085 }
1086 if ((bp->b_flags & B_DELWRI) == 0)
1087 bp->b_flags |= B_STALE|B_AGE;
1088 else
1089 error = EIO;
1090 }
1091 sema_v(&bp->b_sem);
1092 binval_list = bp->b_list;
1093 bp->b_list = NULL;
1094 }
1095 mutex_enter(&blist_lock);
1096 bio_doinginval--;
1097 if (bio_flinv_cv_wanted) {
1098 cv_broadcast(&bio_flushinval_cv);
1099 bio_flinv_cv_wanted = 0;
1100 }
1101 mutex_exit(&blist_lock);
1102 return (error);
1103 }
1104
1105 /*
1106 * If possible, invalidate blocks for a dev on demand
1107 */
1108 void
binval(dev_t dev)1109 binval(dev_t dev)
1110 {
1111 (void) bfinval(dev, 0);
1112 }
1113
1114 /*
1115 * Initialize the buffer I/O system by freeing
1116 * all buffers and setting all device hash buffer lists to empty.
1117 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 * - 1/4 of kernel virtual memory
	 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	/*
	 * 100 / bufhwm_pct turns the user-specified percentage into a
	 * divisor; a divisor smaller than BIO_MAX_PERCENT means the user
	 * asked for more than the allowed share of physmem.  Note this
	 * also catches bufhwm_pct > 100 (divisor 0) before it could be
	 * used in the physmem / pct division below.
	 */
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	/* bufhwm is the /etc/system tunable; 0 selects the computed default */
	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	/* b_bufsize on the free-list head tracks memory still allocatable */
	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		/* empty doubly-linked circular lists point at themselves */
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}
1202
1203 /*
1204 * Wait for I/O completion on the buffer; return error code.
1205 * If bp was for synchronous I/O, bp is invalid and associated
1206 * resources are freed on return.
1207 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	/* charge the wait time to this CPU's iowait statistic */
	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		/* can't block in panic context; poll for B_DONE instead */
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);	/* posted by biodone() */

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		/* synchronous caller: tear down any kernel mapping now */
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}
1239
/*
 * Fire the TNF "biodone" kernel probe for bp.  Kept as a separate
 * function (rather than inline in biodone()) so the compiler can emit
 * it as a tail call -- see the comment at the call site.
 */
static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device, device, bp->b_edev,
	    tnf_diskaddr, block, bp->b_lblkno,
	    tnf_opaque, buf, bp);
}
1249
1250 /*
1251 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1252 * and wake up anyone waiting for it.
1253 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	/*
	 * A private completion routine takes over entirely; B_DONE is
	 * NOT set and the buffer is not released here in that case.
	 */
	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		/* nobody is waiting; clean up the buffer ourselves */
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		/* wake the thread blocked in biowait() */
		sema_v(&bp->b_io);
	}
}
1284
1285 /*
1286 * Pick up the device's error number and pass it to the user;
1287 * if there is an error but the number is 0 set a generalized code.
1288 */
1289 int
geterror(struct buf * bp)1290 geterror(struct buf *bp)
1291 {
1292 int error = 0;
1293
1294 ASSERT(SEMA_HELD(&bp->b_sem));
1295 if (bp->b_flags & B_ERROR) {
1296 error = bp->b_error;
1297 if (!error)
1298 error = EIO;
1299 }
1300 return (error);
1301 }
1302
1303 /*
1304 * Support for pageio buffers.
1305 *
1306 * This stuff should be generalized to provide a generalized bp
1307 * header facility that can be used for things other than pageio.
1308 */
1309
1310 /*
1311 * Allocate and initialize a buf struct for use with pageio.
1312 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
		if ((flags & B_ASYNC) == 0) {
			/* synchronous read: account a major fault to the lwp */
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque, vnode, pp->p_vnode,
			    tnf_offset, offset, pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque, vnode, pp->p_vnode,
		    tnf_offset, offset, pp->p_offset,
		    tnf_size, size, len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	/* hold the vnode for the life of the buffer; pageio_done releases */
	VN_HOLD(vp);
	bp->b_vp = vp;

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
1381
/*
 * Tear down a buf created by pageio_setup(): unmap, drop the vnode
 * hold, destroy the semaphores and free the header.  The buf is
 * invalid on return.
 */
void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	/* release the hold taken in pageio_setup() */
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}
1397
1398 /*
1399 * Check to see whether the buffers, except the one pointed by sbp,
1400 * associated with the device are busy.
1401 * NOTE: This expensive operation shall be improved together with ufs_icheck().
1402 */
1403 int
bcheck(dev_t dev,struct buf * sbp)1404 bcheck(dev_t dev, struct buf *sbp)
1405 {
1406 struct buf *bp;
1407 struct buf *dp;
1408 int i;
1409 kmutex_t *hmp;
1410
1411 /*
1412 * check for busy bufs for this filesystem
1413 */
1414 for (i = 0; i < v.v_hbuf; i++) {
1415 dp = (struct buf *)&hbuf[i];
1416 hmp = &hbuf[i].b_lock;
1417
1418 mutex_enter(hmp);
1419 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1420 /*
1421 * if buf is busy or dirty, then filesystem is busy
1422 */
1423 if ((bp->b_edev == dev) &&
1424 ((bp->b_flags & B_STALE) == 0) &&
1425 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1426 (bp != sbp)) {
1427 mutex_exit(hmp);
1428 return (1);
1429 }
1430 }
1431 mutex_exit(hmp);
1432 }
1433 return (0);
1434 }
1435
1436 /*
1437 * Hash two 32 bit entities.
1438 */
/*
 * Hash two 32 bit entities.
 *
 * Seeds with x - 1 and then folds in each remaining byte-aligned
 * shift of x and y with the recurrence h = h * 7 + term - 1, exactly
 * as the original unrolled version did.
 */
int
hash2ints(int x, int y)
{
	int hash;
	int shift;

	hash = x - 1;
	for (shift = 8; shift <= 24; shift += 8)
		hash = (hash * 7) + (x >> shift) - 1;
	for (shift = 0; shift <= 24; shift += 8)
		hash = (hash * 7) + (y >> shift) - 1;

	return (hash);
}
1455
1456
1457 /*
1458 * Return a new buffer struct.
1459 * Create a new buffer if we haven't gone over our high water
1460 * mark for memory, otherwise try to get one off the freelist.
1461 *
1462 * Returns a locked buf that has no id and is not on any hash or free
1463 * list.
1464 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t *hmp;
	uint_t start, end;

	/*
	 * mutex_enter(&bfree_lock);
	 * bfreelist.b_bufsize represents the amount of memory
	 * mutex_exit(&bfree_lock); protect ref to bfreelist
	 * we are allowed to allocate in the cache before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

again:	/* NOTE(review): label appears unused within this function */
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				/* skip buffers someone else holds locked */
				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				/* steal a cached buffer of exactly bsize */
				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		/* nothing to steal; block until memory becomes available */
		biostats.bio_bufwait.value.ui32++;
		bp = save;	/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}
1573
1574 /*
1575 * Allocate a buffer header. If none currently available, allocate
1576 * a new pool.
1577 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		/* fast path: pop a header off the global free list */
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
			/* loop and retry the free list */
		} else {
			/* initialize every header in the new pool */
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			/* splice the pool onto the free list, take one */
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}
1636
/*
 * Return a fully-detached, zeroed-out buffer header to the global
 * header free list.  The ASSERTs document the contract: the caller
 * must have removed bp from every list and released its memory.
 */
static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	/* push onto the singly-linked bhdrlist (via av_forw) */
	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}
1654
1655 /*
1656 * If we haven't gone over the high water mark, it's o.k. to
1657 * allocate more buffer space, otherwise recycle buffers
1658 * from the freelist until enough memory is free for a bsize request.
1659 *
1660 * We account for this memory, even though
1661 * we don't allocate it here.
1662 */
1663 static void
bio_mem_get(long bsize)1664 bio_mem_get(long bsize)
1665 {
1666 mutex_enter(&bfree_lock);
1667 if (bfreelist.b_bufsize > bsize) {
1668 bfreelist.b_bufsize -= bsize;
1669 mutex_exit(&bfree_lock);
1670 return;
1671 }
1672 mutex_exit(&bfree_lock);
1673 bio_recycle(BIO_MEM, bsize);
1674 }
1675
1676 /*
1677 * flush a list of delayed write buffers.
1678 * (currently used only by bio_recycle below.)
1679 */
1680 static void
bio_flushlist(struct buf * delwri_list)1681 bio_flushlist(struct buf *delwri_list)
1682 {
1683 struct buf *bp;
1684
1685 while (delwri_list != EMPTY_LIST) {
1686 bp = delwri_list;
1687 bp->b_flags |= B_AGE | B_ASYNC;
1688 if (bp->b_vp == NULL) { /* !ufs */
1689 BWRITE(bp);
1690 } else { /* ufs */
1691 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1692 }
1693 delwri_list = bp->b_list;
1694 bp->b_list = NULL;
1695 }
1696 }
1697
1698 /*
1699 * Start recycling buffers on the freelist for one of 2 reasons:
1700 * - we need a buffer header
1701 * - we need to free up memory
1702 * Once started we continue to recycle buffers until the B_AGE
1703 * buffers are gone.
1704 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int found = 0;		/* set once the request is satisfied */
	kmutex_t *hmp;
	int start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		/* phase 1: reap clean buffers on this bucket's freelist */
		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				/* credit the freed bytes back to the budget */
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				/* freeing one header is enough */
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					/* recheck under the lock */
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * begining.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;	/* bfinval waiters watch this count */
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			/* fetch next before bp may leave the list */
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				/* satisfied: flush what we gathered and go */
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			/* chain onto our private list via b_list */
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		/* issue the writes with no locks held */
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}
1876
1877 /*
1878 * See if the block is associated with some buffer
1879 * (mainly to avoid getting hung up on a wait in breada).
1880 */
1881 static int
bio_incore(dev_t dev,daddr_t blkno)1882 bio_incore(dev_t dev, daddr_t blkno)
1883 {
1884 struct buf *bp;
1885 struct buf *dp;
1886 uint_t index;
1887 kmutex_t *hmp;
1888
1889 index = bio_bhash(dev, blkno);
1890 dp = (struct buf *)&hbuf[index];
1891 hmp = &hbuf[index].b_lock;
1892
1893 mutex_enter(hmp);
1894 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1895 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1896 (bp->b_flags & B_STALE) == 0) {
1897 mutex_exit(hmp);
1898 return (1);
1899 }
1900 }
1901 mutex_exit(hmp);
1902 return (0);
1903 }
1904
/*
 * Async completion path for pageio / remapped buffers (called from
 * biodone()): unmap, finish the paging operation, and free or release
 * the buffer as appropriate.
 */
static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		/* complete the paging operation on the page list */
		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);	/* frees bp */
	} else {
		/* plain remapped buffer: unmap and return to freelist */
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}
1924
1925 /*
1926 * bioerror(9F) - indicate error in buffer header
1927 * If 'error' is zero, remove the error indication.
1928 */
1929 void
bioerror(struct buf * bp,int error)1930 bioerror(struct buf *bp, int error)
1931 {
1932 ASSERT(bp != NULL);
1933 ASSERT(error >= 0);
1934 ASSERT(SEMA_HELD(&bp->b_sem));
1935
1936 if (error != 0) {
1937 bp->b_flags |= B_ERROR;
1938 } else {
1939 bp->b_flags &= ~B_ERROR;
1940 }
1941 bp->b_error = error;
1942 }
1943
1944 /*
1945 * bioreset(9F) - reuse a private buffer header after I/O is complete
1946 */
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	/* tear down and rebuild: leaves bp as a freshly-bioinit'ed header */
	biofini(bp);
	bioinit(bp);
}
1955
1956 /*
1957 * biosize(9F) - return size of a buffer header
1958 */
/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	/* lets drivers allocate buf headers without knowing the layout */
	return (sizeof (struct buf));
}
1964
1965 /*
1966 * biomodified(9F) - check if buffer is modified
1967 */
1968 int
biomodified(struct buf * bp)1969 biomodified(struct buf *bp)
1970 {
1971 int npf;
1972 int ppattr;
1973 struct page *pp;
1974
1975 ASSERT(bp != NULL);
1976
1977 if ((bp->b_flags & B_PAGEIO) == 0) {
1978 return (-1);
1979 }
1980 pp = bp->b_pages;
1981 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1982
1983 while (npf > 0) {
1984 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1985 HAT_SYNC_STOPON_MOD);
1986 if (ppattr & P_MOD)
1987 return (1);
1988 pp = pp->p_next;
1989 npf--;
1990 }
1991
1992 return (0);
1993 }
1994
1995 /*
1996 * bioinit(9F) - initialize a buffer structure
1997 */
/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	/* b_sem starts at 0, i.e. in the "owned by caller" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;	/* no byte offset assigned yet */
}
2006
2007 /*
2008 * biofini(9F) - uninitialize a buffer structure
2009 */
/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	/* destroy in reverse order of bioinit() */
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
2016
2017 /*
2018 * bioclone(9F) - clone a buffer
2019 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		/* no header supplied: allocate one (may fail if KM_NOSLEEP) */
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		/* caller-supplied header: reset it for reuse */
		bufp = bp_mem;
		bioreset(bufp);
	}

/* the subset of b_flags the clone inherits from the original */
#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		/* advance the shadow page list by off's page count */
		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			/* walk the page chain to the page containing off */
			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			/* b_addr carries only the within-page offset */
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}
2084