1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27 /*
28 * Copyright (c) 2016 by Delphix. All rights reserved.
29 */
30
31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 /* All Rights Reserved */
33
34 /*
35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 * The Regents of the University of California
37 * All Rights Reserved
38 *
39 * University Acknowledgment- Portions of this document are derived from
40 * software developed by the University of California, Berkeley, and its
41 * contributors.
42 */
43
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
47 #include <sys/conf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/buf.h>
52 #include <sys/var.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kmem.h>
57 #include <sys/vmem.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <sys/vtrace.h>
63 #include <sys/fs/ufs_inode.h>
64 #include <sys/fs/ufs_bio.h>
65 #include <sys/fs/ufs_log.h>
66 #include <sys/systm.h>
67 #include <sys/vfs.h>
68 #include <sys/sdt.h>
69
70 /* Locks */
71 static kmutex_t blist_lock; /* protects b_list */
72 static kmutex_t bhdr_lock; /* protects the bhdrlist */
73 static kmutex_t bfree_lock; /* protects the bfreelist structure */
74
75 struct hbuf *hbuf; /* Hash buckets */
76 struct dwbuf *dwbuf; /* Delayed write buckets */
77 static struct buf *bhdrlist; /* buf header free list */
78 static int nbuf; /* number of buffer headers allocated */
79
80 static int lastindex; /* Reference point on where to start */
81 /* when looking for free buffers */
82
83 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
84 #define EMPTY_LIST ((struct buf *)-1)
85
86 static kcondvar_t bio_mem_cv; /* Condition variables */
87 static kcondvar_t bio_flushinval_cv;
88 static int bio_doingflush; /* flush in progress */
89 static int bio_doinginval; /* inval in progress */
90 static int bio_flinv_cv_wanted; /* someone waiting for cv */
91
92 /*
93 * Statistics on the buffer cache
94 */
95 struct biostats biostats = {
96 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
97 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
98 { "new_buffer_requests", KSTAT_DATA_UINT32 },
99 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
100 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
101 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
102 };
103
104 /*
105 * kstat data
106 */
107 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
108 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
109 sizeof (kstat_named_t));
110
111 /*
112 * Statistics on ufs buffer cache
113 * Not protected by locks
114 */
115 struct ufsbiostats ub = {
116 { "breads", KSTAT_DATA_UINT32 },
117 { "bwrites", KSTAT_DATA_UINT32 },
118 { "fbiwrites", KSTAT_DATA_UINT32 },
119 { "getpages", KSTAT_DATA_UINT32 },
120 { "getras", KSTAT_DATA_UINT32 },
121 { "putsyncs", KSTAT_DATA_UINT32 },
122 { "putasyncs", KSTAT_DATA_UINT32 },
123 { "putpageios", KSTAT_DATA_UINT32 },
124 };
125
126 /*
127 * more UFS Logging eccentricities...
128 *
 129  * These are required since "#pragma weak ..." doesn't work in reverse order;
 130  * i.e., genunix (bio.c) is loaded before the ufs modules, so pointers
 131  * to ufs routines don't get plugged into bio.c calls automatically.
 132  * Instead, they are initialized when the "lufsops" table is set up
 133  * in "lufs.c:_init()".
134 */
135 void (*bio_lufs_strategy)(void *, buf_t *);
136 void (*bio_snapshot_strategy)(void *, buf_t *);
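
/*
 * Illustrative sketch (not compiled): roughly how a module such as lufs
 * would plug its strategy routine into the pointer above at _init() time.
 * The name lufs_log_strategy() is an assumption for this example only; the
 * real assignment lives in lufs.c, not here.
 */
#if 0
extern void lufs_log_strategy(void *, buf_t *);	/* hypothetical name */

int
example_lufs_init(void)
{
	/* Let genunix's bio.c route logged I/O through the log code. */
	bio_lufs_strategy = lufs_log_strategy;
	return (0);
}
#endif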
137
138
139 /* Private routines */
140 static struct buf *bio_getfreeblk(long);
141 static void bio_mem_get(long);
142 static void bio_bhdr_free(struct buf *);
143 static struct buf *bio_bhdr_alloc(void);
144 static void bio_recycle(int, long);
145 static void bio_pageio_done(struct buf *);
146 static int bio_incore(dev_t, daddr_t);
147
148 /*
149 * Buffer cache constants
150 */
151 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
152 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
153 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
154 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
155 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
156 #define BIO_HASHLEN 4 /* Target length of hash chains */
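
/*
 * A worked sketch of the constants above, assuming MAXBSIZE == 8192 and
 * PAGESIZE == 4096 (both platform-dependent assumptions):
 *
 *	BIO_MIN_HWM              = 10 * 8192 / 1024 = 80   (KB; smallest legal bufhwm)
 *	physmem / BIO_BUF_PERCENT = physmem / 50            (2%  of memory, default)
 *	physmem / BIO_MAX_PERCENT = physmem / 5             (20% of memory, upper cap)
 *
 * binit() converts these page counts to kilobytes (times PAGESIZE / 1024)
 * before storing the result in v.v_bufhwm.
 */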
157
158
159 /* Flags for bio_recycle() */
160 #define BIO_HEADER 0x01
161 #define BIO_MEM 0x02
162
163 extern int bufhwm; /* User tunable - high water mark for mem */
164 extern int bufhwm_pct; /* ditto - given in % of physmem */
165
166 /*
167 * The following routines allocate and free
168 * buffers with various side effects. In general the
169 * arguments to an allocate routine are a device and
 170  * a block number, and the value is a pointer
171 * to the buffer header; the buffer returned is locked with a
172 * binary semaphore so that no one else can touch it. If the block was
173 * already in core, no I/O need be done; if it is
174 * already locked, the process waits until it becomes free.
175 * The following routines allocate a buffer:
176 * getblk
177 * bread/BREAD
178 * breada
179 * Eventually the buffer must be released, possibly with the
180 * side effect of writing it out, by using one of
181 * bwrite/BWRITE/brwrite
182 * bdwrite/bdrwrite
183 * bawrite
184 * brelse
185 *
186 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
187 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
188 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
189 * B_DONE is still used to denote a buffer with I/O complete on it.
190 *
 191  * The bfreelist.b_bcount field is computed every time fsflush runs. It
192 * should not be used where a very accurate count of the free buffers is
193 * needed.
194 */
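
/*
 * A minimal usage sketch of the interfaces described above (not compiled,
 * and not part of this file's logic): read a block, check for errors,
 * modify the cached copy, mark it as a delayed write, and release it.
 * 'dev', 'blkno' and 'bsize' are assumed to come from the caller.
 */
#if 0
static int
example_read_modify(dev_t dev, daddr_t blkno, long bsize)
{
	struct buf *bp;
	int err;

	bp = bread(dev, blkno, bsize);		/* returned locked via b_sem */
	if ((err = geterror(bp)) != 0) {
		brelse(bp);			/* just give it back */
		return (err);
	}

	bp->b_un.b_addr[0] = 0;			/* modify the cached copy */
	bdwrite(bp);				/* delayed write + release */
	return (0);
}
#endif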
195
196 /*
197 * Read in (if necessary) the block and return a buffer pointer.
198 *
199 * This interface is provided for binary compatibility. Using
200 * BREAD() directly avoids the extra function call overhead invoked
201 * by calling this routine.
202 */
203 struct buf *
 204 bread(dev_t dev, daddr_t blkno, long bsize)
205 {
206 return (BREAD(dev, blkno, bsize));
207 }
208
209 /*
210 * Common code for reading a buffer with various options
211 *
212 * Read in (if necessary) the block and return a buffer pointer.
213 */
214 struct buf *
 215 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
216 {
217 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
218 struct buf *bp;
219 klwp_t *lwp = ttolwp(curthread);
220
221 CPU_STATS_ADD_K(sys, lread, 1);
222 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
223 if (bp->b_flags & B_DONE)
224 return (bp);
225 bp->b_flags |= B_READ;
226 ASSERT(bp->b_bcount == bsize);
227 if (ufsvfsp == NULL) { /* !ufs */
228 (void) bdev_strategy(bp);
229 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
230 /* ufs && logging */
231 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
232 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
233 /* ufs && snapshots */
234 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
235 } else {
236 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
237 ub.ub_breads.value.ul++; /* ufs && !logging */
238 (void) bdev_strategy(bp);
239 }
240 if (lwp != NULL)
241 lwp->lwp_ru.inblock++;
242 CPU_STATS_ADD_K(sys, bread, 1);
243 (void) biowait(bp);
244 return (bp);
245 }
246
247 /*
248 * Read in the block, like bread, but also start I/O on the
249 * read-ahead block (which is not allocated to the caller).
250 */
251 struct buf *
 252 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
253 {
254 struct buf *bp, *rabp;
255 klwp_t *lwp = ttolwp(curthread);
256
257 bp = NULL;
258 if (!bio_incore(dev, blkno)) {
259 CPU_STATS_ADD_K(sys, lread, 1);
260 bp = GETBLK(dev, blkno, bsize);
261 if ((bp->b_flags & B_DONE) == 0) {
262 bp->b_flags |= B_READ;
263 bp->b_bcount = bsize;
264 (void) bdev_strategy(bp);
265 if (lwp != NULL)
266 lwp->lwp_ru.inblock++;
267 CPU_STATS_ADD_K(sys, bread, 1);
268 }
269 }
270 if (rablkno && bfreelist.b_bcount > 1 &&
271 !bio_incore(dev, rablkno)) {
272 rabp = GETBLK(dev, rablkno, bsize);
273 if (rabp->b_flags & B_DONE)
274 brelse(rabp);
275 else {
276 rabp->b_flags |= B_READ|B_ASYNC;
277 rabp->b_bcount = bsize;
278 (void) bdev_strategy(rabp);
279 if (lwp != NULL)
280 lwp->lwp_ru.inblock++;
281 CPU_STATS_ADD_K(sys, bread, 1);
282 }
283 }
284 if (bp == NULL)
285 return (BREAD(dev, blkno, bsize));
286 (void) biowait(bp);
287 return (bp);
288 }
289
290 /*
291 * Common code for writing a buffer with various options.
292 *
293 * force_wait - wait for write completion regardless of B_ASYNC flag
294 * do_relse - release the buffer when we are done
295 * clear_flags - flags to clear from the buffer
296 */
297 void
 298 bwrite_common(void *arg, struct buf *bp, int force_wait,
299 int do_relse, int clear_flags)
300 {
301 register int do_wait;
302 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
303 int flag;
304 klwp_t *lwp = ttolwp(curthread);
305 struct cpu *cpup;
306
307 ASSERT(SEMA_HELD(&bp->b_sem));
308 flag = bp->b_flags;
309 bp->b_flags &= ~clear_flags;
310 if (lwp != NULL)
311 lwp->lwp_ru.oublock++;
312 CPU_STATS_ENTER_K();
313 cpup = CPU; /* get pointer AFTER preemption is disabled */
314 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
315 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
316 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
317 if (do_wait == 0)
318 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
319 CPU_STATS_EXIT_K();
320 if (ufsvfsp == NULL) {
321 (void) bdev_strategy(bp);
322 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
323 /* ufs && logging */
324 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
325 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
326 /* ufs && snapshots */
327 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
328 } else {
329 ub.ub_bwrites.value.ul++; /* ufs && !logging */
330 (void) bdev_strategy(bp);
331 }
332 if (do_wait) {
333 (void) biowait(bp);
334 if (do_relse) {
335 brelse(bp);
336 }
337 }
338 }
339
340 /*
341 * Write the buffer, waiting for completion (unless B_ASYNC is set).
342 * Then release the buffer.
343 * This interface is provided for binary compatibility. Using
344 * BWRITE() directly avoids the extra function call overhead invoked
345 * by calling this routine.
346 */
347 void
 348 bwrite(struct buf *bp)
349 {
350 BWRITE(bp);
351 }
352
353 /*
354 * Write the buffer, waiting for completion.
355 * But don't release the buffer afterwards.
356 * This interface is provided for binary compatibility. Using
357 * BWRITE2() directly avoids the extra function call overhead.
358 */
359 void
 360 bwrite2(struct buf *bp)
361 {
362 BWRITE2(bp);
363 }
364
365 /*
366 * Release the buffer, marking it so that if it is grabbed
367 * for another purpose it will be written out before being
368 * given up (e.g. when writing a partial block where it is
369 * assumed that another write for the same block will soon follow).
370 * Also save the time that the block is first marked as delayed
371 * so that it will be written in a reasonable time.
372 */
373 void
 374 bdwrite(struct buf *bp)
375 {
376 ASSERT(SEMA_HELD(&bp->b_sem));
377 CPU_STATS_ADD_K(sys, lwrite, 1);
378 if ((bp->b_flags & B_DELWRI) == 0)
379 bp->b_start = ddi_get_lbolt();
380 /*
381 * B_DONE allows others to use the buffer, B_DELWRI causes the
382 * buffer to be written before being reused, and setting b_resid
383 * to zero says the buffer is complete.
384 */
385 bp->b_flags |= B_DELWRI | B_DONE;
386 bp->b_resid = 0;
387 brelse(bp);
388 }
389
390 /*
391 * Release the buffer, start I/O on it, but don't wait for completion.
392 */
393 void
 394 bawrite(struct buf *bp)
395 {
396 ASSERT(SEMA_HELD(&bp->b_sem));
397
398 /* Use bfreelist.b_bcount as a weird-ass heuristic */
399 if (bfreelist.b_bcount > 4)
400 bp->b_flags |= B_ASYNC;
401 BWRITE(bp);
402 }
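
/*
 * Sketch (not compiled) contrasting the release-with-write flavors above,
 * assuming 'bp' was obtained from getblk()/bread() and is still held:
 */
#if 0
static void
example_write_flavors(struct buf *bp, int how)
{
	switch (how) {
	case 0:
		bwrite(bp);	/* start I/O, wait unless B_ASYNC, release */
		break;
	case 1:
		bdwrite(bp);	/* mark B_DELWRI and release; written later */
		break;
	default:
		bawrite(bp);	/* set B_ASYNC (if buffers are plentiful) */
		break;
	}
}
#endif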
403
404 /*
405 * Release the buffer, with no I/O implied.
406 */
407 void
 408 brelse(struct buf *bp)
409 {
410 struct buf **backp;
411 uint_t index;
412 kmutex_t *hmp;
413 struct buf *dp;
414 struct hbuf *hp;
415
416
417 ASSERT(SEMA_HELD(&bp->b_sem));
418
419 /*
420 * Clear the retry write flag if the buffer was written without
421 * error. The presence of B_DELWRI means the buffer has not yet
422 * been written and the presence of B_ERROR means that an error
423 * is still occurring.
424 */
425 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
426 bp->b_flags &= ~B_RETRYWRI;
427 }
428
429 /* Check for anomalous conditions */
430 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
431 if (bp->b_flags & B_NOCACHE) {
432 /* Don't add to the freelist. Destroy it now */
433 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
434 sema_destroy(&bp->b_sem);
435 sema_destroy(&bp->b_io);
436 kmem_free(bp, sizeof (struct buf));
437 return;
438 }
439 /*
440 * If a write failed and we are supposed to retry write,
441 * don't toss the buffer. Keep it around and mark it
442 * delayed write in the hopes that it will eventually
443 * get flushed (and still keep the system running.)
444 */
445 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
446 bp->b_flags |= B_DELWRI;
447 /* keep fsflush from trying continuously to flush */
448 bp->b_start = ddi_get_lbolt();
449 } else
450 bp->b_flags |= B_AGE|B_STALE;
451 bp->b_flags &= ~B_ERROR;
452 bp->b_error = 0;
453 }
454
455 /*
 456  * If delayed write is set then put it on the delayed
457 * write list instead of the free buffer list.
458 */
459 index = bio_bhash(bp->b_edev, bp->b_blkno);
460 hmp = &hbuf[index].b_lock;
461
462 mutex_enter(hmp);
463 hp = &hbuf[index];
464 dp = (struct buf *)hp;
465
466 /*
 467  * Make sure that the number of entries on this list is within
 468  * 0 <= count <= total # buffers
469 */
470 ASSERT(hp->b_length >= 0);
471 ASSERT(hp->b_length < nbuf);
472
473 hp->b_length++; /* We are adding this buffer */
474
475 if (bp->b_flags & B_DELWRI) {
476 /*
477 * This buffer goes on the delayed write buffer list
478 */
479 dp = (struct buf *)&dwbuf[index];
480 }
481 ASSERT(bp->b_bufsize > 0);
482 ASSERT(bp->b_bcount > 0);
483 ASSERT(bp->b_un.b_addr != NULL);
484
485 if (bp->b_flags & B_AGE) {
486 backp = &dp->av_forw;
487 (*backp)->av_back = bp;
488 bp->av_forw = *backp;
489 *backp = bp;
490 bp->av_back = dp;
491 } else {
492 backp = &dp->av_back;
493 (*backp)->av_forw = bp;
494 bp->av_back = *backp;
495 *backp = bp;
496 bp->av_forw = dp;
497 }
498 mutex_exit(hmp);
499
500 if (bfreelist.b_flags & B_WANTED) {
501 /*
502 * Should come here very very rarely.
503 */
504 mutex_enter(&bfree_lock);
505 if (bfreelist.b_flags & B_WANTED) {
506 bfreelist.b_flags &= ~B_WANTED;
507 cv_broadcast(&bio_mem_cv);
508 }
509 mutex_exit(&bfree_lock);
510 }
511
512 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
513 /*
514 * Don't let anyone get the buffer off the freelist before we
515 * release our hold on it.
516 */
517 sema_v(&bp->b_sem);
518 }
519
520 /*
 521  * Return a count of the number of B_BUSY buffers in the system.
 522  * The count is only an estimate. If 'cleanit' is set,
523 * try to flush all bufs.
524 */
525 int
 526 bio_busy(int cleanit)
527 {
528 struct buf *bp, *dp;
529 int busy = 0;
530 int i;
531 kmutex_t *hmp;
532
533 for (i = 0; i < v.v_hbuf; i++) {
534 dp = (struct buf *)&hbuf[i];
535 hmp = &hbuf[i].b_lock;
536
537 mutex_enter(hmp);
538 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
539 if (bp->b_flags & B_BUSY)
540 busy++;
541 }
542 mutex_exit(hmp);
543 }
544
545 if (cleanit && busy != 0) {
546 bflush(NODEV);
547 }
548
549 return (busy);
550 }
551
552 /*
 553  * This interface is provided for binary compatibility.
554 *
555 * Assign a buffer for the given block. If the appropriate
556 * block is already associated, return it; otherwise search
557 * for the oldest non-busy buffer and reassign it.
558 */
559 struct buf *
 560 getblk(dev_t dev, daddr_t blkno, long bsize)
561 {
562 return (getblk_common(/* ufsvfsp */ NULL, dev,
563 blkno, bsize, /* errflg */ 0));
564 }
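
/*
 * Sketch (not compiled): the classic reason to use getblk() rather than
 * bread() -- the caller intends to overwrite the whole block, so no read
 * from the device is needed.
 */
#if 0
static void
example_overwrite_block(dev_t dev, daddr_t blkno, long bsize, caddr_t data)
{
	struct buf *bp;

	bp = getblk(dev, blkno, bsize);		/* no read I/O is issued */
	bcopy(data, bp->b_un.b_addr, bsize);	/* fill in the new contents */
	bwrite(bp);				/* write it out and release */
}
#endif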
565
566 /*
567 * Assign a buffer for the given block. If the appropriate
568 * block is already associated, return it; otherwise search
569 * for the oldest non-busy buffer and reassign it.
570 */
571 struct buf *
 572 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
573 {
574 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
575 struct buf *bp;
576 struct buf *dp;
577 struct buf *nbp = NULL;
578 struct buf *errbp;
579 uint_t index;
580 kmutex_t *hmp;
581 struct hbuf *hp;
582
583 if (getmajor(dev) >= devcnt)
584 cmn_err(CE_PANIC, "blkdev");
585
586 biostats.bio_lookup.value.ui32++;
587
588 index = bio_bhash(dev, blkno);
589 hp = &hbuf[index];
590 dp = (struct buf *)hp;
591 hmp = &hp->b_lock;
592
593 mutex_enter(hmp);
594 loop:
595 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
596 if (bp->b_blkno != blkno || bp->b_edev != dev ||
597 (bp->b_flags & B_STALE))
598 continue;
599 /*
600 * Avoid holding the hash lock in the event that
601 * the buffer is locked by someone. Since the hash chain
602 * may change when we drop the hash lock
603 * we have to start at the beginning of the chain if the
604 * buffer identity/contents aren't valid.
605 */
606 if (!sema_tryp(&bp->b_sem)) {
607 biostats.bio_bufbusy.value.ui32++;
608 mutex_exit(hmp);
609 /*
610 * OK, we are dealing with a busy buffer.
611 * In the case that we are panicking and we
612 * got called from bread(), we have some chance
613 * for error recovery. So better bail out from
614 * here since sema_p() won't block. If we got
615 * called directly from ufs routines, there is
616 * no way to report an error yet.
617 */
618 if (panicstr && errflg)
619 goto errout;
620 /*
621 * For the following line of code to work
622 * correctly never kmem_free the buffer "header".
623 */
624 sema_p(&bp->b_sem);
625 if (bp->b_blkno != blkno || bp->b_edev != dev ||
626 (bp->b_flags & B_STALE)) {
627 sema_v(&bp->b_sem);
628 mutex_enter(hmp);
629 goto loop; /* start over */
630 }
631 mutex_enter(hmp);
632 }
633 /* Found */
634 biostats.bio_hit.value.ui32++;
635 bp->b_flags &= ~B_AGE;
636
637 /*
638 * Yank it off the free/delayed write lists
639 */
640 hp->b_length--;
641 notavail(bp);
642 mutex_exit(hmp);
643
644 ASSERT((bp->b_flags & B_NOCACHE) == 0);
645
646 if (nbp == NULL) {
647 /*
648 * Make the common path short.
649 */
650 ASSERT(SEMA_HELD(&bp->b_sem));
651 return (bp);
652 }
653
654 biostats.bio_bufdup.value.ui32++;
655
656 /*
657 * The buffer must have entered during the lock upgrade
658 * so free the new buffer we allocated and return the
659 * found buffer.
660 */
661 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
662 nbp->b_un.b_addr = NULL;
663
664 /*
665 * Account for the memory
666 */
667 mutex_enter(&bfree_lock);
668 bfreelist.b_bufsize += nbp->b_bufsize;
669 mutex_exit(&bfree_lock);
670
671 /*
672 * Destroy buf identity, and place on avail list
673 */
674 nbp->b_dev = (o_dev_t)NODEV;
675 nbp->b_edev = NODEV;
676 nbp->b_flags = 0;
677 nbp->b_file = NULL;
678 nbp->b_offset = -1;
679
680 sema_v(&nbp->b_sem);
681 bio_bhdr_free(nbp);
682
683 ASSERT(SEMA_HELD(&bp->b_sem));
684 return (bp);
685 }
686
687 /*
688 * bio_getfreeblk may block so check the hash chain again.
689 */
690 if (nbp == NULL) {
691 mutex_exit(hmp);
692 nbp = bio_getfreeblk(bsize);
693 mutex_enter(hmp);
694 goto loop;
695 }
696
697 /*
698 * New buffer. Assign nbp and stick it on the hash.
699 */
700 nbp->b_flags = B_BUSY;
701 nbp->b_edev = dev;
702 nbp->b_dev = (o_dev_t)cmpdev(dev);
703 nbp->b_blkno = blkno;
704 nbp->b_iodone = NULL;
705 nbp->b_bcount = bsize;
706 /*
707 * If we are given a ufsvfsp and the vfs_root field is NULL
708 * then this must be I/O for a superblock. A superblock's
709 * buffer is set up in mountfs() and there is no root vnode
710 * at that point.
711 */
712 if (ufsvfsp && ufsvfsp->vfs_root) {
713 nbp->b_vp = ufsvfsp->vfs_root;
714 } else {
715 nbp->b_vp = NULL;
716 }
717
718 ASSERT((nbp->b_flags & B_NOCACHE) == 0);
719
720 binshash(nbp, dp);
721 mutex_exit(hmp);
722
723 ASSERT(SEMA_HELD(&nbp->b_sem));
724
725 return (nbp);
726
727
728 /*
729 * Come here in case of an internal error. At this point we couldn't
730 * get a buffer, but we have to return one. Hence we allocate some
731 * kind of error reply buffer on the fly. This buffer is marked as
732 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
733 * - B_ERROR will indicate error to the caller.
734 * - B_DONE will prevent us from reading the buffer from
735 * the device.
 736  * - B_NOCACHE will cause this buffer to be freed in
737 * brelse().
738 */
739
740 errout:
741 errbp = geteblk();
742 sema_p(&errbp->b_sem);
743 errbp->b_flags &= ~B_BUSY;
744 errbp->b_flags |= (B_ERROR | B_DONE);
745 return (errbp);
746 }
747
748 /*
749 * Get an empty block, not assigned to any particular device.
750 * Returns a locked buffer that is not on any hash or free list.
751 */
752 struct buf *
 753 ngeteblk(long bsize)
754 {
755 struct buf *bp;
756
757 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
758 bioinit(bp);
759 bp->av_forw = bp->av_back = NULL;
760 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
761 bp->b_bufsize = bsize;
762 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
763 bp->b_dev = (o_dev_t)NODEV;
764 bp->b_edev = NODEV;
765 bp->b_lblkno = 0;
766 bp->b_bcount = bsize;
767 bp->b_iodone = NULL;
768 return (bp);
769 }
770
771 /*
772 * Interface of geteblk() is kept intact to maintain driver compatibility.
 773  * Use ngeteblk() to allocate a block size other than 1 KB.
774 */
775 struct buf *
 776 geteblk(void)
777 {
778 return (ngeteblk((long)1024));
779 }
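
/*
 * Sketch (not compiled) of using ngeteblk() for a scratch buffer that is
 * not tied to the cache: the B_NOCACHE flag makes brelse() destroy it.
 * The direct bdev_strategy()/biowait() pairing is an assumption for the
 * example; a real caller may go through a driver-specific path instead.
 */
#if 0
static int
example_scratch_read(dev_t dev, daddr_t blkno, long bsize)
{
	struct buf *bp;
	int err;

	bp = ngeteblk(bsize);
	bp->b_edev = dev;
	bp->b_dev = (o_dev_t)cmpdev(dev);
	bp->b_blkno = blkno;
	bp->b_flags |= B_READ;

	(void) bdev_strategy(bp);
	err = biowait(bp);
	brelse(bp);			/* B_NOCACHE: buffer is freed here */
	return (err);
}
#endif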
780
781 /*
782 * Return a buffer w/o sleeping
783 */
784 struct buf *
 785 trygetblk(dev_t dev, daddr_t blkno)
786 {
787 struct buf *bp;
788 struct buf *dp;
789 struct hbuf *hp;
790 kmutex_t *hmp;
791 uint_t index;
792
793 index = bio_bhash(dev, blkno);
794 hp = &hbuf[index];
795 hmp = &hp->b_lock;
796
797 if (!mutex_tryenter(hmp))
798 return (NULL);
799
800 dp = (struct buf *)hp;
801 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
802 if (bp->b_blkno != blkno || bp->b_edev != dev ||
803 (bp->b_flags & B_STALE))
804 continue;
805 /*
806 * Get access to a valid buffer without sleeping
807 */
808 if (sema_tryp(&bp->b_sem)) {
809 if (bp->b_flags & B_DONE) {
810 hp->b_length--;
811 notavail(bp);
812 mutex_exit(hmp);
813 return (bp);
814 } else {
815 sema_v(&bp->b_sem);
816 break;
817 }
818 }
819 break;
820 }
821 mutex_exit(hmp);
822 return (NULL);
823 }
824
825 /*
826 * Wait for I/O completion on the buffer; return errors
827 * to the user.
828 */
829 int
 830 iowait(struct buf *bp)
831 {
832 ASSERT(SEMA_HELD(&bp->b_sem));
833 return (biowait(bp));
834 }
835
836 /*
837 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
838 * and wake up anyone waiting for it.
839 */
840 void
 841 iodone(struct buf *bp)
842 {
843 ASSERT(SEMA_HELD(&bp->b_sem));
844 (void) biodone(bp);
845 }
846
847 /*
848 * Zero the core associated with a buffer.
849 */
850 void
 851 clrbuf(struct buf *bp)
852 {
853 ASSERT(SEMA_HELD(&bp->b_sem));
854 bzero(bp->b_un.b_addr, bp->b_bcount);
855 bp->b_resid = 0;
856 }
857
858
859 /*
860 * Make sure all write-behind blocks on dev (or NODEV for all)
861 * are flushed out.
862 */
863 void
 864 bflush(dev_t dev)
865 {
866 struct buf *bp, *dp;
867 struct hbuf *hp;
868 struct buf *delwri_list = EMPTY_LIST;
869 int i, index;
870 kmutex_t *hmp;
871
872 mutex_enter(&blist_lock);
873 /*
874 * Wait for any invalidates or flushes ahead of us to finish.
875 * We really could split blist_lock up per device for better
876 * parallelism here.
877 */
878 while (bio_doinginval || bio_doingflush) {
879 bio_flinv_cv_wanted = 1;
880 cv_wait(&bio_flushinval_cv, &blist_lock);
881 }
882 bio_doingflush++;
883 /*
 884  * Gather all B_DELWRI buffers for the device.
885 * Lock ordering is b_sem > hash lock (brelse).
886 * Since we are finding the buffer via the delayed write list,
887 * it may be busy and we would block trying to get the
 888  * b_sem lock while holding the hash lock. So transfer all the
 889  * candidates onto the delwri_list and then drop the hash locks.
890 */
891 for (i = 0; i < v.v_hbuf; i++) {
892 hmp = &hbuf[i].b_lock;
893 dp = (struct buf *)&dwbuf[i];
894 mutex_enter(hmp);
895 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
896 if (dev == NODEV || bp->b_edev == dev) {
897 if (bp->b_list == NULL) {
898 bp->b_list = delwri_list;
899 delwri_list = bp;
900 }
901 }
902 }
903 mutex_exit(hmp);
904 }
905 mutex_exit(&blist_lock);
906
907 /*
908 * Now that the hash locks have been dropped grab the semaphores
909 * and write back all the buffers that have B_DELWRI set.
910 */
911 while (delwri_list != EMPTY_LIST) {
912 bp = delwri_list;
913
914 sema_p(&bp->b_sem); /* may block */
915 if ((dev != bp->b_edev && dev != NODEV) ||
916 (panicstr && bp->b_flags & B_BUSY)) {
917 sema_v(&bp->b_sem);
918 delwri_list = bp->b_list;
919 bp->b_list = NULL;
920 continue; /* No longer a candidate */
921 }
922 if (bp->b_flags & B_DELWRI) {
923 index = bio_bhash(bp->b_edev, bp->b_blkno);
924 hp = &hbuf[index];
925 hmp = &hp->b_lock;
926 dp = (struct buf *)hp;
927
928 bp->b_flags |= B_ASYNC;
929 mutex_enter(hmp);
930 hp->b_length--;
931 notavail(bp);
932 mutex_exit(hmp);
933 if (bp->b_vp == NULL) { /* !ufs */
934 BWRITE(bp);
935 } else { /* ufs */
936 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
937 }
938 } else {
939 sema_v(&bp->b_sem);
940 }
941 delwri_list = bp->b_list;
942 bp->b_list = NULL;
943 }
944 mutex_enter(&blist_lock);
945 bio_doingflush--;
946 if (bio_flinv_cv_wanted) {
947 bio_flinv_cv_wanted = 0;
948 cv_broadcast(&bio_flushinval_cv);
949 }
950 mutex_exit(&blist_lock);
951 }
952
953 /*
954 * Ensure that a specified block is up-to-date on disk.
955 */
956 void
 957 blkflush(dev_t dev, daddr_t blkno)
958 {
959 struct buf *bp, *dp;
960 struct hbuf *hp;
961 struct buf *sbp = NULL;
962 uint_t index;
963 kmutex_t *hmp;
964
965 index = bio_bhash(dev, blkno);
966 hp = &hbuf[index];
967 dp = (struct buf *)hp;
968 hmp = &hp->b_lock;
969
970 /*
971 * Identify the buffer in the cache belonging to
972 * this device and blkno (if any).
973 */
974 mutex_enter(hmp);
975 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
976 if (bp->b_blkno != blkno || bp->b_edev != dev ||
977 (bp->b_flags & B_STALE))
978 continue;
979 sbp = bp;
980 break;
981 }
982 mutex_exit(hmp);
983 if (sbp == NULL)
984 return;
985 /*
986 * Now check the buffer we have identified and
987 * make sure it still belongs to the device and is B_DELWRI
988 */
989 sema_p(&sbp->b_sem);
990 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
991 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
992 mutex_enter(hmp);
993 hp->b_length--;
994 notavail(sbp);
995 mutex_exit(hmp);
996 /*
997 * XXX - There is nothing to guarantee a synchronous
998 * write here if the B_ASYNC flag is set. This needs
999 * some investigation.
1000 */
1001 if (sbp->b_vp == NULL) { /* !ufs */
1002 BWRITE(sbp); /* synchronous write */
1003 } else { /* ufs */
1004 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1005 }
1006 } else {
1007 sema_v(&sbp->b_sem);
1008 }
1009 }
1010
1011 /*
1012 * Same as binval, except can force-invalidate delayed-write buffers
1013  * (which could not be flushed earlier because of device errors). Also
1014 * makes sure that the retry write flag is cleared.
1015 */
1016 int
1017 bfinval(dev_t dev, int force)
1018 {
1019 struct buf *dp;
1020 struct buf *bp;
1021 struct buf *binval_list = EMPTY_LIST;
1022 int i, error = 0;
1023 kmutex_t *hmp;
1024 uint_t index;
1025 struct buf **backp;
1026
1027 mutex_enter(&blist_lock);
1028 /*
1029 * Wait for any flushes ahead of us to finish, it's ok to
1030 * do invalidates in parallel.
1031 */
1032 while (bio_doingflush) {
1033 bio_flinv_cv_wanted = 1;
1034 cv_wait(&bio_flushinval_cv, &blist_lock);
1035 }
1036 bio_doinginval++;
1037
1038 /* Gather bp's */
1039 for (i = 0; i < v.v_hbuf; i++) {
1040 dp = (struct buf *)&hbuf[i];
1041 hmp = &hbuf[i].b_lock;
1042
1043 mutex_enter(hmp);
1044 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1045 if (bp->b_edev == dev) {
1046 if (bp->b_list == NULL) {
1047 bp->b_list = binval_list;
1048 binval_list = bp;
1049 }
1050 }
1051 }
1052 mutex_exit(hmp);
1053 }
1054 mutex_exit(&blist_lock);
1055
1056 /* Invalidate all bp's found */
1057 while (binval_list != EMPTY_LIST) {
1058 bp = binval_list;
1059
1060 sema_p(&bp->b_sem);
1061 if (bp->b_edev == dev) {
1062 if (force && (bp->b_flags & B_DELWRI)) {
1063 /* clear B_DELWRI, move to non-dw freelist */
1064 index = bio_bhash(bp->b_edev, bp->b_blkno);
1065 hmp = &hbuf[index].b_lock;
1066 dp = (struct buf *)&hbuf[index];
1067 mutex_enter(hmp);
1068
1069 /* remove from delayed write freelist */
1070 notavail(bp);
1071
1072 /* add to B_AGE side of non-dw freelist */
1073 backp = &dp->av_forw;
1074 (*backp)->av_back = bp;
1075 bp->av_forw = *backp;
1076 *backp = bp;
1077 bp->av_back = dp;
1078
1079 /*
1080 * make sure write retries and busy are cleared
1081 */
1082 bp->b_flags &=
1083 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1084 mutex_exit(hmp);
1085 }
1086 if ((bp->b_flags & B_DELWRI) == 0)
1087 bp->b_flags |= B_STALE|B_AGE;
1088 else
1089 error = EIO;
1090 }
1091 sema_v(&bp->b_sem);
1092 binval_list = bp->b_list;
1093 bp->b_list = NULL;
1094 }
1095 mutex_enter(&blist_lock);
1096 bio_doinginval--;
1097 if (bio_flinv_cv_wanted) {
1098 cv_broadcast(&bio_flushinval_cv);
1099 bio_flinv_cv_wanted = 0;
1100 }
1101 mutex_exit(&blist_lock);
1102 return (error);
1103 }
1104
1105 /*
1106 * If possible, invalidate blocks for a dev on demand
1107 */
1108 void
1109 binval(dev_t dev)
1110 {
1111 (void) bfinval(dev, 0);
1112 }
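
/*
 * Sketch (not compiled) of how these routines are typically combined on
 * the way to unmounting or closing a device: flush delayed writes, check
 * for stragglers, then invalidate what is left.  The error handling shown
 * here is an assumption for the example.
 */
#if 0
static int
example_quiesce_dev(dev_t dev)
{
	bflush(dev);			/* push out B_DELWRI buffers */
	if (bcheck(dev, NULL))		/* anything still busy or dirty? */
		return (EBUSY);
	return (bfinval(dev, 0));	/* invalidate; EIO if dirty bufs remain */
}
#endif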
1113
1114 /*
1115 * Initialize the buffer I/O system by freeing
1116 * all buffers and setting all device hash buffer lists to empty.
1117 */
1118 void
1119 binit(void)
1120 {
1121 struct buf *bp;
1122 unsigned int i, pct;
1123 ulong_t bio_max_hwm, bio_default_hwm;
1124
1125 /*
1126 * Maximum/Default values for bufhwm are set to the smallest of:
1127 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1128 * - 1/4 of kernel virtual memory
1129 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1130 * Additionally, in order to allow simple tuning by percentage of
1131 * physical memory, bufhwm_pct is used to calculate the default if
1132 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1133 *
1134 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1135 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1136 */
1137 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1138 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1139 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1140
1141 pct = BIO_BUF_PERCENT;
1142 if (bufhwm_pct != 0 &&
1143 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1144 pct = BIO_BUF_PERCENT;
1145 /*
1146 * Invalid user specified value, emit a warning.
1147 */
1148 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1149 range(1..%d). Using %d as default.",
1150 bufhwm_pct,
1151 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1152 }
1153
1154 bio_default_hwm = MIN(physmem / pct,
1155 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1156 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1157
1158 if ((v.v_bufhwm = bufhwm) == 0)
1159 v.v_bufhwm = bio_default_hwm;
1160
1161 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1162 v.v_bufhwm = (int)bio_max_hwm;
1163 /*
1164 * Invalid user specified value, emit a warning.
1165 */
1166 cmn_err(CE_WARN,
1167 "binit: bufhwm(%d) out \
1168 of range(%d..%lu). Using %lu as default",
1169 bufhwm,
1170 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1171 }
1172
1173 /*
1174 * Determine the number of hash buckets. Default is to
1175 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1176 * Round up number to the next power of 2.
1177 */
1178 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1179 BIO_HASHLEN);
1180 v.v_hmask = v.v_hbuf - 1;
1181 v.v_buf = BIO_BHDR_POOL;
1182
1183 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1184
1185 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1186
1187 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1188 bp = &bfreelist;
1189 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1190
1191 for (i = 0; i < v.v_hbuf; i++) {
1192 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1193 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1194
1195 /*
1196 * Initialize the delayed write buffer list.
1197 */
1198 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1199 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1200 }
1201 }
1202
1203 /*
1204 * Wait for I/O completion on the buffer; return error code.
1205 * If bp was for synchronous I/O, bp is invalid and associated
1206 * resources are freed on return.
1207 */
1208 int
1209 biowait(struct buf *bp)
1210 {
1211 int error = 0;
1212 struct cpu *cpup;
1213
1214 ASSERT(SEMA_HELD(&bp->b_sem));
1215
1216 cpup = CPU;
1217 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1218 DTRACE_IO1(wait__start, struct buf *, bp);
1219
1220 /*
1221 * In case of panic, busy wait for completion
1222 */
1223 if (panicstr) {
1224 while ((bp->b_flags & B_DONE) == 0)
1225 drv_usecwait(10);
1226 } else
1227 sema_p(&bp->b_io);
1228
1229 DTRACE_IO1(wait__done, struct buf *, bp);
1230 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1231
1232 error = geterror(bp);
1233 if ((bp->b_flags & B_ASYNC) == 0) {
1234 if (bp->b_flags & B_REMAPPED)
1235 bp_mapout(bp);
1236 }
1237 return (error);
1238 }
1239
1240 /*
1241 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1242 * and wake up anyone waiting for it.
1243 */
1244 void
1245 biodone(struct buf *bp)
1246 {
1247 if (bp->b_flags & B_STARTED) {
1248 DTRACE_IO1(done, struct buf *, bp);
1249 bp->b_flags &= ~B_STARTED;
1250 }
1251
1252 if (bp->b_iodone != NULL) {
1253 (*(bp->b_iodone))(bp);
1254 return;
1255 }
1256 ASSERT((bp->b_flags & B_DONE) == 0);
1257 ASSERT(SEMA_HELD(&bp->b_sem));
1258 bp->b_flags |= B_DONE;
1259 if (bp->b_flags & B_ASYNC) {
1260 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1261 bio_pageio_done(bp);
1262 else
1263 brelse(bp); /* release bp to freelist */
1264 } else {
1265 sema_v(&bp->b_io);
1266 }
1267 }
1268
1269 /*
1270 * Pick up the device's error number and pass it to the user;
1271 * if there is an error but the number is 0 set a generalized code.
1272 */
1273 int
1274 geterror(struct buf *bp)
1275 {
1276 int error = 0;
1277
1278 ASSERT(SEMA_HELD(&bp->b_sem));
1279 if (bp->b_flags & B_ERROR) {
1280 error = bp->b_error;
1281 if (!error)
1282 error = EIO;
1283 }
1284 return (error);
1285 }
1286
1287 /*
1288 * Support for pageio buffers.
1289 *
1290  * This stuff should be generalized to provide a common bp
1291 * header facility that can be used for things other than pageio.
1292 */
1293
1294 /*
1295 * Allocate and initialize a buf struct for use with pageio.
1296 */
1297 struct buf *
1298 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1299 {
1300 struct buf *bp;
1301 struct cpu *cpup;
1302
1303 if (flags & B_READ) {
1304 CPU_STATS_ENTER_K();
1305 cpup = CPU; /* get pointer AFTER preemption is disabled */
1306 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1307 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1308
1309 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1310
1311 if ((flags & B_ASYNC) == 0) {
1312 klwp_t *lwp = ttolwp(curthread);
1313 if (lwp != NULL)
1314 lwp->lwp_ru.majflt++;
1315 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1316 }
1317 /*
1318 * Update statistics for pages being paged in
1319 */
1320 if (pp != NULL && pp->p_vnode != NULL) {
1321 if (IS_SWAPFSVP(pp->p_vnode)) {
1322 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1323 atomic_add_64(&curzone->zone_anonpgin,
1324 btopr(len));
1325 } else {
1326 if (pp->p_vnode->v_flag & VVMEXEC) {
1327 CPU_STATS_ADDQ(cpup, vm, execpgin,
1328 btopr(len));
1329 atomic_add_64(&curzone->zone_execpgin,
1330 btopr(len));
1331 } else {
1332 CPU_STATS_ADDQ(cpup, vm, fspgin,
1333 btopr(len));
1334 atomic_add_64(&curzone->zone_fspgin,
1335 btopr(len));
1336 }
1337 }
1338 }
1339 CPU_STATS_EXIT_K();
1340 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1341 "page_ws_in:pp %p", pp);
1342 }
1343
1344 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1345 bp->b_bcount = len;
1346 bp->b_bufsize = len;
1347 bp->b_pages = pp;
1348 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1349 bp->b_offset = -1;
1350 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1351
1352 /* Initialize bp->b_sem in "locked" state */
1353 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1354
1355 VN_HOLD(vp);
1356 bp->b_vp = vp;
1357
1358 /*
1359 * Caller sets dev & blkno and can adjust
1360 * b_addr for page offset and can use bp_mapin
1361 * to make pages kernel addressable.
1362 */
1363 return (bp);
1364 }
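
/*
 * Sketch (not compiled) of the usual pageio_setup() calling pattern hinted
 * at in the comment above: the caller fills in the device and block, issues
 * the I/O, and (for a synchronous request) waits and tears the buf down.
 * The direct bdev_strategy() call is an assumption; file systems normally
 * route this through their own I/O paths.
 */
#if 0
static int
example_page_write(struct page *pp, size_t len, struct vnode *vp,
    dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	int err;

	bp = pageio_setup(pp, len, vp, B_WRITE);
	bp->b_edev = dev;
	bp->b_dev = (o_dev_t)cmpdev(dev);
	bp->b_blkno = blkno;

	(void) bdev_strategy(bp);
	err = biowait(bp);
	pageio_done(bp);		/* releases the vnode hold, frees bp */
	return (err);
}
#endif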
1365
1366 void
1367 pageio_done(struct buf *bp)
1368 {
1369 ASSERT(SEMA_HELD(&bp->b_sem));
1370 if (bp->b_flags & B_REMAPPED)
1371 bp_mapout(bp);
1372 VN_RELE(bp->b_vp);
1373 bp->b_vp = NULL;
1374 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1375
1376 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1377 sema_destroy(&bp->b_sem);
1378 sema_destroy(&bp->b_io);
1379 kmem_free(bp, sizeof (struct buf));
1380 }
1381
1382 /*
1383 * Check to see whether the buffers, except the one pointed by sbp,
1384 * associated with the device are busy.
1385  * NOTE: This expensive operation should be improved together with ufs_icheck().
1386 */
1387 int
1388 bcheck(dev_t dev, struct buf *sbp)
1389 {
1390 struct buf *bp;
1391 struct buf *dp;
1392 int i;
1393 kmutex_t *hmp;
1394
1395 /*
1396 * check for busy bufs for this filesystem
1397 */
1398 for (i = 0; i < v.v_hbuf; i++) {
1399 dp = (struct buf *)&hbuf[i];
1400 hmp = &hbuf[i].b_lock;
1401
1402 mutex_enter(hmp);
1403 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1404 /*
1405 * if buf is busy or dirty, then filesystem is busy
1406 */
1407 if ((bp->b_edev == dev) &&
1408 ((bp->b_flags & B_STALE) == 0) &&
1409 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1410 (bp != sbp)) {
1411 mutex_exit(hmp);
1412 return (1);
1413 }
1414 }
1415 mutex_exit(hmp);
1416 }
1417 return (0);
1418 }
1419
1420 /*
1421 * Hash two 32 bit entities.
1422 */
1423 int
1424 hash2ints(int x, int y)
1425 {
1426 int hash = 0;
1427
1428 hash = x - 1;
1429 hash = ((hash * 7) + (x >> 8)) - 1;
1430 hash = ((hash * 7) + (x >> 16)) - 1;
1431 hash = ((hash * 7) + (x >> 24)) - 1;
1432 hash = ((hash * 7) + y) - 1;
1433 hash = ((hash * 7) + (y >> 8)) - 1;
1434 hash = ((hash * 7) + (y >> 16)) - 1;
1435 hash = ((hash * 7) + (y >> 24)) - 1;
1436
1437 return (hash);
1438 }
1439
1440
1441 /*
1442 * Return a new buffer struct.
1443 * Create a new buffer if we haven't gone over our high water
1444 * mark for memory, otherwise try to get one off the freelist.
1445 *
1446 * Returns a locked buf that has no id and is not on any hash or free
1447 * list.
1448 */
1449 static struct buf *
1450 bio_getfreeblk(long bsize)
1451 {
1452 struct buf *bp, *dp;
1453 struct hbuf *hp;
1454 kmutex_t *hmp;
1455 uint_t start, end;
1456
1457 /*
1458  * bfreelist.b_bufsize represents the amount of memory
1459  * we are allowed to allocate in the cache before we hit our hwm;
1460  * references to bfreelist are protected by bfree_lock
1461  * (mutex_enter(&bfree_lock) ... mutex_exit(&bfree_lock)).
1462 */
1463 bio_mem_get(bsize); /* Account for our memory request */
1464
1465 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1466 sema_p(&bp->b_sem); /* Should never fail */
1467
1468 ASSERT(bp->b_un.b_addr == NULL);
1469 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1470 if (bp->b_un.b_addr != NULL) {
1471 /*
1472 * Make the common path short
1473 */
1474 bp->b_bufsize = bsize;
1475 ASSERT(SEMA_HELD(&bp->b_sem));
1476 return (bp);
1477 } else {
1478 struct buf *save;
1479
1480 save = bp; /* Save bp we allocated */
1481 start = end = lastindex;
1482
1483 biostats.bio_bufwant.value.ui32++;
1484
1485 /*
1486 * Memory isn't available from the system now. Scan
1487 * the hash buckets till enough space is found.
1488 */
1489 do {
1490 hp = &hbuf[start];
1491 hmp = &hp->b_lock;
1492 dp = (struct buf *)hp;
1493
1494 mutex_enter(hmp);
1495 bp = dp->av_forw;
1496
1497 while (bp != dp) {
1498
1499 ASSERT(bp != NULL);
1500
1501 if (!sema_tryp(&bp->b_sem)) {
1502 bp = bp->av_forw;
1503 continue;
1504 }
1505
1506 /*
1507 * Since we are going down the freelist
1508 * associated with this hash bucket the
1509 * B_DELWRI flag should not be set.
1510 */
1511 ASSERT(!(bp->b_flags & B_DELWRI));
1512
1513 if (bp->b_bufsize == bsize) {
1514 hp->b_length--;
1515 notavail(bp);
1516 bremhash(bp);
1517 mutex_exit(hmp);
1518
1519 /*
1520 * Didn't kmem_alloc any more, so don't
1521 * count it twice.
1522 */
1523 mutex_enter(&bfree_lock);
1524 bfreelist.b_bufsize += bsize;
1525 mutex_exit(&bfree_lock);
1526
1527 /*
1528 * Update the lastindex value.
1529 */
1530 lastindex = start;
1531
1532 /*
1533 * Put our saved bp back on the list
1534 */
1535 sema_v(&save->b_sem);
1536 bio_bhdr_free(save);
1537 ASSERT(SEMA_HELD(&bp->b_sem));
1538 return (bp);
1539 }
1540 sema_v(&bp->b_sem);
1541 bp = bp->av_forw;
1542 }
1543 mutex_exit(hmp);
1544 start = ((start + 1) % v.v_hbuf);
1545 } while (start != end);
1546
1547 biostats.bio_bufwait.value.ui32++;
1548 bp = save; /* Use original bp */
1549 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1550 }
1551
1552 bp->b_bufsize = bsize;
1553 ASSERT(SEMA_HELD(&bp->b_sem));
1554 return (bp);
1555 }
1556
1557 /*
1558 * Allocate a buffer header. If none currently available, allocate
1559 * a new pool.
1560 */
1561 static struct buf *
1562 bio_bhdr_alloc(void)
1563 {
1564 struct buf *dp, *sdp;
1565 struct buf *bp;
1566 int i;
1567
1568 for (;;) {
1569 mutex_enter(&bhdr_lock);
1570 if (bhdrlist != NULL) {
1571 bp = bhdrlist;
1572 bhdrlist = bp->av_forw;
1573 mutex_exit(&bhdr_lock);
1574 bp->av_forw = NULL;
1575 return (bp);
1576 }
1577 mutex_exit(&bhdr_lock);
1578
1579 /*
1580 * Need to allocate a new pool. If the system is currently
1581 * out of memory, then try freeing things on the freelist.
1582 */
1583 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1584 if (dp == NULL) {
1585 /*
1586 * System can't give us a pool of headers, try
1587 * recycling from the free lists.
1588 */
1589 bio_recycle(BIO_HEADER, 0);
1590 } else {
1591 sdp = dp;
1592 for (i = 0; i < v.v_buf; i++, dp++) {
1593 /*
1594 * The next two lines are needed since NODEV
1595 * is -1 and not NULL
1596 */
1597 dp->b_dev = (o_dev_t)NODEV;
1598 dp->b_edev = NODEV;
1599 dp->av_forw = dp + 1;
1600 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1601 NULL);
1602 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1603 NULL);
1604 dp->b_offset = -1;
1605 }
1606 mutex_enter(&bhdr_lock);
1607 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1608 bhdrlist = sdp;
1609 nbuf += v.v_buf;
1610 bp = bhdrlist;
1611 bhdrlist = bp->av_forw;
1612 mutex_exit(&bhdr_lock);
1613
1614 bp->av_forw = NULL;
1615 return (bp);
1616 }
1617 }
1618 }
1619
1620 static void
1621 bio_bhdr_free(struct buf *bp)
1622 {
1623 ASSERT(bp->b_back == NULL);
1624 ASSERT(bp->b_forw == NULL);
1625 ASSERT(bp->av_back == NULL);
1626 ASSERT(bp->av_forw == NULL);
1627 ASSERT(bp->b_un.b_addr == NULL);
1628 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1629 ASSERT(bp->b_edev == NODEV);
1630 ASSERT(bp->b_flags == 0);
1631
1632 mutex_enter(&bhdr_lock);
1633 bp->av_forw = bhdrlist;
1634 bhdrlist = bp;
1635 mutex_exit(&bhdr_lock);
1636 }
1637
1638 /*
1639 * If we haven't gone over the high water mark, it's o.k. to
1640 * allocate more buffer space, otherwise recycle buffers
1641 * from the freelist until enough memory is free for a bsize request.
1642 *
1643 * We account for this memory, even though
1644 * we don't allocate it here.
1645 */
1646 static void
1647 bio_mem_get(long bsize)
1648 {
1649 mutex_enter(&bfree_lock);
1650 if (bfreelist.b_bufsize > bsize) {
1651 bfreelist.b_bufsize -= bsize;
1652 mutex_exit(&bfree_lock);
1653 return;
1654 }
1655 mutex_exit(&bfree_lock);
1656 bio_recycle(BIO_MEM, bsize);
1657 }
1658
1659 /*
1660 * flush a list of delayed write buffers.
1661 * (currently used only by bio_recycle below.)
1662 */
1663 static void
1664 bio_flushlist(struct buf *delwri_list)
1665 {
1666 struct buf *bp;
1667
1668 while (delwri_list != EMPTY_LIST) {
1669 bp = delwri_list;
1670 bp->b_flags |= B_AGE | B_ASYNC;
1671 if (bp->b_vp == NULL) { /* !ufs */
1672 BWRITE(bp);
1673 } else { /* ufs */
1674 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1675 }
1676 delwri_list = bp->b_list;
1677 bp->b_list = NULL;
1678 }
1679 }
1680
1681 /*
1682 * Start recycling buffers on the freelist for one of 2 reasons:
1683 * - we need a buffer header
1684 * - we need to free up memory
1685 * Once started we continue to recycle buffers until the B_AGE
1686 * buffers are gone.
1687 */
1688 static void
1689 bio_recycle(int want, long bsize)
1690 {
1691 struct buf *bp, *dp, *dwp, *nbp;
1692 struct hbuf *hp;
1693 int found = 0;
1694 kmutex_t *hmp;
1695 int start, end;
1696 struct buf *delwri_list = EMPTY_LIST;
1697
1698 /*
1699 * Recycle buffers.
1700 */
1701 top:
1702 start = end = lastindex;
1703 do {
1704 hp = &hbuf[start];
1705 hmp = &hp->b_lock;
1706 dp = (struct buf *)hp;
1707
1708 mutex_enter(hmp);
1709 bp = dp->av_forw;
1710
1711 while (bp != dp) {
1712
1713 ASSERT(bp != NULL);
1714
1715 if (!sema_tryp(&bp->b_sem)) {
1716 bp = bp->av_forw;
1717 continue;
1718 }
1719 /*
1720 * Do we really want to nuke all of the B_AGE stuff??
1721 */
1722 if ((bp->b_flags & B_AGE) == 0 && found) {
1723 sema_v(&bp->b_sem);
1724 mutex_exit(hmp);
1725 lastindex = start;
1726 return; /* All done */
1727 }
1728
1729 ASSERT(MUTEX_HELD(&hp->b_lock));
1730 ASSERT(!(bp->b_flags & B_DELWRI));
1731 hp->b_length--;
1732 notavail(bp);
1733
1734 /*
1735 * Remove bhdr from cache, free up memory,
1736 * and add the hdr to the freelist.
1737 */
1738 bremhash(bp);
1739 mutex_exit(hmp);
1740
1741 if (bp->b_bufsize) {
1742 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1743 bp->b_un.b_addr = NULL;
1744 mutex_enter(&bfree_lock);
1745 bfreelist.b_bufsize += bp->b_bufsize;
1746 mutex_exit(&bfree_lock);
1747 }
1748
1749 bp->b_dev = (o_dev_t)NODEV;
1750 bp->b_edev = NODEV;
1751 bp->b_flags = 0;
1752 sema_v(&bp->b_sem);
1753 bio_bhdr_free(bp);
1754 if (want == BIO_HEADER) {
1755 found = 1;
1756 } else {
1757 ASSERT(want == BIO_MEM);
1758 if (!found && bfreelist.b_bufsize >= bsize) {
1759 /* Account for the memory we want */
1760 mutex_enter(&bfree_lock);
1761 if (bfreelist.b_bufsize >= bsize) {
1762 bfreelist.b_bufsize -= bsize;
1763 found = 1;
1764 }
1765 mutex_exit(&bfree_lock);
1766 }
1767 }
1768
1769 /*
1770  * Since we dropped hmp, start from the
1771  * beginning.
1772 */
1773 mutex_enter(hmp);
1774 bp = dp->av_forw;
1775 }
1776 mutex_exit(hmp);
1777
1778 /*
1779 * Look at the delayed write list.
1780 * First gather into a private list, then write them.
1781 */
1782 dwp = (struct buf *)&dwbuf[start];
1783 mutex_enter(&blist_lock);
1784 bio_doingflush++;
1785 mutex_enter(hmp);
1786 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1787
1788 ASSERT(bp != NULL);
1789 nbp = bp->av_forw;
1790
1791 if (!sema_tryp(&bp->b_sem))
1792 continue;
1793 ASSERT(bp->b_flags & B_DELWRI);
1794 /*
1795 * Do we really want to nuke all of the B_AGE stuff??
1796 */
1797
1798 if ((bp->b_flags & B_AGE) == 0 && found) {
1799 sema_v(&bp->b_sem);
1800 mutex_exit(hmp);
1801 lastindex = start;
1802 mutex_exit(&blist_lock);
1803 bio_flushlist(delwri_list);
1804 mutex_enter(&blist_lock);
1805 bio_doingflush--;
1806 if (bio_flinv_cv_wanted) {
1807 bio_flinv_cv_wanted = 0;
1808 cv_broadcast(&bio_flushinval_cv);
1809 }
1810 mutex_exit(&blist_lock);
1811 return; /* All done */
1812 }
1813
1814 /*
1815 * If the buffer is already on a flush or
1816 * invalidate list then just skip it.
1817 */
1818 if (bp->b_list != NULL) {
1819 sema_v(&bp->b_sem);
1820 continue;
1821 }
1822 /*
1823 * We are still on the same bucket.
1824 */
1825 hp->b_length--;
1826 notavail(bp);
1827 bp->b_list = delwri_list;
1828 delwri_list = bp;
1829 }
1830 mutex_exit(hmp);
1831 mutex_exit(&blist_lock);
1832 bio_flushlist(delwri_list);
1833 delwri_list = EMPTY_LIST;
1834 mutex_enter(&blist_lock);
1835 bio_doingflush--;
1836 if (bio_flinv_cv_wanted) {
1837 bio_flinv_cv_wanted = 0;
1838 cv_broadcast(&bio_flushinval_cv);
1839 }
1840 mutex_exit(&blist_lock);
1841 start = (start + 1) % v.v_hbuf;
1842
1843 } while (start != end);
1844
1845 if (found)
1846 return;
1847
1848 /*
1849 * Free lists exhausted and we haven't satisfied the request.
1850 * Wait here for more entries to be added to freelist.
1851 * Because this might have just happened, make it timed.
1852 */
1853 mutex_enter(&bfree_lock);
1854 bfreelist.b_flags |= B_WANTED;
1855 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1856 mutex_exit(&bfree_lock);
1857 goto top;
1858 }
1859
1860 /*
1861 * See if the block is associated with some buffer
1862 * (mainly to avoid getting hung up on a wait in breada).
1863 */
1864 static int
1865 bio_incore(dev_t dev, daddr_t blkno)
1866 {
1867 struct buf *bp;
1868 struct buf *dp;
1869 uint_t index;
1870 kmutex_t *hmp;
1871
1872 index = bio_bhash(dev, blkno);
1873 dp = (struct buf *)&hbuf[index];
1874 hmp = &hbuf[index].b_lock;
1875
1876 mutex_enter(hmp);
1877 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1878 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1879 (bp->b_flags & B_STALE) == 0) {
1880 mutex_exit(hmp);
1881 return (1);
1882 }
1883 }
1884 mutex_exit(hmp);
1885 return (0);
1886 }
1887
1888 static void
1889 bio_pageio_done(struct buf *bp)
1890 {
1891 if (bp->b_flags & B_PAGEIO) {
1892
1893 if (bp->b_flags & B_REMAPPED)
1894 bp_mapout(bp);
1895
1896 if (bp->b_flags & B_READ)
1897 pvn_read_done(bp->b_pages, bp->b_flags);
1898 else
1899 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1900 pageio_done(bp);
1901 } else {
1902 ASSERT(bp->b_flags & B_REMAPPED);
1903 bp_mapout(bp);
1904 brelse(bp);
1905 }
1906 }
1907
1908 /*
1909 * bioerror(9F) - indicate error in buffer header
1910 * If 'error' is zero, remove the error indication.
1911 */
1912 void
1913 bioerror(struct buf *bp, int error)
1914 {
1915 ASSERT(bp != NULL);
1916 ASSERT(error >= 0);
1917 ASSERT(SEMA_HELD(&bp->b_sem));
1918
1919 if (error != 0) {
1920 bp->b_flags |= B_ERROR;
1921 } else {
1922 bp->b_flags &= ~B_ERROR;
1923 }
1924 bp->b_error = error;
1925 }
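
/*
 * Sketch (not compiled) of the usual bioerror(9F) pattern in a driver's
 * strategy(9E) routine: flag the error on the buf, then complete it.  The
 * simple bounds check is an assumption for the example.
 */
#if 0
static int
example_strategy(struct buf *bp, daddr_t nblocks)
{
	if (bp->b_blkno >= nblocks) {
		bioerror(bp, EINVAL);	/* sets B_ERROR and b_error */
		biodone(bp);		/* complete the request */
		return (0);
	}
	/* ... otherwise queue the request to the hardware ... */
	return (0);
}
#endif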
1926
1927 /*
1928 * bioreset(9F) - reuse a private buffer header after I/O is complete
1929 */
1930 void
1931 bioreset(struct buf *bp)
1932 {
1933 ASSERT(bp != NULL);
1934
1935 biofini(bp);
1936 bioinit(bp);
1937 }
1938
1939 /*
1940 * biosize(9F) - return size of a buffer header
1941 */
1942 size_t
1943 biosize(void)
1944 {
1945 return (sizeof (struct buf));
1946 }
1947
1948 /*
1949 * biomodified(9F) - check if buffer is modified
1950 */
1951 int
1952 biomodified(struct buf *bp)
1953 {
1954 int npf;
1955 int ppattr;
1956 struct page *pp;
1957
1958 ASSERT(bp != NULL);
1959
1960 if ((bp->b_flags & B_PAGEIO) == 0) {
1961 return (-1);
1962 }
1963 pp = bp->b_pages;
1964 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1965
1966 while (npf > 0) {
1967 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1968 HAT_SYNC_STOPON_MOD);
1969 if (ppattr & P_MOD)
1970 return (1);
1971 pp = pp->p_next;
1972 npf--;
1973 }
1974
1975 return (0);
1976 }
1977
1978 /*
1979 * bioinit(9F) - initialize a buffer structure
1980 */
1981 void
1982 bioinit(struct buf *bp)
1983 {
1984 bzero(bp, sizeof (struct buf));
1985 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1986 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1987 bp->b_offset = -1;
1988 }
1989
1990 /*
1991 * biofini(9F) - uninitialize a buffer structure
1992 */
1993 void
1994 biofini(struct buf *bp)
1995 {
1996 sema_destroy(&bp->b_io);
1997 sema_destroy(&bp->b_sem);
1998 }
1999
2000 /*
2001 * bioclone(9F) - clone a buffer
2002 */
2003 struct buf *
2004 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2005 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2006 {
2007 struct buf *bufp;
2008
2009 ASSERT(bp);
2010 if (bp_mem == NULL) {
2011 bufp = kmem_alloc(sizeof (struct buf), sleep);
2012 if (bufp == NULL) {
2013 return (NULL);
2014 }
2015 bioinit(bufp);
2016 } else {
2017 bufp = bp_mem;
2018 bioreset(bufp);
2019 }
2020
2021 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2022 B_ABRWRITE)
2023
2024 /*
2025 * The cloned buffer does not inherit the B_REMAPPED flag.
2026 */
2027 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2028 bufp->b_bcount = len;
2029 bufp->b_blkno = blkno;
2030 bufp->b_iodone = iodone;
2031 bufp->b_proc = bp->b_proc;
2032 bufp->b_edev = dev;
2033 bufp->b_file = bp->b_file;
2034 bufp->b_offset = bp->b_offset;
2035
2036 if (bp->b_flags & B_SHADOW) {
2037 ASSERT(bp->b_shadow);
2038 ASSERT(bp->b_flags & B_PHYS);
2039
2040 bufp->b_shadow = bp->b_shadow +
2041 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2042 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2043 if (bp->b_flags & B_REMAPPED)
2044 bufp->b_proc = NULL;
2045 } else {
2046 if (bp->b_flags & B_PAGEIO) {
2047 struct page *pp;
2048 off_t o;
2049 int i;
2050
2051 pp = bp->b_pages;
2052 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2053 for (i = btop(o); i > 0; i--) {
2054 pp = pp->p_next;
2055 }
2056 bufp->b_pages = pp;
2057 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2058 } else {
2059 bufp->b_un.b_addr =
2060 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2061 if (bp->b_flags & B_REMAPPED)
2062 bufp->b_proc = NULL;
2063 }
2064 }
2065 return (bufp);
2066 }
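
/*
 * Sketch (not compiled) of cloning a sub-range of an existing buf, e.g. to
 * send part of a request to a different block address.  The synchronous
 * bdev_strategy()/biowait() pairing and the kmem-backed clone (bp_mem ==
 * NULL, KM_SLEEP) are assumptions for the example.
 */
#if 0
static int
example_partial_io(struct buf *bp, off_t off, size_t len, dev_t dev,
    daddr_t blkno)
{
	struct buf *cb;
	int err;

	cb = bioclone(bp, off, len, dev, blkno, NULL, NULL, KM_SLEEP);
	(void) bdev_strategy(cb);
	err = biowait(cb);

	biofini(cb);			/* undo the bioinit() done by bioclone() */
	kmem_free(cb, biosize());
	return (err);
}
#endif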
2067