/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2011 Joyent, Inc.  All rights reserved.
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
    sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
 * to ufs routines don't get plugged into bio.c calls so
 * we initialize it when setting up the "lufsops" table
 * in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct;		/* ditto - given in % of physmem */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */
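
/*
 * Illustrative sketch (not part of the original interface): a typical
 * read-modify-write cycle over the routines described above.  The
 * device, block number, and block size are hypothetical, and the
 * BIO_EXAMPLE guard simply keeps the sketch out of the normal build.
 */
#ifdef	BIO_EXAMPLE
static void
bio_example_update(dev_t dev, daddr_t blkno, long bsize)
{
	struct buf *bp;

	bp = bread(dev, blkno, bsize);	/* returns with b_sem held */
	if (bp->b_flags & B_ERROR) {
		brelse(bp);		/* release, no I/O implied */
		return;
	}
	bp->b_un.b_addr[0] = 0;		/* modify the cached block */
	bdwrite(bp);			/* mark delayed-write and release */
}
#endif	/* BIO_EXAMPLE */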

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist.  Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list is
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone.  Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery.  So better bail out from
			 * here since sema_p() won't block.  If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == NULL);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer.  Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error.  At this point we couldn't
	 * get a buffer, but we have to return one.  Hence we allocate some
	 * kind of error reply buffer on the fly.  This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause this buffer to be freed in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate a block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}
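
/*
 * Illustrative sketch (not part of the original interface): using
 * ngeteblk() for a one-shot, uncached transfer.  Because the buffer is
 * marked B_NOCACHE, the final brelse() destroys it rather than caching
 * it.  The parameters are hypothetical and the BIO_EXAMPLE guard keeps
 * the sketch out of the normal build.
 */
#ifdef	BIO_EXAMPLE
static int
bio_example_rawread(dev_t dev, daddr_t blkno, long bsize)
{
	struct buf *bp;
	int error;

	bp = ngeteblk(bsize);		/* locked, not on any hash/free list */
	bp->b_edev = dev;
	bp->b_dev = (o_dev_t)cmpdev(dev);
	bp->b_blkno = blkno;
	bp->b_flags |= B_READ;
	(void) bdev_strategy(bp);
	error = biowait(bp);		/* woken via biodone() */
	brelse(bp);			/* B_NOCACHE: header and data freed */
	return (error);
}
#endif	/* BIO_EXAMPLE */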

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffers for the device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock.  So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {			/* !ufs */
				BWRITE(bp);
			} else {				/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which may not already be flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 * - 1/4 of kernel virtual memory
	 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets.  Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}

static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device,		device,		bp->b_edev,
	    tnf_diskaddr,	block,		bp->b_lblkno,
	    tnf_opaque,		buf,		bp);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		sema_v(&bp->b_io);
	}
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;
	}
	return (error);
}

/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));

		atomic_add_64(&curzone->zone_pgpgin, btopr(len));

		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque,		vnode,		pp->p_vnode,
			    tnf_offset,		offset,		pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
				atomic_add_64(&curzone->zone_anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque,	vnode,	pp->p_vnode,
		    tnf_offset,	offset,	pp->p_offset,
		    tnf_size,	size,	len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
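
/*
 * Illustrative sketch (not part of the original interface): starting
 * asynchronous read page-I/O on a locked page list.  For an async
 * B_PAGEIO buffer, biodone() ends up in bio_pageio_done(), which calls
 * pvn_read_done() and pageio_done() on our behalf.  The arguments are
 * hypothetical and the BIO_EXAMPLE guard keeps the sketch out of the
 * normal build.
 */
#ifdef	BIO_EXAMPLE
static void
bio_example_pagein(struct page *pp, size_t len, struct vnode *vp,
    dev_t dev, daddr_t blkno)
{
	struct buf *bp;

	bp = pageio_setup(pp, len, vp, B_READ | B_ASYNC);
	bp->b_edev = dev;
	bp->b_dev = (o_dev_t)cmpdev(dev);
	bp->b_blkno = blkno;
	bp->b_un.b_addr = (caddr_t)0;	/* byte offset into the first page */
	(void) bdev_strategy(bp);
}
#endif	/* BIO_EXAMPLE */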

void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}

/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf	*bp;
	struct buf	*dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}


/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t	*hmp;
	uint_t		start, end;

	/*
	 * bfreelist.b_bufsize (protected by bfree_lock) represents the
	 * amount of memory we are allowed to allocate in the cache
	 * before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

again:
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now.  Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;		/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}

/*
 * Allocate a buffer header.  If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool.  If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}

static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}

/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	bio_recycle(BIO_MEM, bsize);
}

/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {			/* !ufs */
			BWRITE(bp);
		} else {				/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}

/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return;	/* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	dp = (struct buf *)&hbuf[index];
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno == blkno && bp->b_edev == dev &&
		    (bp->b_flags & B_STALE) == 0) {
			mutex_exit(hmp);
			return (1);
		}
	}
	mutex_exit(hmp);
	return (0);
}

static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}

/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(bp != NULL);
	ASSERT(error >= 0);
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	bp->b_error = error;
}

/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}

/*
 * biomodified(9F) - check if buffer is modified
 */
int
biomodified(struct buf *bp)
{
	int npf;
	int ppattr;
	struct page *pp;

	ASSERT(bp != NULL);

	if ((bp->b_flags & B_PAGEIO) == 0) {
		return (-1);
	}
	pp = bp->b_pages;
	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

	while (npf > 0) {
		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		    HAT_SYNC_STOPON_MOD);
		if (ppattr & P_MOD)
			return (1);
		pp = pp->p_next;
		npf--;
	}

	return (0);
}

/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}

/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}

/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}
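
/*
 * Illustrative sketch (not part of the original interface): carving a
 * child buffer out of a parent transfer with bioclone(9F), reusing a
 * caller-supplied header.  The offset, length, and device/block values
 * are hypothetical and the BIO_EXAMPLE guard keeps the sketch out of
 * the normal build.
 */
#ifdef	BIO_EXAMPLE
static struct buf *
bio_example_clone(struct buf *parent, off_t off, size_t len,
    dev_t dev, daddr_t blkno)
{
	struct buf *child;

	child = kmem_alloc(biosize(), KM_SLEEP);
	bioinit(child);		/* bp_mem handed to bioclone() must be initialized */
	(void) bioclone(parent, off, len, dev, blkno, NULL, child, KM_SLEEP);

	/*
	 * The caller issues the I/O on `child', waits with biowait(),
	 * and finally releases it with biofini() and kmem_free().
	 */
	return (child);
}
#endif	/* BIO_EXAMPLE */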