1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 #include <sys/types.h> 42 #include <sys/t_lock.h> 43 #include <sys/sysmacros.h> 44 #include <sys/conf.h> 45 #include <sys/cpuvar.h> 46 #include <sys/errno.h> 47 #include <sys/debug.h> 48 #include <sys/buf.h> 49 #include <sys/var.h> 50 #include <sys/vnode.h> 51 #include <sys/bitmap.h> 52 #include <sys/cmn_err.h> 53 #include <sys/kmem.h> 54 #include <sys/vmem.h> 55 #include <sys/atomic.h> 56 #include <vm/seg_kmem.h> 57 #include <vm/page.h> 58 #include <vm/pvn.h> 59 #include <sys/vtrace.h> 60 #include <sys/tnf_probe.h> 61 #include <sys/fs/ufs_inode.h> 62 #include <sys/fs/ufs_bio.h> 63 #include <sys/fs/ufs_log.h> 64 #include <sys/systm.h> 65 #include <sys/vfs.h> 66 #include <sys/sdt.h> 67 68 /* Locks */ 69 static kmutex_t blist_lock; /* protects b_list */ 70 static kmutex_t bhdr_lock; /* protects the bhdrlist */ 71 static kmutex_t bfree_lock; /* protects the bfreelist structure */ 72 73 struct hbuf *hbuf; /* Hash buckets */ 74 struct dwbuf *dwbuf; /* Delayed write buckets */ 75 static struct buf *bhdrlist; /* buf header free list */ 76 static int nbuf; /* number of buffer headers allocated */ 77 78 static int lastindex; /* Reference point on where to start */ 79 /* when looking for free buffers */ 80 81 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask) 82 #define EMPTY_LIST ((struct buf *)-1) 83 84 static kcondvar_t bio_mem_cv; /* Condition variables */ 85 static kcondvar_t bio_flushinval_cv; 86 static int bio_doingflush; /* flush in progress */ 87 static int bio_doinginval; /* inval in progress */ 88 static int bio_flinv_cv_wanted; /* someone waiting for cv */ 89 90 /* 91 * Statistics on the buffer cache 92 */ 93 struct biostats biostats = { 94 { "buffer_cache_lookups", KSTAT_DATA_UINT32 }, 95 { "buffer_cache_hits", KSTAT_DATA_UINT32 }, 96 { "new_buffer_requests", KSTAT_DATA_UINT32 }, 97 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 }, 98 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 }, 99 { "duplicate_buffers_found", KSTAT_DATA_UINT32 } 100 }; 101 102 /* 103 * kstat data 104 */ 105 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats; 106 uint_t biostats_ndata = 
(uint_t)(sizeof (biostats) /
	sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads", KSTAT_DATA_UINT32 },
	{ "bwrites", KSTAT_DATA_UINT32 },
	{ "fbiwrites", KSTAT_DATA_UINT32 },
	{ "getpages", KSTAT_DATA_UINT32 },
	{ "getras", KSTAT_DATA_UINT32 },
	{ "putsyncs", KSTAT_DATA_UINT32 },
	{ "putasyncs", KSTAT_DATA_UINT32 },
	{ "putpageios", KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
 * to ufs routines don't get plugged into bio.c calls so
 * we initialize it when setting up the "lufsops" table
 * in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf *bio_getfreeblk(long);
static void bio_mem_get(long);
static void bio_bhdr_free(struct buf *);
static struct buf *bio_bhdr_alloc(void);
static void bio_recycle(int, long);
static void bio_pageio_done(struct buf *);
static int bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern int bufhwm;		/* User tunable - high water mark for mem */
extern int bufhwm_pct;		/* ditto - given in % of physmem */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
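 *
 * Illustrative usage sketch (added example, not part of the original
 * source): a typical caller reads a block, checks for an error, copies
 * the data out, and releases the buffer.  "dev", "blkno", "bsize" and
 * the destination "data" are assumed to be supplied by the caller:
 *
 *	struct buf *bp;
 *	int err;
 *
 *	bp = bread(dev, blkno, bsize);
 *	if ((err = geterror(bp)) != 0) {
 *		brelse(bp);
 *		return (err);
 *	}
 *	bcopy(bp->b_un.b_addr, data, bsize);
 *	brelse(bp);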
200 */ 201 struct buf * 202 bread(dev_t dev, daddr_t blkno, long bsize) 203 { 204 return (BREAD(dev, blkno, bsize)); 205 } 206 207 /* 208 * Common code for reading a buffer with various options 209 * 210 * Read in (if necessary) the block and return a buffer pointer. 211 */ 212 struct buf * 213 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize) 214 { 215 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg; 216 struct buf *bp; 217 klwp_t *lwp = ttolwp(curthread); 218 219 CPU_STATS_ADD_K(sys, lread, 1); 220 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1); 221 if (bp->b_flags & B_DONE) 222 return (bp); 223 bp->b_flags |= B_READ; 224 ASSERT(bp->b_bcount == bsize); 225 if (ufsvfsp == NULL) { /* !ufs */ 226 (void) bdev_strategy(bp); 227 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) { 228 /* ufs && logging */ 229 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp); 230 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) { 231 /* ufs && snapshots */ 232 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp); 233 } else { 234 ufsvfsp->vfs_iotstamp = lbolt; 235 ub.ub_breads.value.ul++; /* ufs && !logging */ 236 (void) bdev_strategy(bp); 237 } 238 if (lwp != NULL) 239 lwp->lwp_ru.inblock++; 240 CPU_STATS_ADD_K(sys, bread, 1); 241 (void) biowait(bp); 242 return (bp); 243 } 244 245 /* 246 * Read in the block, like bread, but also start I/O on the 247 * read-ahead block (which is not allocated to the caller). 248 */ 249 struct buf * 250 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize) 251 { 252 struct buf *bp, *rabp; 253 klwp_t *lwp = ttolwp(curthread); 254 255 bp = NULL; 256 if (!bio_incore(dev, blkno)) { 257 CPU_STATS_ADD_K(sys, lread, 1); 258 bp = GETBLK(dev, blkno, bsize); 259 if ((bp->b_flags & B_DONE) == 0) { 260 bp->b_flags |= B_READ; 261 bp->b_bcount = bsize; 262 (void) bdev_strategy(bp); 263 if (lwp != NULL) 264 lwp->lwp_ru.inblock++; 265 CPU_STATS_ADD_K(sys, bread, 1); 266 } 267 } 268 if (rablkno && bfreelist.b_bcount > 1 && 269 !bio_incore(dev, rablkno)) { 270 rabp = GETBLK(dev, rablkno, bsize); 271 if (rabp->b_flags & B_DONE) 272 brelse(rabp); 273 else { 274 rabp->b_flags |= B_READ|B_ASYNC; 275 rabp->b_bcount = bsize; 276 (void) bdev_strategy(rabp); 277 if (lwp != NULL) 278 lwp->lwp_ru.inblock++; 279 CPU_STATS_ADD_K(sys, bread, 1); 280 } 281 } 282 if (bp == NULL) 283 return (BREAD(dev, blkno, bsize)); 284 (void) biowait(bp); 285 return (bp); 286 } 287 288 /* 289 * Common code for writing a buffer with various options. 
290 * 291 * force_wait - wait for write completion regardless of B_ASYNC flag 292 * do_relse - release the buffer when we are done 293 * clear_flags - flags to clear from the buffer 294 */ 295 void 296 bwrite_common(void *arg, struct buf *bp, int force_wait, 297 int do_relse, int clear_flags) 298 { 299 register int do_wait; 300 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg; 301 int flag; 302 klwp_t *lwp = ttolwp(curthread); 303 struct cpu *cpup; 304 305 ASSERT(SEMA_HELD(&bp->b_sem)); 306 flag = bp->b_flags; 307 bp->b_flags &= ~clear_flags; 308 if (lwp != NULL) 309 lwp->lwp_ru.oublock++; 310 CPU_STATS_ENTER_K(); 311 cpup = CPU; /* get pointer AFTER preemption is disabled */ 312 CPU_STATS_ADDQ(cpup, sys, lwrite, 1); 313 CPU_STATS_ADDQ(cpup, sys, bwrite, 1); 314 do_wait = ((flag & B_ASYNC) == 0 || force_wait); 315 if (do_wait == 0) 316 CPU_STATS_ADDQ(cpup, sys, bawrite, 1); 317 CPU_STATS_EXIT_K(); 318 if (ufsvfsp == NULL) { 319 (void) bdev_strategy(bp); 320 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) { 321 /* ufs && logging */ 322 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp); 323 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) { 324 /* ufs && snapshots */ 325 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp); 326 } else { 327 ub.ub_bwrites.value.ul++; /* ufs && !logging */ 328 (void) bdev_strategy(bp); 329 } 330 if (do_wait) { 331 (void) biowait(bp); 332 if (do_relse) { 333 brelse(bp); 334 } 335 } 336 } 337 338 /* 339 * Write the buffer, waiting for completion (unless B_ASYNC is set). 340 * Then release the buffer. 341 * This interface is provided for binary compatibility. Using 342 * BWRITE() directly avoids the extra function call overhead invoked 343 * by calling this routine. 344 */ 345 void 346 bwrite(struct buf *bp) 347 { 348 BWRITE(bp); 349 } 350 351 /* 352 * Write the buffer, waiting for completion. 353 * But don't release the buffer afterwards. 354 * This interface is provided for binary compatibility. Using 355 * BWRITE2() directly avoids the extra function call overhead. 356 */ 357 void 358 bwrite2(struct buf *bp) 359 { 360 BWRITE2(bp); 361 } 362 363 /* 364 * Release the buffer, marking it so that if it is grabbed 365 * for another purpose it will be written out before being 366 * given up (e.g. when writing a partial block where it is 367 * assumed that another write for the same block will soon follow). 368 * Also save the time that the block is first marked as delayed 369 * so that it will be written in a reasonable time. 370 */ 371 void 372 bdwrite(struct buf *bp) 373 { 374 ASSERT(SEMA_HELD(&bp->b_sem)); 375 CPU_STATS_ADD_K(sys, lwrite, 1); 376 if ((bp->b_flags & B_DELWRI) == 0) 377 bp->b_start = lbolt; 378 /* 379 * B_DONE allows others to use the buffer, B_DELWRI causes the 380 * buffer to be written before being reused, and setting b_resid 381 * to zero says the buffer is complete. 382 */ 383 bp->b_flags |= B_DELWRI | B_DONE; 384 bp->b_resid = 0; 385 brelse(bp); 386 } 387 388 /* 389 * Release the buffer, start I/O on it, but don't wait for completion. 390 */ 391 void 392 bawrite(struct buf *bp) 393 { 394 ASSERT(SEMA_HELD(&bp->b_sem)); 395 396 /* Use bfreelist.b_bcount as a weird-ass heuristic */ 397 if (bfreelist.b_bcount > 4) 398 bp->b_flags |= B_ASYNC; 399 BWRITE(bp); 400 } 401 402 /* 403 * Release the buffer, with no I/O implied. 
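 *
 * For reference, an illustrative summary (added, not part of the original
 * source) of the release paths defined above, assuming bp was obtained
 * from getblk()/bread() and b_sem is held:
 *
 *	bwrite(bp);	write now; wait unless B_ASYNC is set, then release
 *	bdwrite(bp);	mark B_DELWRI|B_DONE and release; flushed later
 *	bawrite(bp);	set B_ASYNC (when free headers allow) and BWRITE()
 *	brelse(bp);	no I/O; return bp to the free/delayed-write list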
 */
void
brelse(struct buf *bp)
{
	struct buf **backp;
	uint_t index;
	kmutex_t *hmp;
	struct buf *dp;
	struct hbuf *hp;

	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = lbolt;
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list is
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system.
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
522 */ 523 int 524 bio_busy(int cleanit) 525 { 526 struct buf *bp, *dp; 527 int busy = 0; 528 int i; 529 kmutex_t *hmp; 530 531 for (i = 0; i < v.v_hbuf; i++) { 532 vfs_syncprogress(); 533 dp = (struct buf *)&hbuf[i]; 534 hmp = &hbuf[i].b_lock; 535 536 mutex_enter(hmp); 537 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 538 if (bp->b_flags & B_BUSY) 539 busy++; 540 } 541 mutex_exit(hmp); 542 } 543 544 if (cleanit && busy != 0) { 545 bflush(NODEV); 546 } 547 548 return (busy); 549 } 550 551 /* 552 * this interface is provided for binary compatibility. 553 * 554 * Assign a buffer for the given block. If the appropriate 555 * block is already associated, return it; otherwise search 556 * for the oldest non-busy buffer and reassign it. 557 */ 558 struct buf * 559 getblk(dev_t dev, daddr_t blkno, long bsize) 560 { 561 return (getblk_common(/* ufsvfsp */ NULL, dev, 562 blkno, bsize, /* errflg */ 0)); 563 } 564 565 /* 566 * Assign a buffer for the given block. If the appropriate 567 * block is already associated, return it; otherwise search 568 * for the oldest non-busy buffer and reassign it. 569 */ 570 struct buf * 571 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg) 572 { 573 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg; 574 struct buf *bp; 575 struct buf *dp; 576 struct buf *nbp = NULL; 577 struct buf *errbp; 578 uint_t index; 579 kmutex_t *hmp; 580 struct hbuf *hp; 581 582 if (getmajor(dev) >= devcnt) 583 cmn_err(CE_PANIC, "blkdev"); 584 585 biostats.bio_lookup.value.ui32++; 586 587 index = bio_bhash(dev, blkno); 588 hp = &hbuf[index]; 589 dp = (struct buf *)hp; 590 hmp = &hp->b_lock; 591 592 mutex_enter(hmp); 593 loop: 594 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 595 if (bp->b_blkno != blkno || bp->b_edev != dev || 596 (bp->b_flags & B_STALE)) 597 continue; 598 /* 599 * Avoid holding the hash lock in the event that 600 * the buffer is locked by someone. Since the hash chain 601 * may change when we drop the hash lock 602 * we have to start at the beginning of the chain if the 603 * buffer identity/contents aren't valid. 604 */ 605 if (!sema_tryp(&bp->b_sem)) { 606 biostats.bio_bufbusy.value.ui32++; 607 mutex_exit(hmp); 608 /* 609 * OK, we are dealing with a busy buffer. 610 * In the case that we are panicking and we 611 * got called from bread(), we have some chance 612 * for error recovery. So better bail out from 613 * here since sema_p() won't block. If we got 614 * called directly from ufs routines, there is 615 * no way to report an error yet. 616 */ 617 if (panicstr && errflg) 618 goto errout; 619 /* 620 * For the following line of code to work 621 * correctly never kmem_free the buffer "header". 622 */ 623 sema_p(&bp->b_sem); 624 if (bp->b_blkno != blkno || bp->b_edev != dev || 625 (bp->b_flags & B_STALE)) { 626 sema_v(&bp->b_sem); 627 mutex_enter(hmp); 628 goto loop; /* start over */ 629 } 630 mutex_enter(hmp); 631 } 632 /* Found */ 633 biostats.bio_hit.value.ui32++; 634 bp->b_flags &= ~B_AGE; 635 636 /* 637 * Yank it off the free/delayed write lists 638 */ 639 hp->b_length--; 640 notavail(bp); 641 mutex_exit(hmp); 642 643 ASSERT((bp->b_flags & B_NOCACHE) == NULL); 644 645 if (nbp == NULL) { 646 /* 647 * Make the common path short. 648 */ 649 ASSERT(SEMA_HELD(&bp->b_sem)); 650 return (bp); 651 } 652 653 biostats.bio_bufdup.value.ui32++; 654 655 /* 656 * The buffer must have entered during the lock upgrade 657 * so free the new buffer we allocated and return the 658 * found buffer. 
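		 * (That is, another thread installed a buffer for the same
		 * dev/blkno while we had dropped the hash lock to call
		 * bio_getfreeblk().)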
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer.  Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error.  At this point we couldn't
	 * get a buffer, but we have to return one.  Hence we allocate some
	 * kind of error reply buffer on the fly.  This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause this buffer to be freed in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate a block size other than 1 KB.
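 *
 * Illustrative sketch (added example): a caller that wants a private,
 * unhashed buffer of some other size might do the following; "dev" and
 * "blkno" are assumed to be supplied by the caller:
 *
 *	bp = ngeteblk((long)MAXBSIZE);
 *	bp->b_edev = dev;
 *	bp->b_blkno = blkno;
 *	(fill bp->b_un.b_addr, then bdev_strategy(bp) and biowait(bp))
 *	brelse(bp);	B_NOCACHE makes brelse() destroy the buffer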
773 */ 774 struct buf * 775 geteblk(void) 776 { 777 return (ngeteblk((long)1024)); 778 } 779 780 /* 781 * Return a buffer w/o sleeping 782 */ 783 struct buf * 784 trygetblk(dev_t dev, daddr_t blkno) 785 { 786 struct buf *bp; 787 struct buf *dp; 788 struct hbuf *hp; 789 kmutex_t *hmp; 790 uint_t index; 791 792 index = bio_bhash(dev, blkno); 793 hp = &hbuf[index]; 794 hmp = &hp->b_lock; 795 796 if (!mutex_tryenter(hmp)) 797 return (NULL); 798 799 dp = (struct buf *)hp; 800 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 801 if (bp->b_blkno != blkno || bp->b_edev != dev || 802 (bp->b_flags & B_STALE)) 803 continue; 804 /* 805 * Get access to a valid buffer without sleeping 806 */ 807 if (sema_tryp(&bp->b_sem)) { 808 if (bp->b_flags & B_DONE) { 809 hp->b_length--; 810 notavail(bp); 811 mutex_exit(hmp); 812 return (bp); 813 } else { 814 sema_v(&bp->b_sem); 815 break; 816 } 817 } 818 break; 819 } 820 mutex_exit(hmp); 821 return (NULL); 822 } 823 824 /* 825 * Wait for I/O completion on the buffer; return errors 826 * to the user. 827 */ 828 int 829 iowait(struct buf *bp) 830 { 831 ASSERT(SEMA_HELD(&bp->b_sem)); 832 return (biowait(bp)); 833 } 834 835 /* 836 * Mark I/O complete on a buffer, release it if I/O is asynchronous, 837 * and wake up anyone waiting for it. 838 */ 839 void 840 iodone(struct buf *bp) 841 { 842 ASSERT(SEMA_HELD(&bp->b_sem)); 843 (void) biodone(bp); 844 } 845 846 /* 847 * Zero the core associated with a buffer. 848 */ 849 void 850 clrbuf(struct buf *bp) 851 { 852 ASSERT(SEMA_HELD(&bp->b_sem)); 853 bzero(bp->b_un.b_addr, bp->b_bcount); 854 bp->b_resid = 0; 855 } 856 857 858 /* 859 * Make sure all write-behind blocks on dev (or NODEV for all) 860 * are flushed out. 861 */ 862 void 863 bflush(dev_t dev) 864 { 865 struct buf *bp, *dp; 866 struct hbuf *hp; 867 struct buf *delwri_list = EMPTY_LIST; 868 int i, index; 869 kmutex_t *hmp; 870 871 mutex_enter(&blist_lock); 872 /* 873 * Wait for any invalidates or flushes ahead of us to finish. 874 * We really could split blist_lock up per device for better 875 * parallelism here. 876 */ 877 while (bio_doinginval || bio_doingflush) { 878 bio_flinv_cv_wanted = 1; 879 cv_wait(&bio_flushinval_cv, &blist_lock); 880 } 881 bio_doingflush++; 882 /* 883 * Gather all B_DELWRI buffer for device. 884 * Lock ordering is b_sem > hash lock (brelse). 885 * Since we are finding the buffer via the delayed write list, 886 * it may be busy and we would block trying to get the 887 * b_sem lock while holding hash lock. So transfer all the 888 * candidates on the delwri_list and then drop the hash locks. 889 */ 890 for (i = 0; i < v.v_hbuf; i++) { 891 vfs_syncprogress(); 892 hmp = &hbuf[i].b_lock; 893 dp = (struct buf *)&dwbuf[i]; 894 mutex_enter(hmp); 895 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) { 896 if (dev == NODEV || bp->b_edev == dev) { 897 if (bp->b_list == NULL) { 898 bp->b_list = delwri_list; 899 delwri_list = bp; 900 } 901 } 902 } 903 mutex_exit(hmp); 904 } 905 mutex_exit(&blist_lock); 906 907 /* 908 * Now that the hash locks have been dropped grab the semaphores 909 * and write back all the buffers that have B_DELWRI set. 
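	 * The private list built above is linked through b_list and is
	 * terminated by the EMPTY_LIST sentinel rather than by NULL.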
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {	/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {		/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which may not have been flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish; it's ok to
	 * do invalidates in parallel.
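	 * (bio_doinginval is only a count: concurrent invalidates never
	 * wait on each other, but bflush() waits until it drops to zero.)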
1032 */ 1033 while (bio_doingflush) { 1034 bio_flinv_cv_wanted = 1; 1035 cv_wait(&bio_flushinval_cv, &blist_lock); 1036 } 1037 bio_doinginval++; 1038 1039 /* Gather bp's */ 1040 for (i = 0; i < v.v_hbuf; i++) { 1041 dp = (struct buf *)&hbuf[i]; 1042 hmp = &hbuf[i].b_lock; 1043 1044 mutex_enter(hmp); 1045 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1046 if (bp->b_edev == dev) { 1047 if (bp->b_list == NULL) { 1048 bp->b_list = binval_list; 1049 binval_list = bp; 1050 } 1051 } 1052 } 1053 mutex_exit(hmp); 1054 } 1055 mutex_exit(&blist_lock); 1056 1057 /* Invalidate all bp's found */ 1058 while (binval_list != EMPTY_LIST) { 1059 bp = binval_list; 1060 1061 sema_p(&bp->b_sem); 1062 if (bp->b_edev == dev) { 1063 if (force && (bp->b_flags & B_DELWRI)) { 1064 /* clear B_DELWRI, move to non-dw freelist */ 1065 index = bio_bhash(bp->b_edev, bp->b_blkno); 1066 hmp = &hbuf[index].b_lock; 1067 dp = (struct buf *)&hbuf[index]; 1068 mutex_enter(hmp); 1069 1070 /* remove from delayed write freelist */ 1071 notavail(bp); 1072 1073 /* add to B_AGE side of non-dw freelist */ 1074 backp = &dp->av_forw; 1075 (*backp)->av_back = bp; 1076 bp->av_forw = *backp; 1077 *backp = bp; 1078 bp->av_back = dp; 1079 1080 /* 1081 * make sure write retries and busy are cleared 1082 */ 1083 bp->b_flags &= 1084 ~(B_BUSY | B_DELWRI | B_RETRYWRI); 1085 mutex_exit(hmp); 1086 } 1087 if ((bp->b_flags & B_DELWRI) == 0) 1088 bp->b_flags |= B_STALE|B_AGE; 1089 else 1090 error = EIO; 1091 } 1092 sema_v(&bp->b_sem); 1093 binval_list = bp->b_list; 1094 bp->b_list = NULL; 1095 } 1096 mutex_enter(&blist_lock); 1097 bio_doinginval--; 1098 if (bio_flinv_cv_wanted) { 1099 cv_broadcast(&bio_flushinval_cv); 1100 bio_flinv_cv_wanted = 0; 1101 } 1102 mutex_exit(&blist_lock); 1103 return (error); 1104 } 1105 1106 /* 1107 * If possible, invalidate blocks for a dev on demand 1108 */ 1109 void 1110 binval(dev_t dev) 1111 { 1112 (void) bfinval(dev, 0); 1113 } 1114 1115 /* 1116 * Initialize the buffer I/O system by freeing 1117 * all buffers and setting all device hash buffer lists to empty. 1118 */ 1119 void 1120 binit(void) 1121 { 1122 struct buf *bp; 1123 unsigned int i, pct; 1124 ulong_t bio_max_hwm, bio_default_hwm; 1125 1126 /* 1127 * Maximum/Default values for bufhwm are set to the smallest of: 1128 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory 1129 * - 1/4 of kernel virtual memory 1130 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int). 1131 * Additionally, in order to allow simple tuning by percentage of 1132 * physical memory, bufhwm_pct is used to calculate the default if 1133 * the value of this tunable is between 0 and BIO_MAX_PERCENT. 1134 * 1135 * Since the unit for v.v_bufhwm is kilobytes, this allows for 1136 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers. 1137 */ 1138 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT, 1139 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024); 1140 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm); 1141 1142 pct = BIO_BUF_PERCENT; 1143 if (bufhwm_pct != 0 && 1144 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) { 1145 pct = BIO_BUF_PERCENT; 1146 /* 1147 * Invalid user specified value, emit a warning. 1148 */ 1149 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \ 1150 range(1..%d). 
Using %d as default.", 1151 bufhwm_pct, 1152 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT); 1153 } 1154 1155 bio_default_hwm = MIN(physmem / pct, 1156 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024); 1157 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm); 1158 1159 if ((v.v_bufhwm = bufhwm) == 0) 1160 v.v_bufhwm = bio_default_hwm; 1161 1162 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) { 1163 v.v_bufhwm = (int)bio_max_hwm; 1164 /* 1165 * Invalid user specified value, emit a warning. 1166 */ 1167 cmn_err(CE_WARN, 1168 "binit: bufhwm(%d) out \ 1169 of range(%d..%lu). Using %lu as default", 1170 bufhwm, 1171 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm); 1172 } 1173 1174 /* 1175 * Determine the number of hash buckets. Default is to 1176 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers. 1177 * Round up number to the next power of 2. 1178 */ 1179 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) / 1180 BIO_HASHLEN); 1181 v.v_hmask = v.v_hbuf - 1; 1182 v.v_buf = BIO_BHDR_POOL; 1183 1184 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP); 1185 1186 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP); 1187 1188 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024; 1189 bp = &bfreelist; 1190 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp; 1191 1192 for (i = 0; i < v.v_hbuf; i++) { 1193 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i]; 1194 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i]; 1195 1196 /* 1197 * Initialize the delayed write buffer list. 1198 */ 1199 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i]; 1200 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i]; 1201 } 1202 } 1203 1204 /* 1205 * Wait for I/O completion on the buffer; return error code. 1206 * If bp was for synchronous I/O, bp is invalid and associated 1207 * resources are freed on return. 1208 */ 1209 int 1210 biowait(struct buf *bp) 1211 { 1212 int error = 0; 1213 struct cpu *cpup; 1214 1215 ASSERT(SEMA_HELD(&bp->b_sem)); 1216 1217 cpup = CPU; 1218 atomic_add_64(&cpup->cpu_stats.sys.iowait, 1); 1219 DTRACE_IO1(wait__start, struct buf *, bp); 1220 1221 /* 1222 * In case of panic, busy wait for completion 1223 */ 1224 if (panicstr) { 1225 while ((bp->b_flags & B_DONE) == 0) 1226 drv_usecwait(10); 1227 } else 1228 sema_p(&bp->b_io); 1229 1230 DTRACE_IO1(wait__done, struct buf *, bp); 1231 atomic_add_64(&cpup->cpu_stats.sys.iowait, -1); 1232 1233 error = geterror(bp); 1234 if ((bp->b_flags & B_ASYNC) == 0) { 1235 if (bp->b_flags & B_REMAPPED) 1236 bp_mapout(bp); 1237 } 1238 return (error); 1239 } 1240 1241 static void 1242 biodone_tnf_probe(struct buf *bp) 1243 { 1244 /* Kernel probe */ 1245 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */, 1246 tnf_device, device, bp->b_edev, 1247 tnf_diskaddr, block, bp->b_lblkno, 1248 tnf_opaque, buf, bp); 1249 } 1250 1251 /* 1252 * Mark I/O complete on a buffer, release it if I/O is asynchronous, 1253 * and wake up anyone waiting for it. 1254 */ 1255 void 1256 biodone(struct buf *bp) 1257 { 1258 if (bp->b_flags & B_STARTED) { 1259 DTRACE_IO1(done, struct buf *, bp); 1260 bp->b_flags &= ~B_STARTED; 1261 } 1262 1263 /* 1264 * Call the TNF probe here instead of the inline code 1265 * to force our compiler to use the tail call optimization. 
1266 */ 1267 biodone_tnf_probe(bp); 1268 1269 if (bp->b_iodone != NULL) { 1270 (*(bp->b_iodone))(bp); 1271 return; 1272 } 1273 ASSERT((bp->b_flags & B_DONE) == 0); 1274 ASSERT(SEMA_HELD(&bp->b_sem)); 1275 bp->b_flags |= B_DONE; 1276 if (bp->b_flags & B_ASYNC) { 1277 if (bp->b_flags & (B_PAGEIO|B_REMAPPED)) 1278 bio_pageio_done(bp); 1279 else 1280 brelse(bp); /* release bp to freelist */ 1281 } else { 1282 sema_v(&bp->b_io); 1283 } 1284 } 1285 1286 /* 1287 * Pick up the device's error number and pass it to the user; 1288 * if there is an error but the number is 0 set a generalized code. 1289 */ 1290 int 1291 geterror(struct buf *bp) 1292 { 1293 int error = 0; 1294 1295 ASSERT(SEMA_HELD(&bp->b_sem)); 1296 if (bp->b_flags & B_ERROR) { 1297 error = bp->b_error; 1298 if (!error) 1299 error = EIO; 1300 } 1301 return (error); 1302 } 1303 1304 /* 1305 * Support for pageio buffers. 1306 * 1307 * This stuff should be generalized to provide a generalized bp 1308 * header facility that can be used for things other than pageio. 1309 */ 1310 1311 /* 1312 * Allocate and initialize a buf struct for use with pageio. 1313 */ 1314 struct buf * 1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) 1316 { 1317 struct buf *bp; 1318 struct cpu *cpup; 1319 1320 if (flags & B_READ) { 1321 CPU_STATS_ENTER_K(); 1322 cpup = CPU; /* get pointer AFTER preemption is disabled */ 1323 CPU_STATS_ADDQ(cpup, vm, pgin, 1); 1324 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); 1325 if ((flags & B_ASYNC) == 0) { 1326 klwp_t *lwp = ttolwp(curthread); 1327 if (lwp != NULL) 1328 lwp->lwp_ru.majflt++; 1329 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1); 1330 /* Kernel probe */ 1331 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */, 1332 tnf_opaque, vnode, pp->p_vnode, 1333 tnf_offset, offset, pp->p_offset); 1334 } 1335 /* 1336 * Update statistics for pages being paged in 1337 */ 1338 if (pp != NULL && pp->p_vnode != NULL) { 1339 if (IS_SWAPFSVP(pp->p_vnode)) { 1340 CPU_STATS_ADDQ(cpup, vm, anonpgin, 1341 btopr(len)); 1342 } else { 1343 if (pp->p_vnode->v_flag & VVMEXEC) { 1344 CPU_STATS_ADDQ(cpup, vm, execpgin, 1345 btopr(len)); 1346 } else { 1347 CPU_STATS_ADDQ(cpup, vm, fspgin, 1348 btopr(len)); 1349 } 1350 } 1351 } 1352 CPU_STATS_EXIT_K(); 1353 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN, 1354 "page_ws_in:pp %p", pp); 1355 /* Kernel probe */ 1356 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */, 1357 tnf_opaque, vnode, pp->p_vnode, 1358 tnf_offset, offset, pp->p_offset, 1359 tnf_size, size, len); 1360 } 1361 1362 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP); 1363 bp->b_bcount = len; 1364 bp->b_bufsize = len; 1365 bp->b_pages = pp; 1366 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags; 1367 bp->b_offset = -1; 1368 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL); 1369 1370 /* Initialize bp->b_sem in "locked" state */ 1371 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL); 1372 1373 VN_HOLD(vp); 1374 bp->b_vp = vp; 1375 THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */ 1376 1377 /* 1378 * Caller sets dev & blkno and can adjust 1379 * b_addr for page offset and can use bp_mapin 1380 * to make pages kernel addressable. 
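	 *
	 * Illustrative sketch (added example) of a putpage-style caller;
	 * "dev" and "bn" are assumptions supplied by that caller:
	 *
	 *	bp = pageio_setup(pp, len, vp, B_WRITE | B_ASYNC | flags);
	 *	bp->b_edev = dev;
	 *	bp->b_dev = (o_dev_t)cmpdev(dev);
	 *	bp->b_blkno = bn;
	 *	bp->b_un.b_addr = (caddr_t)0;	(I/O starts at the first page)
	 *	(void) bdev_strategy(bp);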
1381 */ 1382 return (bp); 1383 } 1384 1385 void 1386 pageio_done(struct buf *bp) 1387 { 1388 ASSERT(SEMA_HELD(&bp->b_sem)); 1389 if (bp->b_flags & B_REMAPPED) 1390 bp_mapout(bp); 1391 VN_RELE(bp->b_vp); 1392 bp->b_vp = NULL; 1393 ASSERT((bp->b_flags & B_NOCACHE) != 0); 1394 1395 /* A sema_v(bp->b_sem) is implied if we are destroying it */ 1396 sema_destroy(&bp->b_sem); 1397 sema_destroy(&bp->b_io); 1398 kmem_free(bp, sizeof (struct buf)); 1399 } 1400 1401 /* 1402 * Check to see whether the buffers, except the one pointed by sbp, 1403 * associated with the device are busy. 1404 * NOTE: This expensive operation shall be improved together with ufs_icheck(). 1405 */ 1406 int 1407 bcheck(dev_t dev, struct buf *sbp) 1408 { 1409 struct buf *bp; 1410 struct buf *dp; 1411 int i; 1412 kmutex_t *hmp; 1413 1414 /* 1415 * check for busy bufs for this filesystem 1416 */ 1417 for (i = 0; i < v.v_hbuf; i++) { 1418 dp = (struct buf *)&hbuf[i]; 1419 hmp = &hbuf[i].b_lock; 1420 1421 mutex_enter(hmp); 1422 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1423 /* 1424 * if buf is busy or dirty, then filesystem is busy 1425 */ 1426 if ((bp->b_edev == dev) && 1427 ((bp->b_flags & B_STALE) == 0) && 1428 (bp->b_flags & (B_DELWRI|B_BUSY)) && 1429 (bp != sbp)) { 1430 mutex_exit(hmp); 1431 return (1); 1432 } 1433 } 1434 mutex_exit(hmp); 1435 } 1436 return (0); 1437 } 1438 1439 /* 1440 * Hash two 32 bit entities. 1441 */ 1442 int 1443 hash2ints(int x, int y) 1444 { 1445 int hash = 0; 1446 1447 hash = x - 1; 1448 hash = ((hash * 7) + (x >> 8)) - 1; 1449 hash = ((hash * 7) + (x >> 16)) - 1; 1450 hash = ((hash * 7) + (x >> 24)) - 1; 1451 hash = ((hash * 7) + y) - 1; 1452 hash = ((hash * 7) + (y >> 8)) - 1; 1453 hash = ((hash * 7) + (y >> 16)) - 1; 1454 hash = ((hash * 7) + (y >> 24)) - 1; 1455 1456 return (hash); 1457 } 1458 1459 1460 /* 1461 * Return a new buffer struct. 1462 * Create a new buffer if we haven't gone over our high water 1463 * mark for memory, otherwise try to get one off the freelist. 1464 * 1465 * Returns a locked buf that has no id and is not on any hash or free 1466 * list. 1467 */ 1468 static struct buf * 1469 bio_getfreeblk(long bsize) 1470 { 1471 struct buf *bp, *dp; 1472 struct hbuf *hp; 1473 kmutex_t *hmp; 1474 uint_t start, end; 1475 1476 /* 1477 * mutex_enter(&bfree_lock); 1478 * bfreelist.b_bufsize represents the amount of memory 1479 * mutex_exit(&bfree_lock); protect ref to bfreelist 1480 * we are allowed to allocate in the cache before we hit our hwm. 1481 */ 1482 bio_mem_get(bsize); /* Account for our memory request */ 1483 1484 again: 1485 bp = bio_bhdr_alloc(); /* Get a buf hdr */ 1486 sema_p(&bp->b_sem); /* Should never fail */ 1487 1488 ASSERT(bp->b_un.b_addr == NULL); 1489 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP); 1490 if (bp->b_un.b_addr != NULL) { 1491 /* 1492 * Make the common path short 1493 */ 1494 bp->b_bufsize = bsize; 1495 ASSERT(SEMA_HELD(&bp->b_sem)); 1496 return (bp); 1497 } else { 1498 struct buf *save; 1499 1500 save = bp; /* Save bp we allocated */ 1501 start = end = lastindex; 1502 1503 biostats.bio_bufwant.value.ui32++; 1504 1505 /* 1506 * Memory isn't available from the system now. Scan 1507 * the hash buckets till enough space is found. 
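		 * (Each bucket's free list is walked looking for an idle
		 * buffer whose b_bufsize is exactly bsize, so that its
		 * memory can be reused directly.)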
1508 */ 1509 do { 1510 hp = &hbuf[start]; 1511 hmp = &hp->b_lock; 1512 dp = (struct buf *)hp; 1513 1514 mutex_enter(hmp); 1515 bp = dp->av_forw; 1516 1517 while (bp != dp) { 1518 1519 ASSERT(bp != NULL); 1520 1521 if (!sema_tryp(&bp->b_sem)) { 1522 bp = bp->av_forw; 1523 continue; 1524 } 1525 1526 /* 1527 * Since we are going down the freelist 1528 * associated with this hash bucket the 1529 * B_DELWRI flag should not be set. 1530 */ 1531 ASSERT(!(bp->b_flags & B_DELWRI)); 1532 1533 if (bp->b_bufsize == bsize) { 1534 hp->b_length--; 1535 notavail(bp); 1536 bremhash(bp); 1537 mutex_exit(hmp); 1538 1539 /* 1540 * Didn't kmem_alloc any more, so don't 1541 * count it twice. 1542 */ 1543 mutex_enter(&bfree_lock); 1544 bfreelist.b_bufsize += bsize; 1545 mutex_exit(&bfree_lock); 1546 1547 /* 1548 * Update the lastindex value. 1549 */ 1550 lastindex = start; 1551 1552 /* 1553 * Put our saved bp back on the list 1554 */ 1555 sema_v(&save->b_sem); 1556 bio_bhdr_free(save); 1557 ASSERT(SEMA_HELD(&bp->b_sem)); 1558 return (bp); 1559 } 1560 sema_v(&bp->b_sem); 1561 bp = bp->av_forw; 1562 } 1563 mutex_exit(hmp); 1564 start = ((start + 1) % v.v_hbuf); 1565 } while (start != end); 1566 1567 biostats.bio_bufwait.value.ui32++; 1568 bp = save; /* Use original bp */ 1569 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP); 1570 } 1571 1572 bp->b_bufsize = bsize; 1573 ASSERT(SEMA_HELD(&bp->b_sem)); 1574 return (bp); 1575 } 1576 1577 /* 1578 * Allocate a buffer header. If none currently available, allocate 1579 * a new pool. 1580 */ 1581 static struct buf * 1582 bio_bhdr_alloc(void) 1583 { 1584 struct buf *dp, *sdp; 1585 struct buf *bp; 1586 int i; 1587 1588 for (;;) { 1589 mutex_enter(&bhdr_lock); 1590 if (bhdrlist != NULL) { 1591 bp = bhdrlist; 1592 bhdrlist = bp->av_forw; 1593 mutex_exit(&bhdr_lock); 1594 bp->av_forw = NULL; 1595 return (bp); 1596 } 1597 mutex_exit(&bhdr_lock); 1598 1599 /* 1600 * Need to allocate a new pool. If the system is currently 1601 * out of memory, then try freeing things on the freelist. 1602 */ 1603 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP); 1604 if (dp == NULL) { 1605 /* 1606 * System can't give us a pool of headers, try 1607 * recycling from the free lists. 1608 */ 1609 bio_recycle(BIO_HEADER, 0); 1610 } else { 1611 sdp = dp; 1612 for (i = 0; i < v.v_buf; i++, dp++) { 1613 /* 1614 * The next two lines are needed since NODEV 1615 * is -1 and not NULL 1616 */ 1617 dp->b_dev = (o_dev_t)NODEV; 1618 dp->b_edev = NODEV; 1619 dp->av_forw = dp + 1; 1620 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT, 1621 NULL); 1622 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT, 1623 NULL); 1624 dp->b_offset = -1; 1625 } 1626 mutex_enter(&bhdr_lock); 1627 (--dp)->av_forw = bhdrlist; /* Fix last pointer */ 1628 bhdrlist = sdp; 1629 nbuf += v.v_buf; 1630 bp = bhdrlist; 1631 bhdrlist = bp->av_forw; 1632 mutex_exit(&bhdr_lock); 1633 1634 bp->av_forw = NULL; 1635 return (bp); 1636 } 1637 } 1638 } 1639 1640 static void 1641 bio_bhdr_free(struct buf *bp) 1642 { 1643 ASSERT(bp->b_back == NULL); 1644 ASSERT(bp->b_forw == NULL); 1645 ASSERT(bp->av_back == NULL); 1646 ASSERT(bp->av_forw == NULL); 1647 ASSERT(bp->b_un.b_addr == NULL); 1648 ASSERT(bp->b_dev == (o_dev_t)NODEV); 1649 ASSERT(bp->b_edev == NODEV); 1650 ASSERT(bp->b_flags == 0); 1651 1652 mutex_enter(&bhdr_lock); 1653 bp->av_forw = bhdrlist; 1654 bhdrlist = bp; 1655 mutex_exit(&bhdr_lock); 1656 } 1657 1658 /* 1659 * If we haven't gone over the high water mark, it's o.k. 
to 1660 * allocate more buffer space, otherwise recycle buffers 1661 * from the freelist until enough memory is free for a bsize request. 1662 * 1663 * We account for this memory, even though 1664 * we don't allocate it here. 1665 */ 1666 static void 1667 bio_mem_get(long bsize) 1668 { 1669 mutex_enter(&bfree_lock); 1670 if (bfreelist.b_bufsize > bsize) { 1671 bfreelist.b_bufsize -= bsize; 1672 mutex_exit(&bfree_lock); 1673 return; 1674 } 1675 mutex_exit(&bfree_lock); 1676 bio_recycle(BIO_MEM, bsize); 1677 } 1678 1679 /* 1680 * flush a list of delayed write buffers. 1681 * (currently used only by bio_recycle below.) 1682 */ 1683 static void 1684 bio_flushlist(struct buf *delwri_list) 1685 { 1686 struct buf *bp; 1687 1688 while (delwri_list != EMPTY_LIST) { 1689 bp = delwri_list; 1690 bp->b_flags |= B_AGE | B_ASYNC; 1691 if (bp->b_vp == NULL) { /* !ufs */ 1692 BWRITE(bp); 1693 } else { /* ufs */ 1694 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp); 1695 } 1696 delwri_list = bp->b_list; 1697 bp->b_list = NULL; 1698 } 1699 } 1700 1701 /* 1702 * Start recycling buffers on the freelist for one of 2 reasons: 1703 * - we need a buffer header 1704 * - we need to free up memory 1705 * Once started we continue to recycle buffers until the B_AGE 1706 * buffers are gone. 1707 */ 1708 static void 1709 bio_recycle(int want, long bsize) 1710 { 1711 struct buf *bp, *dp, *dwp, *nbp; 1712 struct hbuf *hp; 1713 int found = 0; 1714 kmutex_t *hmp; 1715 int start, end; 1716 struct buf *delwri_list = EMPTY_LIST; 1717 1718 /* 1719 * Recycle buffers. 1720 */ 1721 top: 1722 start = end = lastindex; 1723 do { 1724 hp = &hbuf[start]; 1725 hmp = &hp->b_lock; 1726 dp = (struct buf *)hp; 1727 1728 mutex_enter(hmp); 1729 bp = dp->av_forw; 1730 1731 while (bp != dp) { 1732 1733 ASSERT(bp != NULL); 1734 1735 if (!sema_tryp(&bp->b_sem)) { 1736 bp = bp->av_forw; 1737 continue; 1738 } 1739 /* 1740 * Do we really want to nuke all of the B_AGE stuff?? 1741 */ 1742 if ((bp->b_flags & B_AGE) == 0 && found) { 1743 sema_v(&bp->b_sem); 1744 mutex_exit(hmp); 1745 lastindex = start; 1746 return; /* All done */ 1747 } 1748 1749 ASSERT(MUTEX_HELD(&hp->b_lock)); 1750 ASSERT(!(bp->b_flags & B_DELWRI)); 1751 hp->b_length--; 1752 notavail(bp); 1753 1754 /* 1755 * Remove bhdr from cache, free up memory, 1756 * and add the hdr to the freelist. 1757 */ 1758 bremhash(bp); 1759 mutex_exit(hmp); 1760 1761 if (bp->b_bufsize) { 1762 kmem_free(bp->b_un.b_addr, bp->b_bufsize); 1763 bp->b_un.b_addr = NULL; 1764 mutex_enter(&bfree_lock); 1765 bfreelist.b_bufsize += bp->b_bufsize; 1766 mutex_exit(&bfree_lock); 1767 } 1768 1769 bp->b_dev = (o_dev_t)NODEV; 1770 bp->b_edev = NODEV; 1771 bp->b_flags = 0; 1772 sema_v(&bp->b_sem); 1773 bio_bhdr_free(bp); 1774 if (want == BIO_HEADER) { 1775 found = 1; 1776 } else { 1777 ASSERT(want == BIO_MEM); 1778 if (!found && bfreelist.b_bufsize >= bsize) { 1779 /* Account for the memory we want */ 1780 mutex_enter(&bfree_lock); 1781 if (bfreelist.b_bufsize >= bsize) { 1782 bfreelist.b_bufsize -= bsize; 1783 found = 1; 1784 } 1785 mutex_exit(&bfree_lock); 1786 } 1787 } 1788 1789 /* 1790 * Since we dropped hmp start from the 1791 * begining. 1792 */ 1793 mutex_enter(hmp); 1794 bp = dp->av_forw; 1795 } 1796 mutex_exit(hmp); 1797 1798 /* 1799 * Look at the delayed write list. 1800 * First gather into a private list, then write them. 
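		 * (As in bflush(), candidates are chained through b_list
		 * and written out via bio_flushlist() once the hash lock
		 * has been dropped.)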
1801 */ 1802 dwp = (struct buf *)&dwbuf[start]; 1803 mutex_enter(&blist_lock); 1804 bio_doingflush++; 1805 mutex_enter(hmp); 1806 for (bp = dwp->av_forw; bp != dwp; bp = nbp) { 1807 1808 ASSERT(bp != NULL); 1809 nbp = bp->av_forw; 1810 1811 if (!sema_tryp(&bp->b_sem)) 1812 continue; 1813 ASSERT(bp->b_flags & B_DELWRI); 1814 /* 1815 * Do we really want to nuke all of the B_AGE stuff?? 1816 */ 1817 1818 if ((bp->b_flags & B_AGE) == 0 && found) { 1819 sema_v(&bp->b_sem); 1820 mutex_exit(hmp); 1821 lastindex = start; 1822 mutex_exit(&blist_lock); 1823 bio_flushlist(delwri_list); 1824 mutex_enter(&blist_lock); 1825 bio_doingflush--; 1826 if (bio_flinv_cv_wanted) { 1827 bio_flinv_cv_wanted = 0; 1828 cv_broadcast(&bio_flushinval_cv); 1829 } 1830 mutex_exit(&blist_lock); 1831 return; /* All done */ 1832 } 1833 1834 /* 1835 * If the buffer is already on a flush or 1836 * invalidate list then just skip it. 1837 */ 1838 if (bp->b_list != NULL) { 1839 sema_v(&bp->b_sem); 1840 continue; 1841 } 1842 /* 1843 * We are still on the same bucket. 1844 */ 1845 hp->b_length--; 1846 notavail(bp); 1847 bp->b_list = delwri_list; 1848 delwri_list = bp; 1849 } 1850 mutex_exit(hmp); 1851 mutex_exit(&blist_lock); 1852 bio_flushlist(delwri_list); 1853 delwri_list = EMPTY_LIST; 1854 mutex_enter(&blist_lock); 1855 bio_doingflush--; 1856 if (bio_flinv_cv_wanted) { 1857 bio_flinv_cv_wanted = 0; 1858 cv_broadcast(&bio_flushinval_cv); 1859 } 1860 mutex_exit(&blist_lock); 1861 start = (start + 1) % v.v_hbuf; 1862 1863 } while (start != end); 1864 1865 if (found) 1866 return; 1867 1868 /* 1869 * Free lists exhausted and we haven't satisfied the request. 1870 * Wait here for more entries to be added to freelist. 1871 * Because this might have just happened, make it timed. 1872 */ 1873 mutex_enter(&bfree_lock); 1874 bfreelist.b_flags |= B_WANTED; 1875 (void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt+hz); 1876 mutex_exit(&bfree_lock); 1877 goto top; 1878 } 1879 1880 /* 1881 * See if the block is associated with some buffer 1882 * (mainly to avoid getting hung up on a wait in breada). 1883 */ 1884 static int 1885 bio_incore(dev_t dev, daddr_t blkno) 1886 { 1887 struct buf *bp; 1888 struct buf *dp; 1889 uint_t index; 1890 kmutex_t *hmp; 1891 1892 index = bio_bhash(dev, blkno); 1893 dp = (struct buf *)&hbuf[index]; 1894 hmp = &hbuf[index].b_lock; 1895 1896 mutex_enter(hmp); 1897 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1898 if (bp->b_blkno == blkno && bp->b_edev == dev && 1899 (bp->b_flags & B_STALE) == 0) { 1900 mutex_exit(hmp); 1901 return (1); 1902 } 1903 } 1904 mutex_exit(hmp); 1905 return (0); 1906 } 1907 1908 static void 1909 bio_pageio_done(struct buf *bp) 1910 { 1911 if (bp->b_flags & B_PAGEIO) { 1912 1913 if (bp->b_flags & B_REMAPPED) 1914 bp_mapout(bp); 1915 1916 if (bp->b_flags & B_READ) 1917 pvn_read_done(bp->b_pages, bp->b_flags); 1918 else 1919 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags); 1920 pageio_done(bp); 1921 } else { 1922 ASSERT(bp->b_flags & B_REMAPPED); 1923 bp_mapout(bp); 1924 brelse(bp); 1925 } 1926 } 1927 1928 /* 1929 * bioerror(9F) - indicate error in buffer header 1930 * If 'error' is zero, remove the error indication. 
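 *
 * Illustrative sketch (added example): a driver completion path typically
 * pairs this with biodone(); "err" is an assumed local holding the
 * hardware status:
 *
 *	if (err != 0)
 *		bioerror(bp, err);
 *	else
 *		bp->b_resid = 0;
 *	biodone(bp);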
1931 */ 1932 void 1933 bioerror(struct buf *bp, int error) 1934 { 1935 ASSERT(bp != NULL); 1936 ASSERT(error >= 0); 1937 ASSERT(SEMA_HELD(&bp->b_sem)); 1938 1939 if (error != 0) { 1940 bp->b_flags |= B_ERROR; 1941 } else { 1942 bp->b_flags &= ~B_ERROR; 1943 } 1944 bp->b_error = error; 1945 } 1946 1947 /* 1948 * bioreset(9F) - reuse a private buffer header after I/O is complete 1949 */ 1950 void 1951 bioreset(struct buf *bp) 1952 { 1953 ASSERT(bp != NULL); 1954 1955 biofini(bp); 1956 bioinit(bp); 1957 } 1958 1959 /* 1960 * biosize(9F) - return size of a buffer header 1961 */ 1962 size_t 1963 biosize(void) 1964 { 1965 return (sizeof (struct buf)); 1966 } 1967 1968 /* 1969 * biomodified(9F) - check if buffer is modified 1970 */ 1971 int 1972 biomodified(struct buf *bp) 1973 { 1974 int npf; 1975 int ppattr; 1976 struct page *pp; 1977 1978 ASSERT(bp != NULL); 1979 1980 if ((bp->b_flags & B_PAGEIO) == 0) { 1981 return (-1); 1982 } 1983 pp = bp->b_pages; 1984 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET)); 1985 1986 while (npf > 0) { 1987 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 1988 HAT_SYNC_STOPON_MOD); 1989 if (ppattr & P_MOD) 1990 return (1); 1991 pp = pp->p_next; 1992 npf--; 1993 } 1994 1995 return (0); 1996 } 1997 1998 /* 1999 * bioinit(9F) - initialize a buffer structure 2000 */ 2001 void 2002 bioinit(struct buf *bp) 2003 { 2004 bzero(bp, sizeof (struct buf)); 2005 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL); 2006 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL); 2007 bp->b_offset = -1; 2008 } 2009 2010 /* 2011 * biofini(9F) - uninitialize a buffer structure 2012 */ 2013 void 2014 biofini(struct buf *bp) 2015 { 2016 sema_destroy(&bp->b_io); 2017 sema_destroy(&bp->b_sem); 2018 } 2019 2020 /* 2021 * bioclone(9F) - clone a buffer 2022 */ 2023 struct buf * 2024 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno, 2025 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep) 2026 { 2027 struct buf *bufp; 2028 2029 ASSERT(bp); 2030 if (bp_mem == NULL) { 2031 bufp = kmem_alloc(sizeof (struct buf), sleep); 2032 if (bufp == NULL) { 2033 return (NULL); 2034 } 2035 bioinit(bufp); 2036 } else { 2037 bufp = bp_mem; 2038 bioreset(bufp); 2039 } 2040 2041 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\ 2042 B_ABRWRITE) 2043 2044 /* 2045 * The cloned buffer does not inherit the B_REMAPPED flag. 2046 */ 2047 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY; 2048 bufp->b_bcount = len; 2049 bufp->b_blkno = blkno; 2050 bufp->b_iodone = iodone; 2051 bufp->b_proc = bp->b_proc; 2052 bufp->b_edev = dev; 2053 bufp->b_file = bp->b_file; 2054 bufp->b_offset = bp->b_offset; 2055 2056 if (bp->b_flags & B_SHADOW) { 2057 ASSERT(bp->b_shadow); 2058 ASSERT(bp->b_flags & B_PHYS); 2059 2060 bufp->b_shadow = bp->b_shadow + 2061 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off); 2062 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2063 if (bp->b_flags & B_REMAPPED) 2064 bufp->b_proc = NULL; 2065 } else { 2066 if (bp->b_flags & B_PAGEIO) { 2067 struct page *pp; 2068 off_t o; 2069 int i; 2070 2071 pp = bp->b_pages; 2072 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off; 2073 for (i = btop(o); i > 0; i--) { 2074 pp = pp->p_next; 2075 } 2076 bufp->b_pages = pp; 2077 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET); 2078 } else { 2079 bufp->b_un.b_addr = 2080 (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2081 if (bp->b_flags & B_REMAPPED) 2082 bufp->b_proc = NULL; 2083 } 2084 } 2085 return (bufp); 2086 } 2087
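/*
 * Illustrative sketch (added example, not part of the original source):
 * issuing I/O for a sub-range of an existing buffer with bioclone(9F).
 * "off", "len", "dev" and "blkno" are assumptions supplied by the caller;
 * a NULL iodone is passed so that biowait() can be used on the clone:
 *
 *	struct buf *cbp;
 *
 *	cbp = bioclone(bp, off, len, dev, blkno, NULL, NULL, KM_SLEEP);
 *	(void) bdev_strategy(cbp);
 *	(void) biowait(cbp);
 *	biofini(cbp);
 *	kmem_free(cbp, biosize());
 */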