/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2011 Joyent, Inc.  All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",	KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",		KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",	KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",	KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",	KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",	KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
		    sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct;		/* ditto - given in % of physmem */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it. If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct buf	*dp;
	struct hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error. The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list is
	 * zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone. Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == NULL);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error. At this point we couldn't
	 * get a buffer, but we have to return one. Hence we allocate some
	 * kind of error reply buffer on the fly. This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause this buffer to be freed in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffers for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which may not already have been flushed because of device errors).
 * Also makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
		    range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
		    of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}

static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device,		device,		bp->b_edev,
	    tnf_diskaddr,	block,		bp->b_lblkno,
	    tnf_opaque,		buf,		bp);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		sema_v(&bp->b_io);
	}
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;
	}
	return (error);
}

/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));

		atomic_add_64(&curzone->zone_pgpgin, btopr(len));

		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque,		vnode,		pp->p_vnode,
			    tnf_offset,		offset,		pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
				atomic_add_64(&curzone->zone_anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque,	vnode,	pp->p_vnode,
		    tnf_offset,	offset,	pp->p_offset,
		    tnf_size,	size,	len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}

void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}

/*
 * Check to see whether the buffers, except the one pointed to by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf	*bp;
	struct buf	*dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}


/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t	*hmp;
	uint_t		start, end;

	/*
	 * mutex_enter(&bfree_lock);
	 * bfreelist.b_bufsize represents the amount of memory
	 * mutex_exit(&bfree_lock); protect ref to bfreelist
	 * we are allowed to allocate in the cache before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

again:
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;		/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}

/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}

static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}

/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	bio_recycle(BIO_MEM, bsize);
}

/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {		/* !ufs */
			BWRITE(bp);
		} else {			/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}

/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	dp = (struct buf *)&hbuf[index];
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno == blkno && bp->b_edev == dev &&
		    (bp->b_flags & B_STALE) == 0) {
			mutex_exit(hmp);
			return (1);
		}
	}
	mutex_exit(hmp);
	return (0);
}

static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}

/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(bp != NULL);
	ASSERT(error >= 0);
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	bp->b_error = error;
}

/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}

/*
 * biomodified(9F) - check if buffer is modified
 */
int
biomodified(struct buf *bp)
{
	int npf;
	int ppattr;
	struct page *pp;

	ASSERT(bp != NULL);

	if ((bp->b_flags & B_PAGEIO) == 0) {
		return (-1);
	}
	pp = bp->b_pages;
	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

	while (npf > 0) {
		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		    HAT_SYNC_STOPON_MOD);
		if (ppattr & P_MOD)
			return (1);
		pp = pp->p_next;
		npf--;
	}

	return (0);
}

/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}

/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}

/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}
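
/*
 * Usage sketch: a hypothetical caller reading one block through the buffer
 * cache with the routines above.  The dev, blkno, and bsize values are
 * placeholders.  bread() returns a buffer held via b_sem, geterror()
 * reports any I/O error recorded in the header, and brelse() returns the
 * buffer to the freelist once the caller is done with the data at
 * bp->b_un.b_addr.
 *
 *	struct buf *bp;
 *	int err;
 *
 *	bp = bread(dev, blkno, bsize);
 *	if ((err = geterror(bp)) == 0) {
 *		... inspect bsize bytes at bp->b_un.b_addr ...
 *	}
 *	brelse(bp);
 */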